From 578589d363a540c5770ce3d17294ee5dd53c0a1a Mon Sep 17 00:00:00 2001
From: xuchen-amd <xuchen@amd.com>
Date: Wed, 22 Oct 2025 15:17:43 -0400
Subject: [PATCH] [rocprofiler-compute] metrics generator (#1199)

---
 .../.pre-commit-config.yaml                   |   19 +-
 projects/rocprofiler-compute/CHANGELOG.md     |    4 +
 projects/rocprofiler-compute/CMakeLists.txt   |   23 +-
 projects/rocprofiler-compute/CONTRIBUTING.md  |   11 +-
 .../rocprofiler-compute/coverage/README.md    |    6 +-
 .../docs/data/metrics_description.yaml        | 1474 ++++----
 .../docs/how-to/analyze/cli.rst               |   91 +-
 .../docs/how-to/profile/mode.rst              |    2 +-
 .../rocprofiler-compute/docs/how-to/use.rst   |    7 +
 projects/rocprofiler-compute/src/argparser.py |   45 +-
 .../rocprof_compute_analyze/analysis_base.py  |   56 +-
 .../src/rocprof_compute_base.py               |   34 +
 .../rocprof_compute_profile/profiler_base.py  |    1 +
 .../gfx908/0000_top_stats.yaml                |    2 +-
 .../gfx908/0100_system_info.yaml              |    2 +-
 .../gfx908/0200_system_speed_of_light.yaml    |  240 +-
 .../gfx908/0300_memory_chart.yaml             |  242 +-
 .../gfx908/0400_roofline.yaml                 |  162 +-
 .../0500_command_processor_cpc_cpf.yaml       |   49 +-
 .../gfx908/0600_workgroup_manager_spi.yaml    |  110 +-
 .../gfx908/0700_wavefront.yaml                |  120 +-
 .../1000_compute_units_instruction_mix.yaml   |  116 +-
 .../1100_compute_units_compute_pipeline.yaml  |   99 +-
 .../gfx908/1200_local_data_share_lds.yaml     |  103 +-
 .../gfx908/1300_instruction_cache.yaml        |   52 +-
 .../gfx908/1400_scalar_l1_data_cache.yaml     |  119 +-
 ...ssing_unit_and_data_return_path_ta_td.yaml |  154 +-
 .../gfx908/1600_vector_l1_data_cache.yaml     |  264 +-
 .../gfx908/1700_l2_cache.yaml                 |  738 ++--
 .../gfx908/1800_l2_cache_per_channel.yaml     |    8 +-
 .../gfx908/2100_pc_sampling.yaml              |    2 +-
 .../gfx908/config_delta/gfx950_diff.yaml      | 1128 ++++++
 .../gfx90a/0000_top_stats.yaml                |    2 +-
 .../gfx90a/0100_system_info.yaml              |    2 +-
 .../gfx90a/0200_system_speed_of_light.yaml    |  240 +-
 .../gfx90a/0300_memory_chart.yaml             |  242 +-
 .../gfx90a/0400_roofline.yaml                 |  162 +-
 .../0500_command_processor_cpc_cpf.yaml       |   49 +-
 .../gfx90a/0600_workgroup_manager_spi.yaml    |  110 +-
 .../gfx90a/0700_wavefront.yaml                |  120 +-
 .../1000_compute_units_instruction_mix.yaml   |  166 +-
 .../1100_compute_units_compute_pipeline.yaml  |  181 +-
 .../gfx90a/1200_local_data_share_lds.yaml     |  103 +-
 .../gfx90a/1300_instruction_cache.yaml        |   52 +-
 .../gfx90a/1400_scalar_l1_data_cache.yaml     |  119 +-
 ...ssing_unit_and_data_return_path_ta_td.yaml |  157 +-
 .../gfx90a/1600_vector_l1_data_cache.yaml     |  264 +-
 .../gfx90a/1700_l2_cache.yaml                 |  738 ++--
 .../gfx90a/1800_l2_cache_per_channel.yaml     |    8 +-
 .../gfx90a/2100_pc_sampling.yaml              |    2 +-
 .../gfx90a/config_delta/gfx950_diff.yaml      | 1022 ++++++
 .../gfx940/0000_top_stats.yaml                |    2 +-
 .../gfx940/0100_system_info.yaml              |    2 +-
 .../gfx940/0200_system_speed_of_light.yaml    |  245 +-
 .../gfx940/0300_memory_chart.yaml             |  236 +-
 .../gfx940/0400_roofline.yaml                 |  167 +-
 .../0500_command_processor_cpc_cpf.yaml       |   49 +-
 .../gfx940/0600_workgroup_manager_spi.yaml    |  110 +-
 .../gfx940/0700_wavefront.yaml                |  120 +-
 .../1000_compute_units_instruction_mix.yaml   |  169 +-
 .../1100_compute_units_compute_pipeline.yaml  |  183 +-
 .../gfx940/1200_local_data_share_lds.yaml     |  103 +-
 .../gfx940/1300_instruction_cache.yaml        |   52 +-
 .../gfx940/1400_scalar_l1_data_cache.yaml     |  119 +-
 ...ssing_unit_and_data_return_path_ta_td.yaml |  157 +-
 .../gfx940/1600_vector_l1_data_cache.yaml     |  256 +-
 .../gfx940/1700_l2_cache.yaml                 |  422 +--
 .../gfx940/1800_l2_cache_per_channel.yaml     |    8 +-
 .../gfx940/2100_pc_sampling.yaml              |    2 +-
 .../gfx940/config_delta/gfx950_diff.yaml      |  755 ++++
 .../gfx941/0000_top_stats.yaml                |    2 +-
 .../gfx941/0100_system_info.yaml              |    2 +-
 .../gfx941/0200_system_speed_of_light.yaml    |  245 +-
 .../gfx941/0300_memory_chart.yaml             |  236 +-
 .../gfx941/0400_roofline.yaml                 |  167 +-
 .../0500_command_processor_cpc_cpf.yaml       |   49 +-
 .../gfx941/0600_workgroup_manager_spi.yaml    |  110 +-
 .../gfx941/0700_wavefront.yaml                |  120 +-
 .../1000_compute_units_instruction_mix.yaml   |  169 +-
 .../1100_compute_units_compute_pipeline.yaml  |  183 +-
 .../gfx941/1200_local_data_share_lds.yaml     |  103 +-
 .../gfx941/1300_instruction_cache.yaml        |   52 +-
 .../gfx941/1400_scalar_l1_data_cache.yaml     |  119 +-
 ...ssing_unit_and_data_return_path_ta_td.yaml |  157 +-
 .../gfx941/1600_vector_l1_data_cache.yaml     |  256 +-
 .../gfx941/1700_l2_cache.yaml                 |  422 +--
 .../gfx941/1800_l2_cache_per_channel.yaml     |    8 +-
 .../gfx941/2100_pc_sampling.yaml              |    2 +-
 .../gfx941/config_delta/gfx950_diff.yaml      |  763 ++++
 .../gfx942/0000_top_stats.yaml                |    2 +-
 .../gfx942/0100_system_info.yaml              |    2 +-
 .../gfx942/0200_system_speed_of_light.yaml    |  245 +-
 .../gfx942/0300_memory_chart.yaml             |  236 +-
 .../gfx942/0400_roofline.yaml                 |  167 +-
 .../0500_command_processor_cpc_cpf.yaml       |   49 +-
 .../gfx942/0600_workgroup_manager_spi.yaml    |  110 +-
 .../gfx942/0700_wavefront.yaml                |  120 +-
 .../1000_compute_units_instruction_mix.yaml   |  169 +-
 .../1100_compute_units_compute_pipeline.yaml  |  183 +-
 .../gfx942/1200_local_data_share_lds.yaml     |  103 +-
 .../gfx942/1300_instruction_cache.yaml        |   52 +-
 .../gfx942/1400_scalar_l1_data_cache.yaml     |  119 +-
 ...ssing_unit_and_data_return_path_ta_td.yaml |  157 +-
 .../gfx942/1600_vector_l1_data_cache.yaml     |  256 +-
 .../gfx942/1700_l2_cache.yaml                 |  424 +--
 .../gfx942/1800_l2_cache_per_channel.yaml     |    8 +-
 .../gfx942/2100_pc_sampling.yaml              |    2 +-
 .../gfx942/config_delta/gfx950_diff.yaml      |  761 ++++
 .../gfx950/0000_top_stats.yaml                |    2 +-
 .../gfx950/0100_system_info.yaml              |    2 +-
 .../gfx950/0200_system_speed_of_light.yaml    |  245 +-
 .../gfx950/0300_memory_chart.yaml             |  242 +-
 .../gfx950/0400_roofline.yaml                 |  173 +-
 .../0500_command_processor_cpc_cpf.yaml       |   49 +-
 .../gfx950/0600_workgroup_manager_spi.yaml    |  110 +-
 .../gfx950/0700_wavefront.yaml                |  120 +-
 .../1000_compute_units_instruction_mix.yaml   |  169 +-
 .../1100_compute_units_compute_pipeline.yaml  |  185 +-
 .../gfx950/1200_local_data_share_lds.yaml     |  111 +-
 .../gfx950/1300_instruction_cache.yaml        |   52 +-
 .../gfx950/1400_scalar_l1_data_cache.yaml     |  119 +-
 ...ssing_unit_and_data_return_path_ta_td.yaml |  166 +-
 .../gfx950/1600_vector_l1_data_cache.yaml     |  278 +-
 .../gfx950/1700_l2_cache.yaml                 |  519 +--
 .../gfx950/1800_l2_cache_per_channel.yaml     |    8 +-
 .../gfx950/2100_pc_sampling.yaml              |    2 +-
 .../src/rocprof_compute_soc/soc_base.py       |   14 +
 .../rocprofiler-compute/src/utils/file_io.py  |    2 +-
 projects/rocprofiler-compute/src/utils/tty.py |   62 +-
 .../rocprofiler-compute/src/utils/utils.py    |   15 +
 .../rocprofiler-compute/tests/conftest.py     |    6 +
 .../tests/test_autogen_config.py              |    2 +-
 .../rocprofiler-compute/tools/__init__.py     |    0
 .../tools/autogen_hash.yaml                   |  116 +
 .../{utils => tools}/build.sh                 |    0
 .../config_management/.config_hashes.json     |  142 +
 .../tools/config_management/README.md         |  500 +++
 .../tools/config_management/__init__.py       |    0
 .../config_management/apply_config_deltas.py  |  258 ++
 .../config_management/config_workflow.yaml    |   31 +
 .../config_management/delta_template.yaml     |  164 +
 .../generate_config_deltas.py                 |  360 ++
 .../gfx9_config_template.yaml                 |  260 ++
 .../tools/config_management/hash_checker.py   |  215 ++
 .../tools/config_management/hash_manager.py   |  279 ++
 .../master_config_workflow_script.py          | 1014 ++++++
 .../metric_description_manager.py             |  515 +++
 .../parse_config_template.py                  |  113 +
 .../tests/test_config_workflow.py             |  417 +++
 .../tools/config_management/utils.py          |   52 +
 .../verify_against_config_template.py         |  227 ++
 .../gfx908_metrics_description.yaml           | 1791 ++++++++++
 .../gfx90a_metrics_description.yaml           | 2035 +++++++++++
 .../gfx940_metrics_description.yaml           | 2040 +++++++++++
 .../gfx941_metrics_description.yaml           | 2040 +++++++++++
 .../gfx942_metrics_description.yaml           | 2043 +++++++++++
 .../gfx950_metrics_description.yaml           | 2309 ++++++++++++
 .../{utils => tools}/run-ci.py                |    0
 .../{utils => tools}/split_config.py          |   53 +-
 .../{utils => tools}/unified_config.yaml      | 3122 +++++++++--------
 .../{utils => tools}/unified_sets.yaml        |    2 +-
 .../{utils => tools}/update-coverage.sh       |    2 +-
 .../{utils => tools}/update_license.py        |    0
 .../{utils => tools}/ver_check.py             |    0
 .../utils/autogen_hash.yaml                   |  116 -
 165 files changed, 31522 insertions(+), 10056 deletions(-)
 create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/config_delta/gfx950_diff.yaml
 create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/config_delta/gfx950_diff.yaml
 create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/config_delta/gfx950_diff.yaml
 create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/config_delta/gfx950_diff.yaml
 create mode 100644 projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/config_delta/gfx950_diff.yaml
 create mode 100644 projects/rocprofiler-compute/tools/__init__.py
 create mode 100644 projects/rocprofiler-compute/tools/autogen_hash.yaml
 rename projects/rocprofiler-compute/{utils => tools}/build.sh (100%)
 create mode 100644 projects/rocprofiler-compute/tools/config_management/.config_hashes.json
 create mode 100644 projects/rocprofiler-compute/tools/config_management/README.md
 create mode 100644 projects/rocprofiler-compute/tools/config_management/__init__.py
 create mode 100644 projects/rocprofiler-compute/tools/config_management/apply_config_deltas.py
 create mode 100644 projects/rocprofiler-compute/tools/config_management/config_workflow.yaml
 create mode 100644 projects/rocprofiler-compute/tools/config_management/delta_template.yaml
 create mode 100644 projects/rocprofiler-compute/tools/config_management/generate_config_deltas.py
 create mode 100644 projects/rocprofiler-compute/tools/config_management/gfx9_config_template.yaml
 create mode 100644 projects/rocprofiler-compute/tools/config_management/hash_checker.py
 create mode 100644 projects/rocprofiler-compute/tools/config_management/hash_manager.py
 create mode 100644 projects/rocprofiler-compute/tools/config_management/master_config_workflow_script.py
 create mode 100644 projects/rocprofiler-compute/tools/config_management/metric_description_manager.py
 create mode 100644 projects/rocprofiler-compute/tools/config_management/parse_config_template.py
 create mode 100644 projects/rocprofiler-compute/tools/config_management/tests/test_config_workflow.py
 create mode 100644 projects/rocprofiler-compute/tools/config_management/utils.py
 create mode 100644 projects/rocprofiler-compute/tools/config_management/verify_against_config_template.py
 create mode 100644 projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx908_metrics_description.yaml
 create mode 100644 projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx90a_metrics_description.yaml
 create mode 100644 projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx940_metrics_description.yaml
 create mode 100644 projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx941_metrics_description.yaml
 create mode 100644 projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx942_metrics_description.yaml
 create mode 100644 projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx950_metrics_description.yaml
 rename projects/rocprofiler-compute/{utils => tools}/run-ci.py (100%)
 rename projects/rocprofiler-compute/{utils => tools}/split_config.py (87%)
 rename projects/rocprofiler-compute/{utils => tools}/unified_config.yaml (92%)
 rename projects/rocprofiler-compute/{utils => tools}/unified_sets.yaml (99%)
 rename projects/rocprofiler-compute/{utils => tools}/update-coverage.sh (99%)
 rename projects/rocprofiler-compute/{utils => tools}/update_license.py (100%)
 rename projects/rocprofiler-compute/{utils => tools}/ver_check.py (100%)
 delete mode 100644 projects/rocprofiler-compute/utils/autogen_hash.yaml

diff --git a/projects/rocprofiler-compute/.pre-commit-config.yaml b/projects/rocprofiler-compute/.pre-commit-config.yaml
index 89906b129f..10c643321f 100644
--- a/projects/rocprofiler-compute/.pre-commit-config.yaml
+++ b/projects/rocprofiler-compute/.pre-commit-config.yaml
@@ -7,12 +7,23 @@ repos:
       - id: check-yaml
       - id: end-of-file-fixer
       - id: trailing-whitespace
-    # Python import sorting and formatting
+
+  # Python import sorting and formatting
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    # Ruff version. Check https://github.com/astral-sh/ruff-pre-commit#version-compatibility,
+    # Ruff version. Check https://github.com/astral-sh/ruff-pre-commit#version-compatibility
     # for the latest ruff version supported by the hook.
     rev: v0.12.12
     hooks:
       - id: ruff-check
-        args: [--fix, --exit-non-zero-on-fix]
-      - id: ruff-format
\ No newline at end of file
+        args: [--fix]
+      - id: ruff-format
+
+  # Local hook: hash consistency check
+  - repo: local
+    hooks:
+      - id: hash-check
+        name: Hash consistency check
+        entry: bash -lc 'cd projects/rocprofiler-compute && python3 tools/config_management/hash_checker.py'
+        language: system
+        pass_filenames: false
+        stages: [pre-commit]
diff --git a/projects/rocprofiler-compute/CHANGELOG.md b/projects/rocprofiler-compute/CHANGELOG.md
index abf6de3ac4..9031b4bc63 100644
--- a/projects/rocprofiler-compute/CHANGELOG.md
+++ b/projects/rocprofiler-compute/CHANGELOG.md
@@ -5,8 +5,12 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.
 ## Unreleased
 
 ### Added
+* Add `--list-blocks <arch>` option to general options to list available IP blocks on the specified arch (similar to `--list-metrics`). It cannot be used with `--block`.
+* Added `config_delta/gfx950_diff.yaml` to analysis config yamls to track the differences between each gfx9 architecture and the latest supported architecture, gfx950.
 
 ### Changed
+* `-b/--block` accepts block alias(es) (See block aliases using command-line option `--list-blocks <arch>`).
+* Analysis config YAMLs are now managed with the new config management workflow in `tools/config_management/`.
 
 ### Removed
 
diff --git a/projects/rocprofiler-compute/CMakeLists.txt b/projects/rocprofiler-compute/CMakeLists.txt
index 18b5e79f3d..f758b04b4d 100644
--- a/projects/rocprofiler-compute/CMakeLists.txt
+++ b/projects/rocprofiler-compute/CMakeLists.txt
@@ -400,18 +400,6 @@ add_test(
     WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
 )
 
-# ---------------------------
-# DB Connector tests
-# ---------------------------
-
-add_test(
-    NAME test_db_connector
-    COMMAND
-        ${Python3_EXECUTABLE} -m pytest --junitxml=tests/test_db_connector.xml
-        ${COV_OPTION} ${PROJECT_SOURCE_DIR}/tests/test_db_connector.py
-    WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
-)
-
 # ---------------------------
 # Utils tests
 # ---------------------------
@@ -547,6 +535,13 @@ install(
     COMPONENT main
     PATTERN "__pycache__" EXCLUDE
 )
+# tools/config_management
+install(
+    DIRECTORY tools/config_management
+    DESTINATION ${CMAKE_INSTALL_LIBEXECDIR}/${PROJECT_NAME}
+    COMPONENT main
+    PATTERN "__pycache__" EXCLUDE
+)
 # grafana assets
 install(
     DIRECTORY grafana
@@ -586,10 +581,10 @@ install(
 add_custom_target(
     license
     COMMAND
-        ${PROJECT_SOURCE_DIR}/utils/update_license.py --source ${PROJECT_SOURCE_DIR}/src
+        ${PROJECT_SOURCE_DIR}/tools/update_license.py --source ${PROJECT_SOURCE_DIR}/src
         --license ${PROJECT_SOURCE_DIR}/LICENSE.md --extension '.py'
     COMMAND
-        ${PROJECT_SOURCE_DIR}/utils/update_license.py --source ${PROJECT_SOURCE_DIR}
+        ${PROJECT_SOURCE_DIR}/tools/update_license.py --source ${PROJECT_SOURCE_DIR}
         --license ${PROJECT_SOURCE_DIR}/LICENSE.md --file
         "src/${PACKAGE_NAME},cmake/Dockerfile,cmake/rocm_install.sh,docker/docker-entrypoint.sh,src/rocprof_compute_analyze/convertor/mongodb/convert"
 )
diff --git a/projects/rocprofiler-compute/CONTRIBUTING.md b/projects/rocprofiler-compute/CONTRIBUTING.md
index 8b6dfd6eed..d7e7c4d714 100644
--- a/projects/rocprofiler-compute/CONTRIBUTING.md
+++ b/projects/rocprofiler-compute/CONTRIBUTING.md
@@ -190,4 +190,13 @@ Any future contributions should adhere to these guidelines:
 
 ### Build and test documentation changes
 
-For instructions on how to build and test documentation changes (files under docs folder), please see https://rocm.docs.amd.com/en/latest/contribute/contributing.html
\ No newline at end of file
+For instructions on how to build and test documentation changes (files under docs folder), please see https://rocm.docs.amd.com/en/latest/contribute/contributing.html
+
+
+## Metrics Management
+
+If your PR touches **metric configs** (panel YAMLs under `src/rocprof_compute_soc/analysis_configs/gfx<arch>/*.yaml`, config deltas, or metric descriptions in `docs/data/metrics_description.yaml`), please follow the metric management workflow summarized here:
+- Edit the panel YAMLs and, when appropriate, generate/apply a delta and (optionally) promote a new architecture using the [workflow script](./tools/config_management/master_config_workflow_script.py).
+- Verify hashes are updated and CI tests pass.
+
+For full details, see the [metric config management README](./tools/config_management/README.md).
diff --git a/projects/rocprofiler-compute/coverage/README.md b/projects/rocprofiler-compute/coverage/README.md
index 52ab47cf58..95d37c4861 100644
--- a/projects/rocprofiler-compute/coverage/README.md
+++ b/projects/rocprofiler-compute/coverage/README.md
@@ -13,7 +13,7 @@ monorepo/
 │       ├── CMakeLists.txt
 │       ├── coverage/
 │       │   └── coverage-latest.xml  # committed coverage file
-│       ├── utils/
+│       ├── tools/
 │       │   ├── update_coverage.sh  # coverage generation/update script
 │       │   └── run-ci.py             # CDash upload script
 │       └── ...
@@ -31,7 +31,7 @@ Run this periodically to update the coverage baseline:
 ```bash
 # From monorepo root
 cd projects/rocprofiler-compute
-./utils/update_coverage.sh
+./tools/update-coverage.sh
 
 # This will:
 # - Build with coverage enabled
@@ -74,4 +74,4 @@ pip install coverage pytest pytest-cov
 #verify tests can run
 cd projects/rocprofiler-compute/build
 ctest --verbose
-```
\ No newline at end of file
+```
diff --git a/projects/rocprofiler-compute/docs/data/metrics_description.yaml b/projects/rocprofiler-compute/docs/data/metrics_description.yaml
index 12eb28816a..25635f32b8 100644
--- a/projects/rocprofiler-compute/docs/data/metrics_description.yaml
+++ b/projects/rocprofiler-compute/docs/data/metrics_description.yaml
@@ -1,9 +1,10 @@
 # AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py
 Wavefront launch stats:
   AGPRs:
-    rst: 'The number of accumulation vector general-purpose registers allocated for  the
-      kernel, see :ref:`AGPRs <desc-agprs>`.  Note: this may not exactly  match the
-      number of AGPRs requested by the compiler due to allocation  granularity.'
+    rst: |-
+      The number of accumulation vector general-purpose registers allocated
+      for the kernel, see :ref:`AGPRs <desc-agprs>`. Note: this may not exactly match
+      the number of AGPRs requested by the compiler due to allocation granularity.
     unit: AGPRs
   Grid Size:
     rst: The total number of work-items (or, threads) launched as a part of the kernel
@@ -11,39 +12,43 @@ Wavefront launch stats:
       total workgroup (or, block) size.
     unit: Work-Items
   LDS Allocation:
-    rst: 'The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared  memory)
-      allocated for this kernel.  Note: This may also be larger than  what was requested
-      at compile time due to both allocation granularity and  dynamic per-dispatch
-      LDS allocations.'
+    rst: |-
+      The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory)
+      allocated for this kernel. Note: This may also be larger than what was requested
+      at compile time due to both allocation granularity and dynamic per-dispatch
+      LDS allocations.
     unit: Bytes per workgroup
   Restored Wavefronts:
-    rst: The total number of wavefronts restored from a context-save. See  `cwsr_enable
+    rst: The total number of wavefronts restored from a context-save. See `cwsr_enable
       <https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr>`_.
     unit: Wavefronts
   SGPRs:
-    rst: 'The number of scalar general-purpose registers allocated for the kernel,  see
-      :ref:`SALU <desc-salu>`.  Note: this may not exactly match the number  of SGPRs
-      requested by the compiler due to allocation granularity. plain'
+    rst: |-
+      The number of scalar general-purpose registers allocated for the kernel, see
+      :ref:`SALU <desc-salu>`. Note: this may not exactly match the number of
+      SGPRs requested by the compiler due to allocation granularity.
     unit: SGPRs
   Saved Wavefronts:
-    rst: The total number of wavefronts saved at a context-save. See  `cwsr_enable
+    rst: The total number of wavefronts saved at a context-save. See `cwsr_enable
       <https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr>`_.
     unit: Wavefronts
   Scratch Allocation:
-    rst: The number of bytes of :ref:`scratch memory <memory-spaces>` requested  per
-      work-item for this kernel. Scratch memory is used for stack memory  on the accelerator,
+    rst: The number of bytes of :ref:`scratch memory <memory-spaces>` requested per
+      work-item for this kernel. Scratch memory is used for stack memory on the accelerator,
       as well as for register spills and restores.
     unit: Bytes per work-item
   Total Wavefronts:
-    rst: "The total number of wavefronts launched as part of the kernel dispatch.\
-      \ On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront\
-      \ size is always 64 work-items. Thus, the total number of wavefronts should\
-      \ be equivalent to the ceiling of grid size divided by 64."
+    rst: |-
+      The total number of wavefronts launched as part of the kernel dispatch.
+      On AMD Instinct™ CDNA™ accelerators and GCN™ GPUs, the wavefront
+      size is always 64 work-items. Thus, the total number of wavefronts should
+      be equivalent to the ceiling of grid size divided by 64.
     unit: Wavefronts
   VGPRs:
-    rst: 'The number of architected vector general-purpose registers allocated for  the
-      kernel, see :ref:`VALU <desc-valu>`.  Note: this may not exactly  match the
-      number of VGPRs requested by the compiler due to allocation  granularity.'
+    rst: |-
+      The number of architected vector general-purpose registers allocated for the
+      kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the
+      number of VGPRs requested by the compiler due to allocation granularity.
     unit: VGPRs
   Workgroup Size:
     rst: The total number of work-items (or, threads) in each workgroup (or, block)
@@ -52,39 +57,39 @@ Wavefront launch stats:
     unit: Work-Items
 Wavefront runtime stats:
   Active Cycles:
-    rst: The average number of cycles a wavefront in the kernel dispatch was  actively
-      executing instructions per  :ref:`normalization unit <normalization-units>`.
-      This measurement is made  on a per-wavefront basis, and may include cycles that
-      another wavefront  spent actively executing (on another execution unit, for
-      example) or was  stalled.  As such, it is most useful to get a sense of how
-      waves were  spending their time, rather than identification of a precise limiter.
-      The  sum of this metric, Issue Wait Cycles and Active Wait Cycles should be  equal
-      to the total Wave Cycles metric.
+    rst: The average number of cycles a wavefront in the kernel dispatch was actively
+      executing instructions per :ref:`normalization unit <normalization-units>`.
+      This measurement is made on a per-wavefront basis, and may include cycles that
+      another wavefront spent actively executing (on another execution unit, for example)
+      or was stalled. As such, it is most useful to get a sense of how waves were
+      spending their time, rather than identification of a precise limiter. The sum
+      of this metric, Issue Wait Cycles and Active Wait Cycles should be equal to
+      the total Wave Cycles metric.
     unit: Cycles per normalization unit
   Dependency Wait Cycles:
-    rst: The number of cycles a wavefront in the kernel dispatch stalled waiting  on
-      memory of any kind (e.g., instruction fetch, vector or scalar memory,  etc.)
-      per :ref:`normalization unit <normalization-units>`. This counter  is incremented
-      at every cycle by *all* wavefronts on a CU stalled at a  memory operation.  As
-      such, it is most useful to get a sense of how waves  were spending their time,
-      rather than identification of a precise limiter  because another wave could
-      be actively executing while a wave is stalled.  The sum of this metric, Issue
-      Wait Cycles and Active Cycles should be  equal to the total Wave Cycles metric.
+    rst: The number of cycles a wavefront in the kernel dispatch stalled waiting on
+      memory of any kind (e.g., instruction fetch, vector or scalar memory, etc.)
+      per :ref:`normalization unit <normalization-units>`. This counter is incremented
+      at every cycle by *all* wavefronts on a CU stalled at a memory operation. As
+      such, it is most useful to get a sense of how waves were spending their time,
+      rather than identification of a precise limiter because another wave could be
+      actively executing while a wave is stalled. The sum of this metric, Issue Wait
+      Cycles and Active Cycles should be equal to the total Wave Cycles metric.
     unit: Cycles per normalization unit
   Instructions per wavefront:
     rst: The average number of instructions (of all types) executed per wavefront.
       This is averaged over all wavefronts in a kernel dispatch.
     unit: Instructions per wavefront
   Issue Wait Cycles:
-    rst: The number of cycles a wavefront in the kernel dispatch was unable to  issue
-      an instruction for any reason (e.g., execution pipe back-pressure,  arbitration
-      loss, etc.) per  :ref:`normalization unit <normalization-units>`.  This counter
-      is  incremented at every cycle by *all* wavefronts on a CU unable to issue an  instruction.  As
-      such, it is most useful to get a sense of how waves were  spending their time,
-      rather than identification of a precise limiter  because another wave could
-      be actively executing while a wave is issue  stalled.  The sum of this metric,
-      Dependency Wait Cycles and Active  Cycles should be equal to the total Wave
-      Cycles metric.
+    rst: The number of cycles a wavefront in the kernel dispatch was unable to issue
+      an instruction for any reason (e.g., execution pipe back-pressure, arbitration
+      loss, etc.) per :ref:`normalization unit <normalization-units>`. This counter
+      is incremented at every cycle by *all* wavefronts on a CU unable to issue an
+      instruction. As such, it is most useful to get a sense of how waves were spending
+      their time, rather than identification of a precise limiter because another
+      wave could be actively executing while a wave is issue stalled. The sum of this
+      metric, Dependency Wait Cycles and Active Cycles should be equal to the total
+      Wave Cycles metric.
     unit: Cycles per normalization unit
   Kernel Time:
     rst: The total duration of the executed kernel.
@@ -93,24 +98,26 @@ Wavefront runtime stats:
     rst: The total duration of the executed kernel in cycles.
     unit: Cycles
   Wave Cycles:
-    rst: 'The number of cycles a wavefront in the kernel dispatch spent resident on  a
-      compute unit per :ref:`normalization unit <normalization-units>`. This  is averaged
-      over all wavefronts in a kernel dispatch.  Note: this should  not be directly
-      compared to the kernel cycles above.'
+    rst: |-
+      The number of cycles a wavefront in the kernel dispatch spent resident
+      on a compute unit per :ref:`normalization unit <normalization-units>`. This is
+      averaged over all wavefronts in a kernel dispatch. Note: this should not
+      be directly compared to the kernel cycles above.
     unit: Cycles per normalization unit
   Wavefront Occupancy:
-    rst: 'The time-averaged number of wavefronts resident on the accelerator over  the
-      lifetime of the kernel. Note: this metric may be inaccurate for  short-running
-      kernels (less than 1ms).'
+    rst: |-
+      The time-averaged number of wavefronts resident on the accelerator over the
+      lifetime of the kernel. Note: this metric may be inaccurate for short-running
+      kernels (less than 1ms).
     unit: Wavefronts
 Overall instruction mix:
   Branch:
-    rst: The total number of branch operations issued. These typically consist of  jump
-      or branch operations and are used to implement control flow.
+    rst: The total number of branch operations issued. These typically consist of
+      jump or branch operations and are used to implement control flow.
     unit: Instructions
   LDS:
-    rst: The total number of LDS (also known as shared memory) operations issued.  These
-      include loads, stores, atomics, and HIP's ``__shfl`` operations.
+    rst: The total number of LDS (also known as shared memory) operations issued.
+      These include loads, stores, atomics, and HIP's ``__shfl`` operations.
     unit: Instructions
   MFMA:
     rst: The total number of matrix fused multiply-add instructions issued.
@@ -123,264 +130,273 @@ Overall instruction mix:
       section.
     unit: Instructions
   SMEM:
-    rst: The total number of scalar memory (SMEM) operations issued. These are  typically
-      used for loading kernel arguments, base-pointers and loads  from HIP's ``__constant__``
+    rst: The total number of scalar memory (SMEM) operations issued. These are typically
+      used for loading kernel arguments, base-pointers and loads from HIP's ``__constant__``
       memory.
     unit: Instructions
   VALU:
-    rst: The total number of vector arithmetic logic unit (VALU) operations  issued.
-      These are the workhorses of the  :doc:`compute unit <compute-unit>`, and are
-      used to execute a wide range of  instruction types including floating point
-      operations, non-uniform  address calculations, transcendental operations, integer
-      operations,  shifts, conditional evaluation, etc.
+    rst: The total number of vector arithmetic logic unit (VALU) operations issued.
+      These are the workhorses of the :doc:`compute unit <compute-unit>`, and are
+      used to execute a wide range of instruction types including floating point operations,
+      non-uniform address calculations, transcendental operations, integer operations,
+      shifts, conditional evaluation, etc.
     unit: Instructions
   VMEM:
-    rst: The total number of vector memory operations issued. These include most  loads,
-      stores and atomic operations and all accesses to  :ref:`generic, global, private
+    rst: The total number of vector memory operations issued. These include most loads,
+      stores and atomic operations and all accesses to :ref:`generic, global, private
       and texture <memory-spaces>` memory.
     unit: Instructions
 VALU arithmetic instruction mix:
   Conversion:
-    rst: "The total number of type conversion instructions (such as converting data\
-      \  to or from F32\u2194F64) issued to the VALU per  :ref:`normalization unit\
-      \ <normalization-units>`."
-    unit: Instructions per normalization unit
-  F16-ADD:
-    rst: The total number of addition instructions operating on 16-bit  floating-point
-      operands issued to the VALU per  :ref:`normalization unit <normalization-units>`.
-    unit: Instructions per normalization unit
-  F16-FMA:
-    rst: The total number of fused multiply-add instructions operating on 16-bit  floating-point
-      operands issued to the VALU per  :ref:`normalization unit <normalization-units>`.
-    unit: Instructions per normalization unit
-  F16-MUL:
-    rst: The total number of multiplication instructions operating on 16-bit  floating-point
-      operands issued to the VALU per  :ref:`normalization unit <normalization-units>`.
-    unit: Instructions per normalization unit
-  F16-Trans:
-    rst: The total number of transcendental instructions (e.g., `sqrt`) operating  on
-      16-bit floating-point operands issued to the VALU per  :ref:`normalization unit
+    rst: |-
+      The total number of type conversion instructions (such as converting data
+      to or from F32↔F64) issued to the VALU per :ref:`normalization unit
       <normalization-units>`.
     unit: Instructions per normalization unit
+  F16-ADD:
+    rst: The total number of addition instructions operating on 16-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F16-FMA:
+    rst: The total number of fused multiply-add instructions operating on 16-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F16-MUL:
+    rst: The total number of multiplication instructions operating on 16-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F16-Trans:
+    rst: The total number of transcendental instructions (such as ``sqrt``) operating
+      on 16-bit floating-point operands issued to the VALU per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
   F32-ADD:
-    rst: The total number of addition instructions operating on 32-bit  floating-point
-      operands issued to the VALU per  :ref:`normalization unit <normalization-units>`.
+    rst: The total number of addition instructions operating on 32-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
     unit: Instructions per normalization unit
   F32-FMA:
-    rst: The total number of fused multiply-add instructions operating on 32-bit  floating-point
-      operands issued to the VALU per  :ref:`normalization unit <normalization-units>`.
+    rst: The total number of fused multiply-add instructions operating on 32-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
     unit: Instructions per normalization unit
   F32-MUL:
-    rst: The total number of multiplication instructions operating on 32-bit  floating-point
-      operands issued to the VALU per  :ref:`normalization unit <normalization-units>`.
+    rst: The total number of multiplication instructions operating on 32-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
     unit: Instructions per normalization unit
   F32-Trans:
-    rst: The total number of transcendental instructions (such as ``sqrt``)  operating
-      on 32-bit floating-point operands issued to the VALU per  :ref:`normalization
+    rst: The total number of transcendental instructions (such as ``sqrt``) operating
+      on 32-bit floating-point operands issued to the VALU per :ref:`normalization
       unit <normalization-units>`.
     unit: Instructions per normalization unit
   F64-ADD:
-    rst: The total number of addition instructions operating on 64-bit  floating-point
-      operands issued to the VALU per  :ref:`normalization unit <normalization-units>`.
+    rst: The total number of addition instructions operating on 64-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
     unit: Instructions per normalization unit
   F64-FMA:
-    rst: The total number of fused multiply-add instructions operating on 64-bit  floating-point
-      operands issued to the VALU per  :ref:`normalization unit <normalization-units>`.
+    rst: The total number of fused multiply-add instructions operating on 64-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
     unit: Instructions per normalization unit
   F64-MUL:
-    rst: The total number of multiplication instructions operating on 64-bit  floating-point
-      operands issued to the VALU per  :ref:`normalization unit <normalization-units>`.
+    rst: The total number of multiplication instructions operating on 64-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
     unit: Instructions per normalization unit
   F64-Trans:
-    rst: The total number of transcendental instructions (such as `sqrt`)  operating
-      on 64-bit floating-point operands issued to the VALU per  :ref:`normalization
+    rst: The total number of transcendental instructions (such as ``sqrt``) operating
+      on 64-bit floating-point operands issued to the VALU per :ref:`normalization
       unit <normalization-units>`.
     unit: Instructions per normalization unit
   INT32:
-    rst: The total number of instructions operating on 32-bit integer operands  issued
+    rst: The total number of instructions operating on 32-bit integer operands issued
       to the VALU per :ref:`normalization unit <normalization-units>`.
     unit: Instructions per normalization unit
   INT64:
-    rst: The total number of instructions operating on 64-bit integer operands  issued
+    rst: The total number of instructions operating on 64-bit integer operands issued
       to the VALU per :ref:`normalization unit <normalization-units>`.
     unit: Instructions per normalization unit
 MFMA instruction mix:
   MFMA-BF16:
-    rst: The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`  instructions
+    rst: The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>` instructions
       issued per :ref:`normalization unit <normalization-units>`.
     unit: Instructions per normalization unit
   MFMA-F16:
-    rst: The total number of 16-bit floating point :ref:`MFMA <desc-mfma>`  instructions
+    rst: The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` instructions
       issued per :ref:`normalization unit <normalization-units>`.
     unit: Instructions per normalization unit
   MFMA-F32:
-    rst: The total number of 32-bit floating-point :ref:`MFMA <desc-mfma>`  instructions
+    rst: The total number of 32-bit floating-point :ref:`MFMA <desc-mfma>` instructions
       issued per :ref:`normalization unit <normalization-units>`.
     unit: Instructions per normalization unit
   MFMA-F64:
-    rst: The total number of 64-bit floating-point :ref:`MFMA <desc-mfma>`  instructions
+    rst: The total number of 64-bit floating-point :ref:`MFMA <desc-mfma>` instructions
       issued per :ref:`normalization unit <normalization-units>`.
     unit: Instructions per normalization unit
   MFMA-F8:
-    rst: The total number of 8-bit floating point :ref:`MFMA <desc-mfma>` instructions  issued
-      per :ref:`normalization unit <normalization-units>`. This is supported in AMD
-      Instinct MI300 series and later only.
+    rst: The total number of 8-bit floating point :ref:`MFMA <desc-mfma>` instructions
+      issued per :ref:`normalization unit <normalization-units>`. This is supported
+      in AMD Instinct MI300 series and later only.
     unit: Instructions per normalization unit
   MFMA-I8:
-    rst: The total number of 8-bit integer :ref:`MFMA <desc-mfma>` instructions  issued
+    rst: The total number of 8-bit integer :ref:`MFMA <desc-mfma>` instructions issued
       per :ref:`normalization unit <normalization-units>`.
     unit: Instructions per normalization unit
 Compute Speed-of-Light:
   MFMA FLOPs (BF16):
-    rst: 'The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`  operations
-      executed per second. Note: this does not include any 16-bit  brain floating
-      point operations from :ref:`VALU <desc-valu>`  instructions. This is also presented
-      as a percent of the peak theoretical  BF16 MFMA operations achievable on the
-      specific accelerator.'
+    rst: |-
+      The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 16-bit brain floating
+      point operations from :ref:`VALU <desc-valu>` instructions. This is also
+      presented as a percent of the peak theoretical BF16 MFMA operations achievable
+      on the specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F16):
-    rst: 'The total number of 16-bit floating point :ref:`MFMA <desc-mfma>`  operations
-      executed per second. Note: this does not include any 16-bit  floating point
-      operations from :ref:`VALU <desc-valu>` instructions. This  is also presented
-      as a percent of the peak theoretical F16 MFMA  operations achievable on the
-      specific accelerator.'
+    rst: |-
+      The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 16-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F16 MFMA operations achievable on the
+      specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F32):
-    rst: 'The total number of 32-bit floating point :ref:`MFMA <desc-mfma>`  operations
-      executed per second. Note: this does not include any 32-bit  floating point
-      operations from :ref:`VALU <desc-valu>` instructions. This  is also presented
-      as a percent of the peak theoretical F32 MFMA  operations achievable on the
-      specific accelerator.'
+    rst: |-
+      The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 32-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F32 MFMA operations achievable on the
+      specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F64):
-    rst: 'The total number of 64-bit floating point :ref:`MFMA <desc-mfma>`  operations
-      executed per second. Note: this does not include any 64-bit  floating point
-      operations from :ref:`VALU <desc-valu>` instructions. This  is also presented
-      as a percent of the peak theoretical F64 MFMA  operations achievable on the
-      specific accelerator.  The total number of 64-bit floating point :ref:`MFMA
-      <desc-mfma>`  operations executed per second. Note: this does not include any
-      64-bit  floating point operations from :ref:`VALU <desc-valu>` instructions.
-      This  is also presented as a percent of the peak theoretical F64 MFMA  operations
-      achievable on the specific accelerator.'
+    rst: |-
+      The total number of 64-bit floating point
+      :ref:`MFMA <desc-mfma>` operations
+      executed per second.
+      Note: this does not include any 64-bit
+      floating point operations from
+      :ref:`VALU <desc-valu>` instructions.
+      This is also presented as a percent of the
+      peak theoretical F64 MFMA operations
+      achievable on the specific accelerator.
     unit: GFLOPs
   MFMA IOPs (INT8):
-    rst: 'The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations  executed
-      per second. Note: this does not include any 8-bit integer  operations from :ref:`VALU
-      <desc-valu>` instructions. This is also  presented as a percent of the peak
-      theoretical INT8 MFMA operations  achievable on the specific accelerator.'
+    rst: |-
+      The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
+      per second. Note: this does not include any 8-bit integer operations from
+      :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
+      of the peak theoretical INT8 MFMA operations achievable on the specific accelerator.
     unit: GFLOPs
   VALU FLOPs:
-    rst: 'The total floating-point operations executed per second on the  :ref:`VALU
-      <desc-valu>`. This is also presented as a percent of the peak  theoretical FLOPs
-      achievable on the specific accelerator. Note: this does  not include any floating-point
-      operations from :ref:`MFMA <desc-mfma>`  instructions.'
+    rst: |-
+      The total floating-point operations executed per second on the :ref:`VALU
+      <desc-valu>`. This is also presented as a percent of the peak theoretical
+      FLOPs achievable on the specific accelerator. Note: this does not include
+      any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.
     unit: GFLOPs
   VALU IOPs:
-    rst: 'The total integer operations executed per second on the  :ref:`VALU <desc-valu>`.
-      This is also presented as a percent of the peak  theoretical IOPs achievable
-      on the specific accelerator. Note: this does  not include any integer operations
-      from :ref:`MFMA <desc-mfma>`  instructions.'
+    rst: |-
+      The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
+      This is also presented as a percent of the peak theoretical IOPs achievable
+      on the specific accelerator. Note: this does not include any integer operations
+      from :ref:`MFMA <desc-mfma>` instructions.
     unit: GIOPs
 Pipeline statistics:
   Branch Utilization:
-    rst: Indicates what percent of the kernel's duration the  :ref:`branch <desc-branch>`
-      unit was busy executing instructions.  Computed as the ratio of the total number
-      of cycles spent by the  :ref:`scheduler <desc-scheduler>` issuing branch instructions
-      over the  :ref:`total CU cycles <total-cu-cycles>`.
+    rst: Indicates what percent of the kernel's duration the :ref:`branch <desc-branch>`
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles spent by the :ref:`scheduler <desc-scheduler>` issuing branch instructions
+      over the :ref:`total CU cycles <total-cu-cycles>`.
     unit: Percent
   IPC:
-    rst: The ratio of the total number of instructions executed on the  :doc:`CU <compute-unit>`
-      over the  :ref:`total active CU cycles <total-active-cu-cycles>`.
+    rst: The ratio of the total number of instructions executed on the :doc:`CU <compute-unit>`
+      over the :ref:`total active CU cycles <total-active-cu-cycles>`.
     unit: Instructions per cycle
   IPC (Issued):
-    rst: The ratio of the total number of  (non-:ref:`internal <ipc-internal-instructions>`)
-      instructions issued over  the number of cycles where the :ref:`scheduler <desc-scheduler>`
-      was  actively working on issuing instructions. Refer to the  :ref:`Issued IPC
+    rst: The ratio of the total number of (non-:ref:`internal <ipc-internal-instructions>`)
+      instructions issued over the number of cycles where the :ref:`scheduler <desc-scheduler>`
+      was actively working on issuing instructions. Refer to the :ref:`Issued IPC
       <issued-ipc>` example for further detail.
     unit: Instructions per cycle
   MFMA Instruction Cycles:
-    rst: The average duration of :ref:`MFMA <desc-mfma>` instructions in this  kernel
-      in cycles. Computed as the ratio of the total number of cycles the  MFMA unit
-      was busy over the total number of MFMA instructions. Compare  to, for example,
-      the  `AMD Matrix Instruction Calculator <https://github.com/RadeonOpenCompute/amd_matrix_instruction_calculator>`_.
+    rst: The average duration of :ref:`MFMA <desc-mfma>` instructions in this kernel
+      in cycles. Computed as the ratio of the total number of cycles the MFMA unit
+      was busy over the total number of MFMA instructions. Compare to, for example,
+      the `AMD Matrix Instruction Calculator <https://github.com/RadeonOpenCompute/amd_matrix_instruction_calculator>`_.
     unit: Cycles per instruction
   MFMA Utilization:
-    rst: Indicates what percent of the kernel's duration the  :ref:`MFMA <desc-mfma>`
-      unit was busy executing instructions. Computed as  the ratio of the total number
-      of cycles spent by the  :ref:`MFMA <desc-salu>` was busy over the  :ref:`total
+    rst: Indicates what percent of the kernel's duration the :ref:`MFMA <desc-mfma>`
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles the :ref:`MFMA <desc-mfma>` unit was busy over the :ref:`total
       CU cycles <total-cu-cycles>`.
     unit: Percent
   SALU Utilization:
-    rst: Indicates what percent of the kernel's duration the  :ref:`SALU <desc-salu>`
-      was busy executing instructions. Computed as the  ratio of the total number
-      of cycles spent by the  :ref:`scheduler <desc-scheduler>` issuing SALU / :ref:`SMEM
-      <desc-smem>`  instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    rst: Indicates what percent of the kernel's duration the :ref:`SALU <desc-salu>`
+      was busy executing instructions. Computed as the ratio of the total number of
+      cycles spent by the :ref:`scheduler <desc-scheduler>` issuing SALU / :ref:`SMEM
+      <desc-smem>` instructions over the :ref:`total CU cycles <total-cu-cycles>`.
     unit: Percent
   SMEM Latency:
-    rst: The average number of round-trip cycles (that is, from issue to data  return
+    rst: The average number of round-trip cycles (that is, from issue to data return
       / acknowledgment) required for a SMEM instruction to complete.
     unit: Cycles
   VALU Active Threads:
-    rst: Indicates the average level of :ref:`divergence <desc-divergence>` within  a
-      wavefront over the lifetime of the kernel. The number of work-items  that were
-      active in a wavefront during execution of each  :ref:`VALU <desc-valu>` instruction,
-      time-averaged over all VALU  instructions run on all wavefronts in the kernel.
+    rst: Indicates the average level of :ref:`divergence <desc-divergence>` within
+      a wavefront over the lifetime of the kernel. The number of work-items that were
+      active in a wavefront during execution of each :ref:`VALU <desc-valu>` instruction,
+      time-averaged over all VALU instructions run on all wavefronts in the kernel.
     unit: Work-items
   VALU Utilization:
-    rst: Indicates what percent of the kernel's duration the  :ref:`VALU <desc-valu>`
-      was busy executing instructions. Does not include  :ref:`VMEM <desc-vmem>` operations.
-      Computed as the ratio of the total  number of cycles spent by the :ref:`scheduler
-      <desc-scheduler>` issuing  VALU instructions over the :ref:`total CU cycles
-      <total-cu-cycles>`.
+    rst: Indicates what percent of the kernel's duration the :ref:`VALU <desc-valu>`
+      was busy executing instructions. Does not include :ref:`VMEM <desc-vmem>` operations.
+      Computed as the ratio of the total number of cycles spent by the :ref:`scheduler
+      <desc-scheduler>` issuing VALU instructions over the :ref:`total CU cycles <total-cu-cycles>`.
     unit: Percent
   VMEM Latency:
-    rst: The average number of round-trip cycles (that is, from issue to data  return
+    rst: The average number of round-trip cycles (that is, from issue to data return
       / acknowledgment) required for a VMEM instruction to complete.
     unit: Cycles
   VMEM Utilization:
-    rst: Indicates what percent of the kernel's duration the  :ref:`VMEM <desc-vmem>`
-      unit was busy executing instructions, including  both global/generic and spill/scratch
-      operations (see the  :ref:`VMEM instruction count metrics <ta-instruction-counts>`
-      for more  detail).  Does not include :ref:`VALU <desc-valu>` operations. Computed  as
-      the ratio of the total number of cycles spent by the  :ref:`scheduler <desc-scheduler>`
-      issuing VMEM instructions over the  :ref:`total CU cycles <total-cu-cycles>`.
+    rst: Indicates what percent of the kernel's duration the :ref:`VMEM <desc-vmem>`
+      unit was busy executing instructions, including both global/generic and spill/scratch
+      operations (see the :ref:`VMEM instruction count metrics <ta-instruction-counts>`
+      for more detail). Does not include :ref:`VALU <desc-valu>` operations. Computed
+      as the ratio of the total number of cycles spent by the :ref:`scheduler <desc-scheduler>`
+      issuing VMEM instructions over the :ref:`total CU cycles <total-cu-cycles>`.
     unit: Percent
 Arithmetic operations:
   BF16 OPs:
-    rst: 'The total number of 16-bit brain floating-point operations executed on either
-      the  :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per  :ref:`normalization
-      unit <normalization-units>`. Note: on current CDNA  accelerators, the VALU has
-      no native BF16 instructions.'
+    rst: |-
+      The total number of 16-bit brain floating-point operations executed on
+      either the :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
+      unit <normalization-units>`. Note: on current CDNA accelerators, the VALU
+      has no native BF16 instructions.
     unit: FLOP per normalization unit
   F16 OPs:
-    rst: The total number of 16-bit floating-point operations executed on either the  :ref:`VALU
-      <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per  :ref:`normalization unit
-      <normalization-units>`.
+    rst: The total number of 16-bit floating-point operations executed on either the
+      :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
+      unit <normalization-units>`.
     unit: FLOP per normalization unit
   F32 OPs:
-    rst: The total number of 32-bit floating-point operations executed on either  the
-      :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per  :ref:`normalization
+    rst: The total number of 32-bit floating-point operations executed on either the
+      :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
       unit <normalization-units>`.
     unit: FLOP per normalization unit
   F64 OPs:
-    rst: The total number of 64-bit floating-point operations executed on either  the
-      :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per  :ref:`normalization
+    rst: The total number of 64-bit floating-point operations executed on either the
+      :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
       unit <normalization-units>`.
     unit: FLOP per normalization unit
   FLOPs (Total):
-    rst: The total number of floating-point operations executed on either the  :ref:`VALU
-      <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per  :ref:`normalization unit
+    rst: The total number of floating-point operations executed on either the :ref:`VALU
+      <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
       <normalization-units>`.
     unit: FLOP per normalization unit
   INT8 OPs:
-    rst: 'The total number of 8-bit integer operations executed on either the  :ref:`VALU
-      <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per  :ref:`normalization unit
-      <normalization-units>`. Note: on current CDNA  accelerators, the VALU has no
-      native INT8 instructions.'
+    rst: |-
+      The total number of 8-bit integer operations executed on either the :ref:`VALU
+      <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
+      <normalization-units>`. Note: on current CDNA accelerators, the VALU has
+      no native INT8 instructions.
     unit: IOP per normalization unit
   IOPs (Total):
-    rst: The total number of integer operations executed on either the  :ref:`VALU
-      <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per  :ref:`normalization unit
+    rst: The total number of integer operations executed on either the :ref:`VALU
+      <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
       <normalization-units>`.
     unit: IOP per normalization unit
 LDS Speed-of-Light:
@@ -392,152 +408,156 @@ LDS Speed-of-Light:
       CU cycles <total-cu-cycles>`.
     unit: Percent
   Bank Conflict Rate:
-    rst: Indicates the percentage of active LDS cycles that were spent servicing  bank
-      conflicts. Calculated as the ratio of LDS cycles spent servicing  bank conflicts
-      over the number of LDS cycles that would have been  required to move the same
+    rst: Indicates the percentage of active LDS cycles that were spent servicing bank
+      conflicts. Calculated as the ratio of LDS cycles spent servicing bank conflicts
+      over the number of LDS cycles that would have been required to move the same
       amount of data in an uncontended access. [#lds-bank-conflict]_
     unit: Percent
   Theoretical Bandwidth Utilization:
-    rst: Indicates the maximum amount of bytes that could have been loaded from,  stored
+    rst: Indicates the maximum amount of bytes that could have been loaded from, stored
       to, or atomically updated in the LDS divided as percentage of theoretical peak.
-      Does *not* take into  account the execution mask of the wavefront when the instruction
-      was  executed. See the  :ref:`LDS bandwidth example <lds-bandwidth>` for more
+      Does *not* take into account the execution mask of the wavefront when the instruction
+      was executed. See the :ref:`LDS bandwidth example <lds-bandwidth>` for more
       detail.
     unit: Percent
   Utilization:
-    rst: Indicates what percent of the kernel's duration the :ref:`LDS <desc-lds>`  was
-      actively executing instructions (including, but not limited to, load,  store,
-      atomic and HIP's ``__shfl`` operations).  Calculated as the ratio  of the total
-      number of cycles LDS was active over the  :ref:`total CU cycles <total-cu-cycles>`.
+    rst: Indicates what percent of the kernel's duration the :ref:`LDS <desc-lds>`
+      was actively executing instructions (including, but not limited to, load, store,
+      atomic and HIP's ``__shfl`` operations). Calculated as the ratio of the total
+      number of cycles LDS was active over the :ref:`total CU cycles <total-cu-cycles>`.
     unit: Percent
 LDS Statistics:
   Addr Conflict:
-    rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>`  due
-      to address conflicts (as determined by the conflict resolution  hardware) per
+    rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due
+      to address conflicts (as determined by the conflict resolution hardware) per
       :ref:`normalization unit <normalization-units>`.
     unit: Cycles per normalization unit
   Atomic Return Cycles:
-    rst: The total number of cycles spent on LDS atomics with return per  :ref:`normalization
+    rst: The total number of cycles spent on LDS atomics with return per :ref:`normalization
       unit <normalization-units>`.
     unit: Cycles per normalization unit
   Bank Conflict:
-    rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>`  due
-      to bank conflicts (as determined by the conflict resolution hardware)  per :ref:`normalization
+    rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due
+      to bank conflicts (as determined by the conflict resolution hardware) per :ref:`normalization
       unit <normalization-units>`.
     unit: Cycles per normalization unit
   Bank Conflicts/Access:
-    rst: The ratio of the number of cycles spent in the  :ref:`LDS scheduler <desc-lds>`
-      due to bank conflicts (as determined by  the conflict resolution hardware) to
-      the base number of cycles that would  be spent in the LDS scheduler in a completely
-      uncontended case. This is  the unnormalized form of the Bank Conflict Rate.
+    rst: The ratio of the number of cycles spent in the :ref:`LDS scheduler <desc-lds>`
+      due to bank conflicts (as determined by the conflict resolution hardware) to
+      the base number of cycles that would be spent in the LDS scheduler in a completely
+      uncontended case. This is the unnormalized form of the Bank Conflict Rate.
     unit: Conflicts per Access
   Index Accesses:
-    rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>`  over
+    rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` over
       all operations per :ref:`normalization unit <normalization-units>`.
     unit: Cycles per normalization unit
   LDS Instructions:
-    rst: The total number of LDS instructions (including, but not limited to,  read/write/atomics
-      and HIP's ``__shfl`` instructions) executed per  :ref:`normalization unit <normalization-units>`.
+    rst: The total number of LDS instructions (including, but not limited to, read/write/atomics
+      and HIP's ``__shfl`` instructions) executed per :ref:`normalization unit <normalization-units>`.
     unit: Instructions per normalization unit
   LDS Latency:
-    rst: The average number of round-trip cycles (i.e., from issue to data-return  /
+    rst: The average number of round-trip cycles (i.e., from issue to data-return /
       acknowledgment) required for an LDS instruction to complete.
     unit: Cycles
   Mem Violations:
-    rst: "The total number of out-of-bounds accesses made to the LDS, per  :ref:`normalization\
-      \ unit <normalization-units>`. This is unused and  expected to be zero in most\
-      \ configurations for modern CDNA\u2122 accelerators."
+    rst: |-
+      The total number of out-of-bounds accesses made to the LDS, per :ref:`normalization
+      unit <normalization-units>`. This is unused and expected to be zero in
+      most configurations for modern CDNA™ accelerators.
     unit: Accesses per normalization unit
   Theoretical Bandwidth:
-    rst: Indicates the maximum amount of bytes that could have been loaded from,  stored
+    rst: Indicates the maximum amount of bytes that could have been loaded from, stored
       to, or atomically updated in the LDS divided by total duration. Does *not* take
-      into  account the execution mask of the wavefront when the instruction was  executed.
-      See the  :ref:`LDS bandwidth example <lds-bandwidth>` for more detail.
+      into account the execution mask of the wavefront when the instruction was executed.
+      See the :ref:`LDS bandwidth example <lds-bandwidth>` for more detail.
     unit: Gbps
   Unaligned Stall:
-    rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>`  due
-      to stalls from non-dword aligned addresses per  :ref:`normalization unit <normalization-units>`.
+    rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due
+      to stalls from non-dword aligned addresses per :ref:`normalization unit <normalization-units>`.
     unit: Cycles per normalization unit
 vL1D Speed-of-Light:
   Bandwidth Utilization:
-    rst: The number of bytes looked up in the vL1D cache as a result of  :ref:`VMEM
-      <desc-vmem>` instructions, as a percent of the peak  theoretical bandwidth achievable
-      on the specific accelerator. The number  of bytes is calculated as the number
-      of cache lines requested multiplied  by the cache line size. This value does
-      not consider partial requests, so  for instance, if only a single value is requested
-      in a cache line, the  data movement will still be counted as a full cache line.
+    rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
+      <desc-vmem>` instructions, as a percent of the peak theoretical bandwidth achievable
+      on the specific accelerator. The number of bytes is calculated as the number
+      of cache lines requested multiplied by the cache line size. This value does
+      not consider partial requests, so for instance, if only a single value is requested
+      in a cache line, the data movement will still be counted as a full cache line.
     unit: Percent
   Coalescing:
-    rst: Indicates how well memory instructions were coalesced by the  :ref:`address
-      processing unit <desc-ta>`, ranging from uncoalesced (25%)  to fully coalesced
-      (100%). Calculated as the average number of  :ref:`thread-requests <thread-requests>`
-      generated per instruction  divided by the ideal number of thread-requests per
+    rst: Indicates how well memory instructions were coalesced by the :ref:`address
+      processing unit <desc-ta>`, ranging from uncoalesced (25%) to fully coalesced
+      (100%). Calculated as the average number of :ref:`thread-requests <thread-requests>`
+      generated per instruction divided by the ideal number of thread-requests per
       instruction.
     unit: Percent
   Hit rate:
-    rst: The ratio of the number of vL1D cache line requests that hit [#vl1d-hit]_  in
-      vL1D cache over the total number of cache line requests to the  :ref:`vL1D Cache
-      RAM <desc-tc>`.
+    rst: The ratio of the number of vL1D cache line requests that hit [#vl1d-hit]_
+      in vL1D cache over the total number of cache line requests to the :ref:`vL1D
+      Cache RAM <desc-tc>`.
     unit: Percent
   Utilization:
-    rst: Indicates how busy the :ref:`vL1D Cache RAM <desc-tc>` was during the  kernel
-      execution. The number of cycles where the vL1D Cache RAM is  actively processing
-      any request divided by the number of cycles where the  vL1D is active [#vl1d-activity]_.
+    rst: Indicates how busy the :ref:`vL1D Cache RAM <desc-tc>` was during the kernel
+      execution. The number of cycles where the vL1D Cache RAM is actively processing
+      any request divided by the number of cycles where the vL1D is active [#vl1d-activity]_.
     unit: Percent
 Busy / stall metrics:
   Address Processing Unit Busy:
-    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address  processor
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor
       was busy
     unit: Percent
   Address Stall:
-    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address  processor
-      was stalled from sending address requests further into the vL1D  pipeline
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor
+      was stalled from sending address requests further into the vL1D pipeline
     unit: Percent
   Data Stall:
-    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address  processor
-      was stalled from sending write/atomic data further into the  vL1D pipeline
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor
+      was stalled from sending write/atomic data further into the vL1D pipeline
     unit: Percent
   "Data-Processor \u2192 Address Stall":
-    rst: Percent of :ref:`total CU cycles <total-cu-cycles>` the address processor  was
-      stalled waiting to send command data to the  :ref:`data processor <desc-td>`
+    rst: Percent of :ref:`total CU cycles <total-cu-cycles>` the address processor
+      was stalled waiting to send command data to the :ref:`data processor <desc-td>`
     unit: Percent
 Instruction counts:
   Global/Generic Atomic Instructions:
-    rst: The total number of global & generic memory atomic (with and without  return)
-      instructions executed on all :doc:`compute units <compute-unit>`  on the accelerator,
+    rst: The total number of global & generic memory atomic (with and without return)
+      instructions executed on all :doc:`compute units <compute-unit>` on the accelerator,
       per :ref:`normalization unit <normalization-units>`.
     unit: Instructions per normalization unit
   Global/Generic Instructions:
-    rst: The total number of global & generic memory instructions executed on all  :doc:`compute
-      units <compute-unit>` on the accelerator, per  :ref:`normalization unit <normalization-units>`.
+    rst: The total number of global & generic memory instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
     unit: Instructions per normalization unit
   Global/Generic Read Instructions:
-    rst: The total number of global & generic memory read instructions executed on  all
-      :doc:`compute units <compute-unit>` on the accelerator, per  :ref:`normalization
+    rst: The total number of global & generic memory read instructions executed on
+      all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
       unit <normalization-units>`.
     unit: Instructions per normalization unit
   Global/Generic Write Instructions:
-    rst: The total number of global & generic memory write instructions executed  on
-      all :doc:`compute units <compute-unit>` on the accelerator, per  :ref:`normalization
+    rst: The total number of global & generic memory write instructions executed on
+      all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
       unit <normalization-units>`.
     unit: Instructions per normalization unit
   Spill/Stack Atomic Instructions:
-    rst: The total number of spill/stack memory atomic (with and without return)  instructions
-      executed on all :doc:`compute units <compute-unit>` on the  accelerator, per
-      :ref:`normalization unit <normalization-units>`.  Typically unused as these
-      memory operations are typically used to  implement thread-local storage.
+    rst: The total number of spill/stack memory atomic (with and without return) instructions
+      executed on all :doc:`compute units <compute-unit>` on the accelerator, per
+      :ref:`normalization unit <normalization-units>`. Typically unused, as these memory
+      operations are generally used to implement thread-local storage.
     unit: Instructions per normalization unit
   Spill/Stack Instructions:
-    rst: The total number of spill/stack memory instructions executed on all  :doc:`compute
-      units <compute-unit>` on the accelerator, per  :ref:`normalization unit <normalization-units>`.
+    rst: The total number of spill/stack memory instructions executed on all :doc:`compute
+      units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
     unit: Instructions per normalization unit
   Spill/Stack Read Instructions:
-    rst: The total number of spill/stack memory read instructions executed on all  :doc:`compute
-      units <compute-unit>` on the accelerator, per  :ref:`normalization unit <normalization-units>`.
+    rst: The total number of spill/stack memory read instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
     unit: Instructions per normalization unit
   Spill/Stack Write Instructions:
-    rst: The total number of spill/stack memory write instructions executed on all  :doc:`compute
-      units <compute-unit>` on the accelerator, per  :ref:`normalization unit <normalization-units>`.
+    rst: The total number of spill/stack memory write instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
     unit: Instructions per normalization unit
   Total Instructions:
     rst: The total number of memory instructions executed by the address processer
@@ -545,16 +565,16 @@ Instruction counts:
     unit: Instructions per normalization unit
 Spill / stack metrics:
   Spill/Stack Coalesced Read:
-    rst: The number of cycles the address processing unit spent working on  coalesced
-      spill/stack read instructions, per  :ref:`normalization unit <normalization-units>`.
+    rst: The number of cycles the address processing unit spent working on coalesced
+      spill/stack read instructions, per :ref:`normalization unit <normalization-units>`.
     unit: Cycles per normalization unit
   Spill/Stack Coalesced Write:
-    rst: The number of cycles the address processing unit spent working on  coalesced
-      spill/stack write instructions, per  :ref:`normalization unit <normalization-units>`.
+    rst: The number of cycles the address processing unit spent working on coalesced
+      spill/stack write instructions, per :ref:`normalization unit <normalization-units>`.
     unit: Cycles per normalization unit
   Spill/Stack Total Cycles:
-    rst: The number of cycles the address processing unit spent working on  spill/stack
-      instructions, per  :ref:`normalization unit <normalization-units>`.
+    rst: The number of cycles the address processing unit spent working on spill/stack
+      instructions, per :ref:`normalization unit <normalization-units>`.
     unit: Cycles per normalization unit
 L1 Unified Translation Cache (UTCL1):
   Hit Ratio:
@@ -566,48 +586,49 @@ L1 Unified Translation Cache (UTCL1):
       per normalization unit.
     unit: Requests per normalization unit
   Permission Misses:
-    rst: "The total number of translation requests that missed in the UTCL1 due to\
-      \  a permission error, per :ref:`normalization unit <normalization-units>`.\
-      \  This is unused and expected to be zero in most configurations for modern\
-      \  CDNA\u2122 accelerators."
+    rst: |-
+      The total number of translation requests that missed in the UTCL1 due
+      to a permission error, per :ref:`normalization unit <normalization-units>`.
+      This is unused and expected to be zero in most configurations for modern
+      CDNA™ accelerators.
     unit: Requests per normalization unit
   Req:
     rst: The number of translation requests made to the UTCL1 per normalization unit.
     unit: Requests per normalization unit
   Translation Misses:
-    rst: The total number of translation requests that missed in the UTCL1 due to  translation
-      not being present in the cache, per  :ref:`normalization unit <normalization-units>`.
+    rst: The total number of translation requests that missed in the UTCL1 due to
+      translation not being present in the cache, per :ref:`normalization unit <normalization-units>`.
     unit: unit
 vL1D cache stall metrics:
   Stalled on L2 Data:
-    rst: The ratio of the number of cycles where the vL1D is stalled waiting for  requested
-      data to return from the :doc:`L2 cache <l2-cache>` divided by  the number of
+    rst: The ratio of the number of cycles where the vL1D is stalled waiting for requested
+      data to return from the :doc:`L2 cache <l2-cache>` divided by the number of
       cycles where the vL1D is active [#vl1d-activity]_.
     unit: Percent
   Stalled on L2 Req:
-    rst: The ratio of the number of cycles where the vL1D is stalled waiting to  issue
-      a request for data to the :doc:`L2 cache <l2-cache>` divided by the  number
-      of cycles where the vL1D is active [#vl1d-activity]_.
+    rst: The ratio of the number of cycles where the vL1D is stalled waiting to issue
+      a request for data to the :doc:`L2 cache <l2-cache>` divided by the number of
+      cycles where the vL1D is active [#vl1d-activity]_.
     unit: Percent
   Tag RAM Stall (Atomic):
     rst: The ratio of the number of cycles where the vL1D is stalled due to Atomic
-      requests with conflicting tags being looked up  concurrently, divided by the
-      number of cycles where the  vL1D is active [#vl1d-activity]_.
+      requests with conflicting tags being looked up concurrently, divided by the
+      number of cycles where the vL1D is active [#vl1d-activity]_.
     unit: Percent
   Tag RAM Stall (Read):
     rst: The ratio of the number of cycles where the vL1D is stalled due to Read requests
-      with conflicting tags being looked up  concurrently, divided by the number of
-      cycles where the  vL1D is active [#vl1d-activity]_.
+      with conflicting tags being looked up concurrently, divided by the number of
+      cycles where the vL1D is active [#vl1d-activity]_.
     unit: Percent
   Tag RAM Stall (Write):
     rst: The ratio of the number of cycles where the vL1D is stalled due to Write
-      requests with conflicting tags being looked up  concurrently, divided by the
-      number of cycles where the  vL1D is active [#vl1d-activity]_.
+      requests with conflicting tags being looked up concurrently, divided by the
+      number of cycles where the vL1D is active [#vl1d-activity]_.
     unit: Percent
 vL1D cache access metrics:
   Atomic Req:
-    rst: The total number of incoming atomic requests from the  :ref:`address processing
-      unit <desc-ta>` after coalescing per  :ref:`normalization unit <normalization-units>`
+    rst: The total number of incoming atomic requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`
     unit: Requests per normalization unit
   Cache Accesses:
     rst: The total number of cache line lookups in the vL1D.
@@ -616,103 +637,103 @@ vL1D cache access metrics:
     rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
       <desc-vmem>` instructions divided by total duration. The number of bytes is
       calculated as the number of cache lines requested multiplied by the cache line
-      size. This value does not consider partial requests, so for  instance, if only
+      size. This value does not consider partial requests, so for instance, if only
       a single value is requested in a cache line, the data movement will still be
       counted as a full cache line.
     unit: Gbps
   Cache Hit Rate:
-    rst: The ratio of the number of vL1D cache line requests that hit in vL1D  cache
-      over the total number of cache line requests to the  :ref:`vL1D Cache RAM <desc-tc>`.
+    rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache
+      over the total number of cache line requests to the :ref:`vL1D Cache RAM <desc-tc>`.
     unit: Percent
   Cache Hits:
-    rst: The number of cache accesses minus the number of outgoing requests to the  :doc:`L2
-      cache <l2-cache>`, that is, the number of cache line requests  serviced by the
-      :ref:`vL1D Cache RAM <desc-tc>` per  :ref:`normalization unit <normalization-units>`.
+    rst: The number of cache accesses minus the number of outgoing requests to the
+      :doc:`L2 cache <l2-cache>`, that is, the number of cache line requests serviced
+      by the :ref:`vL1D Cache RAM <desc-tc>` per :ref:`normalization unit <normalization-units>`.
     unit: Cache lines per normalization unit
   Invalidations:
-    rst: The number of times the vL1D was issued a write-back invalidate command  during
-      the kernel's execution per  :ref:`normalization unit <normalization-units>`.  This
-      may be triggered  by, for instance, the ``buffer_wbinvl1`` instruction.
+    rst: The number of times the vL1D was issued a write-back invalidate command during
+      the kernel's execution per :ref:`normalization unit <normalization-units>`.
+      This may be triggered by, for instance, the ``buffer_wbinvl1`` instruction.
     unit: Invalidations per normalization unit
   L1 Access Latency:
     rst: Calculated as the average number of cycles that a vL1D cache line request
       spent in the vL1D cache pipeline.
     unit: Cycles
   L1-L2 Atomic:
-    rst: The number of atomic requests that are sent through the vL1D to the  :doc:`L2
-      cache <l2-cache>`, per  :ref:`normalization unit <normalization-units>`. This
-      includes requests  for atomics with, and without return.
+    rst: The number of atomic requests that are sent through the vL1D to the :doc:`L2
+      cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`. This
+      includes requests for atomics with and without return.
     unit: Requests per normalization unit
   L1-L2 BW:
-    rst: The number of bytes transferred across the vL1D-L2 interface as a result  of
-      :ref:`VMEM <desc-vmem>` instructions, divided by total duration. The number
-      of bytes is  calculated as the number of cache lines requested multiplied by
-      the cache  line size. This value does not consider partial requests, so for  instance,
-      if only a single value is requested in a cache line, the data  movement will
+    rst: The number of bytes transferred across the vL1D-L2 interface as a result
+      of :ref:`VMEM <desc-vmem>` instructions, divided by total duration. The number
+      of bytes is calculated as the number of cache lines requested multiplied by
+      the cache line size. This value does not consider partial requests, so for instance,
+      if only a single value is requested in a cache line, the data movement will
       still be counted as a full cache line.
     unit: Gbps
   L1-L2 Read:
-    rst: The number of read requests for a vL1D cache line that were not satisfied  by
-      the vL1D and must be retrieved from the to the  :doc:`L2 Cache <l2-cache>` per  :ref:`normalization
-      unit <normalization-units>`.
+    rst: The number of read requests for a vL1D cache line that were not satisfied
+      by the vL1D and must be retrieved from the :doc:`L2 Cache <l2-cache>`
+      per :ref:`normalization unit <normalization-units>`.
     unit: Requests per normalization unit
   L1-L2 Read Latency:
-    rst: Calculated as the average number of cycles that the vL1D cache took to  issue
-      and receive read requests from the :doc:`L2 Cache <l2-cache>`. This  number
-      also includes requests for atomics with return values.
+    rst: Calculated as the average number of cycles that the vL1D cache took to issue
+      and receive read requests from the :doc:`L2 Cache <l2-cache>`. This number also
+      includes requests for atomics with return values.
     unit: Cycles
   L1-L2 Write:
-    rst: The number of write requests to a vL1D cache line that were sent through  the
-      vL1D to the :doc:`L2 cache <l2-cache>`, per  :ref:`normalization unit <normalization-units>`.
+    rst: The number of write requests to a vL1D cache line that were sent through
+      the vL1D to the :doc:`L2 cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`.
     unit: Requests per normalization unit
   L1-L2 Write Latency:
-    rst: Calculated as the average number of cycles that the vL1D cache took to  issue
-      and receive acknowledgement of a write request to the  :doc:`L2 Cache <l2-cache>`.
-      This number also includes requests for  atomics without return values.
+    rst: Calculated as the average number of cycles that the vL1D cache took to issue
+      and receive acknowledgement of a write request to the :doc:`L2 Cache <l2-cache>`.
+      This number also includes requests for atomics without return values.
     unit: Cycles
   Read Req:
-    rst: The total number of incoming read requests from the  :ref:`address processing
-      unit <desc-ta>` after coalescing per  :ref:`normalization unit <normalization-units>`
+    rst: The total number of incoming read requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`
     unit: Requests per normalization unit
   Total Req:
-    rst: The total number of incoming requests from the  :ref:`address processing
-      unit <desc-ta>` after coalescing.
+    rst: The total number of incoming requests from the :ref:`address processing unit
+      <desc-ta>` after coalescing.
     unit: Requests
   Write Req:
-    rst: The total number of incoming write requests from the  :ref:`address processing
-      unit <desc-ta>` after coalescing per  :ref:`normalization unit <normalization-units>`
+    rst: The total number of incoming write requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`
     unit: Requests per normalization unit
 Vector L1 data-return path or Texture Data (TD):
   Atomic Instructions:
-    rst: The number of atomic instructions submitted to the  :ref:`data-return unit
-      <desc-td>` by the  :ref:`address processor <desc-ta>` summed over all  :doc:`compute
-      units <compute-unit>` on the accelerator, per  :ref:`normalization unit <normalization-units>`.
-      This is expected to be  the sum of global/generic and spill/stack atomics in
-      the  :ref:`address processor <desc-ta>`.
+    rst: The number of atomic instructions submitted to the :ref:`data-return unit
+      <desc-td>` by the :ref:`address processor <desc-ta>` summed over all :doc:`compute
+      units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+      This is expected to be the sum of global/generic and spill/stack atomics in
+      the :ref:`address processor <desc-ta>`.
     unit: Instructions per normalization unit
   "Cache RAM \u2192 Data-Return Stall":
-    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return  unit
-      was stalled on data to be returned from the  :ref:`vL1D Cache RAM <desc-tc>`.
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return unit
+      was stalled on data to be returned from the :ref:`vL1D Cache RAM <desc-tc>`.
     unit: Percent
   Coalescable Instructions:
-    rst: The number of instructions submitted to the  :ref:`data-return unit <desc-td>`
-      by the  :ref:`address processor <desc-ta>` that were found to be coalescable,
-      per  :ref:`normalization unit <normalization-units>`.
+    rst: The number of instructions submitted to the :ref:`data-return unit <desc-td>`
+      by the :ref:`address processor <desc-ta>` that were found to be coalescable,
+      per :ref:`normalization unit <normalization-units>`.
     unit: Instructions per normalization unit
   Data-Return Busy:
-    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return  unit
-      was busy processing or waiting on data to return to the  :doc:`CU <compute-unit>`.
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return unit
+      was busy processing or waiting on data to return to the :doc:`CU <compute-unit>`.
     unit: Percent
   Read Instructions:
-    rst: The number of read instructions submitted to the  :ref:`data-return unit
-      <desc-td>` by the  :ref:`address processor <desc-ta>` summed over all  :doc:`compute
-      units <compute-unit>` on the accelerator, per  :ref:`normalization unit <normalization-units>`.
-      This is expected to be  the sum of global/generic and spill/stack reads in the  :ref:`address
-      processor <desc-ta>`.
+    rst: The number of read instructions submitted to the :ref:`data-return unit <desc-td>`
+      by the :ref:`address processor <desc-ta>` summed over all :doc:`compute units
+      <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+      This is expected to be the sum of global/generic and spill/stack reads in the
+      :ref:`address processor <desc-ta>`.
     unit: Instructions per normalization unit
   "Workgroup manager \u2192 Data-Return Stall":
-    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return  unit
-      was stalled by the :ref:`workgroup manager <desc-spi>` due to  initialization
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return unit
+      was stalled by the :ref:`workgroup manager <desc-spi>` due to initialization
       of registers as a part of launching new workgroups.
     unit: Percent
   Write Ack Instructions:
@@ -721,11 +742,11 @@ Vector L1 data-return path or Texture Data (TD):
       normalization unit.
     unit: Instructions per normalization unit
   Write Instructions:
-    rst: The number of store instructions submitted to the  :ref:`data-return unit
-      <desc-td>` by the  :ref:`address processor <desc-ta>` summed over all  :doc:`compute
-      units <compute-unit>` on the accelerator, per  :ref:`normalization unit <normalization-units>`.
-      This is expected to be  the sum of global/generic and spill/stack stores counted
-      by the  :ref:`vL1D cache-front-end <ta-instruction-counts>`.
+    rst: The number of store instructions submitted to the :ref:`data-return unit
+      <desc-td>` by the :ref:`address processor <desc-ta>` summed over all :doc:`compute
+      units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+      This is expected to be the sum of global/generic and spill/stack stores counted
+      by the :ref:`vL1D cache-front-end <ta-instruction-counts>`.
     unit: Instructions per normalization unit
 L2 Speed-of-Light:
   HBM Bandwidth:
@@ -734,29 +755,29 @@ L2 Speed-of-Light:
       multiplied by the HBM channel width multiplied by the HBM clock frequency.
     unit: GB/s
   Hit Rate:
-    rst: The ratio of the number of L2 cache line requests that hit in the L2  cache
+    rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
       over the total number of incoming cache line requests to the L2 cache.
     unit: Percent
   L2-Fabric Read BW:
-    rst: The number of bytes read by the L2 over the  :ref:`Infinity Fabric interface
+    rst: The number of bytes read by the L2 over the :ref:`Infinity Fabric interface
       <l2-fabric>` per unit time.
     unit: GB/s
   L2-Fabric Write and Atomic BW:
-    rst: The number of bytes sent by the L2 over the  :ref:`Infinity Fabric interface
-      <l2-fabric>` by write and atomic  operations per unit time.
+    rst: The number of bytes sent by the L2 over the :ref:`Infinity Fabric interface
+      <l2-fabric>` by write and atomic operations per unit time.
     unit: GB/s
   Peak Bandwidth:
-    rst: The number of bytes looked up in the L2 cache, as a percent of the peak  theoretical
-      bandwidth achievable on the specific accelerator. The number  of bytes is calculated
-      as the number of cache lines requested multiplied  by the cache line size. This
-      value does not consider partial requests, so  e.g., if only a single value is
-      requested in a cache line, the data  movement will still be counted as a full
+    rst: The number of bytes looked up in the L2 cache, as a percent of the peak theoretical
+      bandwidth achievable on the specific accelerator. The number of bytes is calculated
+      as the number of cache lines requested multiplied by the cache line size. This
+      value does not consider partial requests, so e.g., if only a single value is
+      requested in a cache line, the data movement will still be counted as a full
       cache line.
     unit: Percent
   Utilization:
-    rst: The ratio of the  :ref:`number of cycles an L2 channel was active, summed
-      over all L2 channels on the accelerator <total-active-l2-cycles>`  over the
-      :ref:`total L2 cycles <total-l2-cycles>`.
+    rst: The ratio of the :ref:`number of cycles an L2 channel was active, summed
+      over all L2 channels on the accelerator <total-active-l2-cycles>` over the :ref:`total
+      L2 cycles <total-l2-cycles>`.
     unit: Percent
 L2 cache accesses:
   Atomic Bandwidth:
@@ -769,74 +790,74 @@ L2 cache accesses:
     unit: Requests per normalization unit
   Bandwidth:
     rst: The number of bytes looked up in the L2 cache, divided by total duration.
-      The number of bytes is  calculated as the number of cache lines requested multiplied
+      The number of bytes is calculated as the number of cache lines requested multiplied
       by the cache line size. This value does not consider partial requests, so for
-      example, if only a single value is requested in a cache line, the data movement  will
-      still be counted as a full cache line.
+      example, if only a single value is requested in a cache line, the data movement
+      will still be counted as a full cache line.
     unit: Gbps
   CC Req:
-    rst: The total number of requests to the L2 that go to Coherently Cacheable (CC)  memory
-      allocations. See the :ref:`memory-type` for more information.
+    rst: The total number of requests to the L2 that go to Coherently Cacheable (CC)
+      memory allocations. See the :ref:`memory-type` for more information.
     unit: Requests per normalization unit
   Cache Hit:
-    rst: The ratio of the number of L2 cache line requests that hit in the L2  cache
-      over the total number of incoming cache line requests to the L2  cache.
+    rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
+      over the total number of incoming cache line requests to the L2 cache.
     unit: Percent
   Evict (Internal):
-    rst: The total number of L2 cache lines evicted from the cache due to capacity  limits,
-      per :ref:`normalization unit <normalization-units>`.
+    rst: The total number of L2 cache lines evicted from the cache due to capacity
+      limits, per :ref:`normalization unit <normalization-units>`.
     unit: Cache lines per normalization unit
   Evict (vL1D Req):
-    rst: The total number of L2 cache lines evicted from the cache due to  invalidation
-      requests initiated by the  :doc:`vL1D cache <vector-l1-cache>`, per  :ref:`normalization
+    rst: The total number of L2 cache lines evicted from the cache due to invalidation
+      requests initiated by the :doc:`vL1D cache <vector-l1-cache>`, per :ref:`normalization
       unit <normalization-units>`.
     unit: Cache lines per normalization unit
   Hits:
-    rst: The total number of requests to the L2 from all clients that hit in the  cache.
-      As noted in the :ref:`Speed-of-Light <l2-sol>` section, this  includes hit-on-miss
+    rst: The total number of requests to the L2 from all clients that hit in the cache.
+      As noted in the :ref:`Speed-of-Light <l2-sol>` section, this includes hit-on-miss
       requests.
     unit: Requests per normalization unit
   Misses:
-    rst: The total number of requests to the L2 from all clients that miss in the  cache.
-      As noted in the :ref:`Speed-of-Light <l2-sol>` section, these do  not include
-      hit-on-miss requests.
+    rst: The total number of requests to the L2 from all clients that miss in the
+      cache. As noted in the :ref:`Speed-of-Light <l2-sol>` section, these do not
+      include hit-on-miss requests.
     unit: Requests per normalization unit
   NC Req:
-    rst: The total number of requests to the L2 to Not-hardware-Coherent (NC)  memory
-      allocations, per :ref:`normalization unit <normalization-units>`.  See the :ref:`memory-type`
+    rst: The total number of requests to the L2 to Not-hardware-Coherent (NC) memory
+      allocations, per :ref:`normalization unit <normalization-units>`. See the :ref:`memory-type`
       for more information.
     unit: Requests per normalization unit
   Probe Req:
-    rst: The number of coherence probe requests made to the L2 cache from outside  the
-      accelerator. On an :ref:`MI2XX <mixxx-note>`, probe requests may be  generated
-      by, for example, writes to  :ref:`fine-grained device <memory-type>` memory
-      or by writes to  :ref:`coarse-grained <memory-type>` device memory.
+    rst: The number of coherence probe requests made to the L2 cache from outside
+      the accelerator. On an :ref:`MI2XX <mixxx-note>`, probe requests may be generated
+      by, for example, writes to :ref:`fine-grained device <memory-type>` memory or
+      by writes to :ref:`coarse-grained <memory-type>` device memory.
     unit: Requests per normalization unit
   RW Req:
-    rst: The total number of requests to the L2 that go to Read-Write coherent memory  (RW)
-      allocations. See the :ref:`memory-type` for more information.
+    rst: The total number of requests to the L2 that go to Read-Write coherent memory
+      (RW) allocations. See the :ref:`memory-type` for more information.
     unit: Requests per normalization unit
   Read Bandwidth:
     rst: Total number of bytes looked up in the L2 cache for read requests, divided
       by total duration.
     unit: Gbps
   Read Req:
-    rst: 'The total number of read requests to the L2 from all clients.  '
+    rst: The total number of read requests to the L2 from all clients.
     unit: Requests per normalization unit
   Req:
-    rst: The total number of incoming requests to the L2 from all clients for all  request
-      types, per :ref:`normalization unit <normalization-units>`.
+    rst: The total number of incoming requests to the L2 from all clients for all
+      request types, per :ref:`normalization unit <normalization-units>`.
     unit: Requests per normalization unit
   Streaming Req:
-    rst: The total number of incoming requests to the L2 that are marked as  *streaming*.
-      The exact meaning of this may differ depending on the  targeted accelerator,
-      however on an :ref:`MI2XX <mixxx-note>` this  corresponds to  `non-temporal
-      load or stores <https://clang.llvm.org/docs/LanguageExtensions.html#non-temporal-load-store-builtins>`_.  The
-      L2 cache attempts to evict *streaming* requests before normal  requests when
+    rst: The total number of incoming requests to the L2 that are marked as *streaming*.
+      The exact meaning of this may differ depending on the targeted accelerator,
+      however on an :ref:`MI2XX <mixxx-note>` this corresponds to `non-temporal load
+      or stores <https://clang.llvm.org/docs/LanguageExtensions.html#non-temporal-load-store-builtins>`_.
+      The L2 cache attempts to evict *streaming* requests before normal requests when
       the L2 is at capacity.
     unit: Requests per normalization unit
   UC Req:
-    rst: The total number of requests to the L2 that go to Uncached (UC) memory  allocations.
+    rst: The total number of requests to the L2 that go to Uncached (UC) memory allocations.
       See the :ref:`memory-type` for more information.
     unit: Requests per normalization unit
   Write Bandwidth:
@@ -847,18 +868,19 @@ L2 cache accesses:
     rst: The total number of write requests to the L2 from all clients.
     unit: Requests per normalization unit
   Writeback:
-    rst: The total number of L2 cache lines written back to memory for any reason.  Write-backs
-      may occur due to user code (such as HIP kernel calls to  ``__threadfence_system``
-      or atomic built-ins) by the  :doc:`command processor <command-processor>`'s
-      memory acquire/release  fences, or for other internal hardware reasons.
+    rst: The total number of L2 cache lines written back to memory for any reason.
+      Write-backs may occur due to user code (such as HIP kernel calls to ``__threadfence_system``
+      or atomic built-ins), by the :doc:`command processor <command-processor>`'s memory
+      acquire/release fences, or for other internal hardware reasons.
     unit: Cache lines per normalization unit
   Writeback (Internal):
-    rst: The total number of L2 cache lines written back to memory for internal  hardware
+    rst: The total number of L2 cache lines written back to memory for internal hardware
       reasons, per :ref:`normalization unit <normalization-units>`.
     unit: Cache lines per normalization unit
   Writeback (vL1D Req):
-    rst: The total number of L2 cache lines written back to memory due to requests  initiated
-      by the :doc:`vL1D cache <vector-l1-cache>`, per  :ref:`normalization unit <normalization-units>`.
+    rst: The total number of L2 cache lines written back to memory due to requests
+      initiated by the :doc:`vL1D cache <vector-l1-cache>`, per :ref:`normalization
+      unit <normalization-units>`.
     unit: Cache lines per normalization unit
 L2-Fabric interface metrics:
   Atomic Latency:
@@ -867,83 +889,83 @@ L2-Fabric interface metrics:
       with return value) was returned to the L2.
     unit: Cycles
   Atomic Traffic:
-    rst: The percent of write requests generated by the L2 cache that are atomic  requests
-      to *any* memory location. This breakdown does not consider the  *size* of the
-      request (meaning that 32B and 64B requests are both counted  as a single request),
-      so this metric only *approximates* the percent of  the L2-Fabric Read bandwidth
-      directed to a remote location. Note that on  current CDNA accelerators, such
-      as the :ref:`MI2XX <mixxx-note>`,  requests are only considered *atomic* by
-      Infinity Fabric if they are  targeted at :ref:`fine-grained memory <memory-type>`
-      allocations or  :ref:`uncached memory <memory-type>` allocations.
+    rst: The percent of write requests generated by the L2 cache that are atomic requests
+      to *any* memory location. This breakdown does not consider the *size* of the
+      request (meaning that 32B and 64B requests are both counted as a single request),
+      so this metric only *approximates* the percent of the L2-Fabric Write and Atomic
+      bandwidth due to atomic requests. Note that on current CDNA accelerators, such
+      as the :ref:`MI2XX <mixxx-note>`, requests are only considered *atomic* by Infinity
+      Fabric if they are targeted at :ref:`fine-grained memory <memory-type>` allocations
+      or :ref:`uncached memory <memory-type>` allocations.
     unit: Percent
   HBM Read Traffic:
-    rst: The percent of read requests generated by the L2 cache that are routed to  the
-      accelerator's local high-bandwidth memory (HBM). This breakdown does  not consider
-      the *size* of the request (meaning that 32B and 64B requests  are both counted
-      as a single request), so this metric only *approximates*  the percent of the
-      L2-Fabric Read bandwidth directed to the local HBM.
+    rst: The percent of read requests generated by the L2 cache that are routed to
+      the accelerator's local high-bandwidth memory (HBM). This breakdown does not
+      consider the *size* of the request (meaning that 32B and 64B requests are both
+      counted as a single request), so this metric only *approximates* the percent
+      of the L2-Fabric Read bandwidth directed to the local HBM.
     unit: Percent
   HBM Write and Atomic Traffic:
-    rst: The percent of write and atomic requests generated by the L2 cache that  are
-      routed to the accelerator's local high-bandwidth memory (HBM). This  breakdown
-      does not consider the *size* of the request (meaning that 32B  and 64B requests
-      are both counted as a single request), so this metric  only *approximates* the
-      percent of the L2-Fabric Write and Atomic  bandwidth directed to the local HBM.
-      Note that on current CDNA  accelerators, such as the :ref:`MI2XX <mixxx-note>`,
-      requests are only  considered *atomic* by Infinity Fabric if they are targeted
-      at  :ref:`fine-grained memory <memory-type>` allocations or  :ref:`uncached
-      memory <memory-type>` allocations.
+    rst: The percent of write and atomic requests generated by the L2 cache that are
+      routed to the accelerator's local high-bandwidth memory (HBM). This breakdown
+      does not consider the *size* of the request (meaning that 32B and 64B requests
+      are both counted as a single request), so this metric only *approximates* the
+      percent of the L2-Fabric Write and Atomic bandwidth directed to the local HBM.
+      Note that on current CDNA accelerators, such as the :ref:`MI2XX <mixxx-note>`,
+      requests are only considered *atomic* by Infinity Fabric if they are targeted
+      at :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached memory
+      <memory-type>` allocations.
     unit: Percent
   Read BW:
     rst: The total number of bytes read by the L2 cache from Infinity Fabric divided
       by total duration.
     unit: Gbps
   Read Latency:
-    rst: The time-averaged number of cycles read requests spent in Infinity Fabric  before
-      data was returned to the L2.
+    rst: The time-averaged number of cycles read requests spent in Infinity Fabric
+      before data was returned to the L2.
     unit: Cycles
   Read Stall:
-    rst: "The ratio of the total number of cycles the L2-Fabric interface was stalled\
-      \ on a read request to any destination (local HBM, remote PCIe\xAE connected\
-      \ accelerator or CPU, or remote Infinity Fabric connected accelerator [#inf]_\
-      \ or CPU) over the :ref:`total active L2 cycles <total-active-l2-cycles>`."
+    rst: |-
+      The ratio of the total number of cycles the L2-Fabric interface was stalled
+      on a read request to any destination (local HBM, remote PCIe® connected
+      accelerator or CPU, or remote Infinity Fabric connected accelerator [#inf]_
+      or CPU) over the :ref:`total active L2 cycles <total-active-l2-cycles>`.
     unit: Percent
   Remote Read Traffic:
-    rst: The percent of read requests generated by the L2 cache that are routed to  any
-      memory location other than the accelerator's local high-bandwidth  memory (HBM)
-      -- for example, the CPU's DRAM or a remote accelerator's  HBM. This breakdown
-      does not consider the *size* of the request (meaning  that 32B and 64B requests
-      are both counted as a single request), so this  metric only *approximates* the
-      percent of the L2-Fabric Read bandwidth  directed to a remote location.
+    rst: The percent of read requests generated by the L2 cache that are routed to
+      any memory location other than the accelerator's local high-bandwidth memory
+      (HBM) -- for example, the CPU's DRAM or a remote accelerator's HBM. This breakdown
+      does not consider the *size* of the request (meaning that 32B and 64B requests
+      are both counted as a single request), so this metric only *approximates* the
+      percent of the L2-Fabric Read bandwidth directed to a remote location.
     unit: Percent
   Remote Write and Atomic Traffic:
-    rst: The percent of read requests generated by the L2 cache that are routed to  any
-      memory location other than the accelerator's local high-bandwidth  memory (HBM)
-      -- for example, the CPU's DRAM or a remote accelerator's  HBM. This breakdown
-      does not consider the *size* of the request (meaning  that 32B and 64B requests
-      are both counted as a single request), so this  metric only *approximates* the
-      percent of the L2-Fabric Read bandwidth  directed to a remote location. Note
-      that on current CDNA  accelerators, such as the :ref:`MI2XX <mixxx-note>`, requests
-      are only  considered *atomic* by Infinity Fabric if they are targeted at  :ref:`fine-grained
-      memory <memory-type>` allocations or  :ref:`uncached memory <memory-type>` allocations.
+    rst: The percent of write and atomic requests generated by the L2 cache that are routed to
+      any memory location other than the accelerator's local high-bandwidth memory
+      (HBM) -- for example, the CPU's DRAM or a remote accelerator's HBM. This breakdown
+      does not consider the *size* of the request (meaning that 32B and 64B requests
+      are both counted as a single request), so this metric only *approximates* the
+      percent of the L2-Fabric Write and Atomic bandwidth directed to a remote location. Note
+      that on current CDNA accelerators, such as the :ref:`MI2XX <mixxx-note>`, requests
+      are only considered *atomic* by Infinity Fabric if they are targeted at :ref:`fine-grained
+      memory <memory-type>` allocations or :ref:`uncached memory <memory-type>` allocations.
     unit: Percent
   Uncached Read Traffic:
-    rst: The percent of read requests generated by the L2 cache that are reading  from
-      an :ref:`uncached memory allocation <memory-type>`. Note, as  described in the
-      :ref:`request flow <l2-request-flow>` section, a single  64B read request is
-      typically counted as two uncached read requests. So,  it is possible for the
-      Uncached Read Traffic to reach up to 200% of the  total number of read requests.
-      This breakdown does not consider the  *size* of the request (i.e., 32B and 64B
-      requests are both counted as a  single request), so this metric only *approximates*
-      the percent of the  L2-Fabric read bandwidth directed to an uncached memory
-      location.
+    rst: The percent of read requests generated by the L2 cache that are reading from
+      an :ref:`uncached memory allocation <memory-type>`. Note, as described in the
+      :ref:`request flow <l2-request-flow>` section, a single 64B read request is
+      typically counted as two uncached read requests. So, it is possible for the
+      Uncached Read Traffic to reach up to 200% of the total number of read requests.
+      This breakdown does not consider the *size* of the request (i.e., 32B and 64B
+      requests are both counted as a single request), so this metric only *approximates*
+      the percent of the L2-Fabric read bandwidth directed to an uncached memory location.
     unit: Percent
   Uncached Write and Atomic Traffic:
-    rst: The percent of write and atomic requests generated by the L2 cache that  are
-      targeting :ref:`uncached memory allocations <memory-type>`. This  breakdown
-      does not consider the *size* of the request (meaning that 32B  and 64B requests
-      are both counted as a single request), so this metric  only *approximates* the
-      percent of the L2-Fabric read bandwidth directed  to uncached memory allocations.
+    rst: The percent of write and atomic requests generated by the L2 cache that are
+      targeting :ref:`uncached memory allocations <memory-type>`. This breakdown does
+      not consider the *size* of the request (meaning that 32B and 64B requests are
+      both counted as a single request), so this metric only *approximates* the percent
+      of the L2-Fabric Write and Atomic bandwidth directed to uncached memory allocations.
     unit: Percent
   Write Stall:
     rst: The ratio of the total number of cycles the L2-Fabric interface was stalled
@@ -952,12 +974,12 @@ L2-Fabric interface metrics:
       accelerator [#inf]_ or CPU) over the :ref:`total active L2 cycles <total-active-l2-cycles>`.
     unit: Percent
   Write and Atomic BW:
-    rst: The total number of bytes written by the L2 over Infinity Fabric by write  and
-      atomic operations divided by total duration. Note that on current  CDNA accelerators,
-      such as the :ref:`MI2XX <mixxx-note>`, requests are  only considered *atomic*
-      by Infinity Fabric if they are targeted at  non-write-cacheable memory, for
-      example,  :ref:`fine-grained memory <memory-type>` allocations or  :ref:`uncached
-      memory <memory-type>` allocations on the  MI2XX.
+    rst: The total number of bytes written by the L2 over Infinity Fabric by write
+      and atomic operations divided by total duration. Note that on current CDNA accelerators,
+      such as the :ref:`MI2XX <mixxx-note>`, requests are only considered *atomic*
+      by Infinity Fabric if they are targeted at non-write-cacheable memory, for example,
+      :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached memory
+      <memory-type>` allocations on the MI2XX.
     unit: Gbps
   Write and Atomic Latency:
     rst: The time-averaged number of cycles write requests spent in Infinity Fabric
@@ -965,13 +987,13 @@ L2-Fabric interface metrics:
     unit: Cycles
 L2 - Fabric interface detailed metrics:
   Atomic:
-    rst: The total number of L2 requests to Infinity Fabric to atomically update  32B
-      or 64B of data in any memory location, per  :ref:`normalization unit <normalization-units>`.
-      See  :ref:`l2-request-flow` for more detail. Note that on current CDNA  accelerators,
-      such as the :ref:`MI2XX <mixxx-note>`, requests are only  considered *atomic*
-      by Infinity Fabric if they are targeted at  non-write-cacheable memory, such
-      as  :ref:`fine-grained memory <memory-type>` allocations or  :ref:`uncached
-      memory <memory-type>` allocations on the MI2XX.
+    rst: The total number of L2 requests to Infinity Fabric to atomically update 32B
+      or 64B of data in any memory location, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail. Note that on current CDNA accelerators,
+      such as the :ref:`MI2XX <mixxx-note>`, requests are only considered *atomic*
+      by Infinity Fabric if they are targeted at non-write-cacheable memory, such
+      as :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached memory
+      <memory-type>` allocations on the MI2XX.
     unit: Requests per normalization unit
   Atomic Bandwidth - HBM:
     rst: Total number of bytes due to L2 atomic requests due to HBM traffic, divided
@@ -986,30 +1008,30 @@ L2 - Fabric interface detailed metrics:
       by total duration.
     unit: Gbps
   HBM Read:
-    rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of  data
-      from the accelerator's local HBM, per  :ref:`normalization unit <normalization-units>`.
-      See  :ref:`l2-request-flow` for more detail.
+    rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of
+      data from the accelerator's local HBM, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail.
     unit: Requests per normalization unit
   HBM Write and Atomic:
-    rst: The total number of L2 requests to Infinity Fabric to write or atomically  update
-      32B or 64B of data in the accelerator's local HBM, per  :ref:`normalization
-      unit <normalization-units>`. See  :ref:`l2-request-flow` for more detail.  plain
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically
+      update 32B or 64B of data in the accelerator's local HBM, per :ref:`normalization
+      unit <normalization-units>`. See :ref:`l2-request-flow` for more detail.
     unit: Requests per normalization unit
   Read (32B):
-    rst: The total number of L2 requests to Infinity Fabric to read 32B of data  from
-      any memory location, per  :ref:`normalization unit <normalization-units>`. See  :ref:`l2-request-flow`
-      for more detail. Typically unused on CDNA  accelerators.
+    rst: The total number of L2 requests to Infinity Fabric to read 32B of data from
+      any memory location, per :ref:`normalization unit <normalization-units>`. See
+      :ref:`l2-request-flow` for more detail. Typically unused on CDNA accelerators.
     unit: Requests per normalization unit
   Read (64B):
-    rst: The total number of L2 requests to Infinity Fabric to read 64B of data  from
-      any memory location, per  :ref:`normalization unit <normalization-units>`. See  :ref:`l2-request-flow`
-      for more detail.
+    rst: The total number of L2 requests to Infinity Fabric to read 64B of data from
+      any memory location, per :ref:`normalization unit <normalization-units>`. See
+      :ref:`l2-request-flow` for more detail.
     unit: Requests per normalization unit
   Read (Uncached):
-    rst: The total number of L2 requests to Infinity Fabric to read  :ref:`uncached
-      data <memory-type>` from any memory location, per  :ref:`normalization unit
-      <normalization-units>`. 64B requests for  uncached data are counted as two 32B
-      uncached data requests. See  :ref:`l2-request-flow` for more detail.
+    rst: The total number of L2 requests to Infinity Fabric to read :ref:`uncached
+      data <memory-type>` from any memory location, per :ref:`normalization unit <normalization-units>`.
+      64B requests for uncached data are counted as two 32B uncached data requests.
+      See :ref:`l2-request-flow` for more detail.
     unit: Requests per normalization unit
   Read Bandwidth - HBM:
     rst: Total number of bytes due to L2 read requests due to HBM traffic, divided
@@ -1024,14 +1046,14 @@ L2 - Fabric interface detailed metrics:
       by total duration.
     unit: Gbps
   Remote Read:
-    rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of  data
-      from any source other than the accelerator's local HBM, per  :ref:`normalization
-      unit <normalization-units>`. See  :ref:`l2-request-flow` for more detail.
+    rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of
+      data from any source other than the accelerator's local HBM, per :ref:`normalization
+      unit <normalization-units>`. See :ref:`l2-request-flow` for more detail.
     unit: Requests per normalization unit
   Remote Write and Atomic:
-    rst: The total number of L2 requests to Infinity Fabric to write or atomically  update
-      32B or 64B of data in any memory location other than the  accelerator's local
-      HBM, per  :ref:`normalization unit <normalization-units>`. See  :ref:`l2-request-flow`
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically
+      update 32B or 64B of data in any memory location other than the accelerator's
+      local HBM, per :ref:`normalization unit <normalization-units>`. See :ref:`l2-request-flow`
       for more detail.
     unit: Requests per normalization unit
   Write Bandwidth - HBM:
@@ -1047,19 +1069,19 @@ L2 - Fabric interface detailed metrics:
       by total duration.
     unit: Gbps
   Write and Atomic (32B):
-    rst: The total number of L2 requests to Infinity Fabric to write or atomically  update
-      32B of data to any memory location, per  :ref:`normalization unit <normalization-units>`.
-      See  :ref:`l2-request-flow` for more detail.
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically
+      update 32B of data to any memory location, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail.
     unit: Requests per normalization unit
   Write and Atomic (64B):
-    rst: The total number of L2 requests to Infinity Fabric to write or atomically  update
-      64B of data in any memory location, per  :ref:`normalization unit <normalization-units>`.
-      See  :ref:`l2-request-flow` for more detail.
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically
+      update 64B of data in any memory location, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail.
     unit: Requests per normalization unit
   Write and Atomic (Uncached):
-    rst: The total number of L2 requests to Infinity Fabric to write or atomically  update
-      32B or 64B of :ref:`uncached data <memory-type>`, per  :ref:`normalization unit
-      <normalization-units>`. See  :ref:`l2-request-flow` for more detail.
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically
+      update 32B or 64B of :ref:`uncached data <memory-type>`, per :ref:`normalization
+      unit <normalization-units>`. See :ref:`l2-request-flow` for more detail.
     unit: Requests per normalization unit
 L2 - Fabric Interface stalls:
   Read - HBM Stall:
@@ -1078,9 +1100,9 @@ L2 - Fabric Interface stalls:
       active L2 cycles <total-active-l2-cycles>`.
     unit: Percent
   Write - Credit Starvation:
-    rst: The number of cycles the L2-Fabric interface was stalled on write or  atomic
-      requests to any memory location because too many write/atomic  requests were
-      currently in flight, as a percent of the  :ref:`total active L2 cycles <total-active-l2-cycles>`.
+    rst: The number of cycles the L2-Fabric interface was stalled on write or atomic
+      requests to any memory location because too many write/atomic requests were
+      currently in flight, as a percent of the :ref:`total active L2 cycles <total-active-l2-cycles>`.
     unit: Percent
   Write - HBM Stall:
     rst: The number of cycles the L2-Fabric interface was stalled on write or atomic
@@ -1098,137 +1120,140 @@ L2 - Fabric Interface stalls:
     unit: Percent
 Scalar L1D Speed-of-Light:
   Bandwidth Utilization:
-    rst: The number of bytes looked up in the sL1D cache, as a percent of the peak  theoretical
-      bandwidth. Calculated as the ratio of sL1D requests over the  :ref:`total sL1D
-      cycles <total-sl1d-cycles>`.
+    rst: The number of bytes looked up in the sL1D cache, as a percent of the peak
+      theoretical bandwidth. Calculated as the ratio of sL1D requests over the :ref:`total
+      sL1D cycles <total-sl1d-cycles>`.
     unit: Percent
   Cache Hit Rate:
-    rst: Indicates the percent of sL1D requests that hit on a previously loaded  line
-      the cache. The ratio of the number of sL1D requests that hit  [#sl1d-cache]_
+    rst: Indicates the percent of sL1D requests that hit on a previously loaded line
+      in the cache. The ratio of the number of sL1D requests that hit [#sl1d-cache]_
       over the number of all sL1D requests.
     unit: Percent
   sL1D-L2 BW Utilization:
-    rst: The percentage of the peak theoretical sL1D - L2 interface bandwidth acheived.\
-      \ Caclulated as total number of bytes read from, written to, or atomically updated\
-      \ across the sL1D - L2 interface.
+    rst: The percentage of the peak theoretical sL1D - L2 interface bandwidth achieved.
+      Calculated as total number of bytes read from, written to, or atomically updated
+      across the sL1D - L2 interface.
     unit: Percent
 Scalar L1D cache accesses:
   Atomic Req:
-    rst: The total number of atomic requests from sL1D to the  :doc:`L2 <l2-cache>`,
-      per  :ref:`normalization unit <normalization-units>`. Typically unused on  current
+    rst: The total number of atomic requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`. Typically unused on current
       CDNA accelerators.
     unit: Requests per normalization unit
   Cache Hit Rate:
-    rst: Indicates the percent of sL1D requests that hit on a previously loaded  line
-      the cache. The ratio of the number of sL1D requests that hit  [#sl1d-cache]_
+    rst: Indicates the percent of sL1D requests that hit on a previously loaded line
+      in the cache. The ratio of the number of sL1D requests that hit [#sl1d-cache]_
       over the number of all sL1D requests.
     unit: Percent
   Hits:
-    rst: The total number of sL1D requests that hit on a previously loaded cache  line,
+    rst: The total number of sL1D requests that hit on a previously loaded cache line,
       per :ref:`normalization unit <normalization-units>`.
     unit: Requests per normalization unit
   Misses - Non Duplicated:
-    rst: The total number of sL1D requests that missed on a cache line that *was  not*
-      already pending due to another request, per  :ref:`normalization unit <normalization-units>`.
-      See :ref:`desc-sl1d-sol`  for more detail.
+    rst: The total number of sL1D requests that missed on a cache line that *was not*
+      already pending due to another request, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`desc-sl1d-sol` for more detail.
     unit: Requests per normalization unit
   Misses- Duplicated:
-    rst: The total number of sL1D requests that missed on a cache line that *was*  already
-      pending due to another request, per  :ref:`normalization unit <normalization-units>`.
-      See  :ref:`desc-sl1d-sol` for more detail.
+    rst: The total number of sL1D requests that missed on a cache line that *was*
+      already pending due to another request, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`desc-sl1d-sol` for more detail.
     unit: Requests per normalization unit
   Read Req (1 DWord):
-    rst: The total number of sL1D read requests made for a single dword of data  (4B),
+    rst: The total number of sL1D read requests made for a single dword of data (4B),
       per :ref:`normalization unit <normalization-units>`.
     unit: Requests per normalization unit
   Read Req (16 DWord):
-    rst: The total number of sL1D read requests made for a sixteen dwords of data  (64B),
-      per :ref:`normalization unit <normalization-units>`.
+    rst: The total number of sL1D read requests made for sixteen dwords of data
+      (64B), per :ref:`normalization unit <normalization-units>`.
     unit: Requests per normalization unit
   Read Req (2 DWord):
-    rst: The total number of sL1D read requests made for a two dwords of data  (8B),
+    rst: The total number of sL1D read requests made for two dwords of data (8B),
       per :ref:`normalization unit <normalization-units>`.
     unit: Requests per normalization unit
   Read Req (4 DWord):
-    rst: The total number of sL1D read requests made for a four dwords of data  (16B),
+    rst: The total number of sL1D read requests made for four dwords of data (16B),
       per :ref:`normalization unit <normalization-units>`.
     unit: Requests per normalization unit
   Read Req (8 DWord):
-    rst: The total number of sL1D read requests made for a eight dwords of data  (32B),
+    rst: The total number of sL1D read requests made for eight dwords of data (32B),
       per :ref:`normalization unit <normalization-units>`.
     unit: Requests per normalization unit
   Read Req (Total):
-    rst: The total number of sL1D read requests of any size, per  :ref:`normalization
+    rst: The total number of sL1D read requests of any size, per :ref:`normalization
       unit <normalization-units>`.
     unit: Requests per normalization unit
   Req:
-    rst: The total number of requests, of any size or type, made to the sL1D per  :ref:`normalization
+    rst: The total number of requests, of any size or type, made to the sL1D per :ref:`normalization
       unit <normalization-units>`.
     unit: Requests per normalization unit
 Scalar L1D Cache - L2 Interface:
   Atomic Req:
-    rst: The total number of atomic requests from sL1D to the  :doc:`L2 <l2-cache>`,
-      per  :ref:`normalization unit <normalization-units>`. Typically unused on  current
+    rst: The total number of atomic requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`. Typically unused on current
       CDNA accelerators.
     unit: Requests per normalization unit
   Read Req:
-    rst: The total number of read requests from sL1D to the :doc:`L2 <l2-cache>`,  per
-      :ref:`normalization unit <normalization-units>`.
+    rst: The total number of read requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`.
     unit: Requests per normalization unit
   Stall Cycles:
-    rst: "The total number of cycles the sL1D\u2194  :doc:`L2 <l2-cache>` interface\
-      \ was stalled, per  :ref:`normalization unit <normalization-units>`."
+    rst: |-
+      The total number of cycles the sL1D↔ :doc:`L2 <l2-cache>` interface
+      was stalled, per :ref:`normalization unit <normalization-units>`.
     unit: Cycles per normalization unit
   Write Req:
-    rst: The total number of write requests from sL1D to the :doc:`L2 <l2-cache>`,  per
-      :ref:`normalization unit <normalization-units>`. Typically unused on  current
+    rst: The total number of write requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`. Typically unused on current
       CDNA accelerators.
     unit: Requests per normalization unit
   sL1D-L2 BW:
-    rst: "The total number of bytes read from, written to, or atomically updated \
-      \ across the sL1D\u2194:doc:`L2 <l2-cache>` interface, divided by total duration.\
-      \ Note that sL1D writes and atomics are typically unused on current CDNA accelerators,\
-      \ so in the  majority of cases this can be interpreted as an sL1D\u2192L2 read\
-      \ bandwidth."
+    rst: |-
+      The total number of bytes read from, written to, or atomically updated
+      across the sL1D↔:doc:`L2 <l2-cache>` interface, divided by total duration.
+      Note that sL1D writes and atomics are typically
+      unused on current CDNA accelerators, so in the majority of cases this can
+      be interpreted as an sL1D→L2 read bandwidth.
     unit: Gbps
 L1I Speed-of-Light:
   Bandwidth Utilization:
-    rst: The number of bytes looked up in the L1I cache, as a percent of the peak  theoretical
-      bandwidth. Calculated as the ratio of L1I requests over the  :ref:`total L1I
-      cycles <total-l1i-cycles>`.
+    rst: The number of bytes looked up in the L1I cache, as a percent of the peak
+      theoretical bandwidth. Calculated as the ratio of L1I requests over the :ref:`total
+      L1I cycles <total-l1i-cycles>`.
     unit: Percent
   Cache Hit Rate:
-    rst: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded  line
-      the cache. Calculated as the ratio of the number of L1I requests  that hit over
-      the number of all L1I requests.
+    rst: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded
+      line in the cache. Calculated as the ratio of the number of L1I requests that hit
+      over the number of all L1I requests.
     unit: Percent
   L1I-L2 Bandwidth Utilization:
-    rst: "The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth\
-      \  achieved. Calculated as the ratio of the total number of requests from  the\
-      \ L1I to the L2 cache over the  :ref:`total L1I-L2 interface cycles <total-l1i-cycles>`."
+    rst: |-
+      The percent of the peak theoretical L1I → L2 cache request bandwidth
+      achieved. Calculated as the ratio of the total number of requests from
+      the L1I to the L2 cache over the :ref:`total L1I-L2 interface cycles <total-l1i-cycles>`.
     unit: Percent
 L1I cache accesses:
   Cache Hit Rate:
-    rst: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded  line
-      the cache. Calculated as the ratio of the number of L1I requests  that hit over
-      the number of all L1I requests.
+    rst: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded
+      line in the cache. Calculated as the ratio of the number of L1I requests that hit
+      over the number of all L1I requests.
     unit: Percent
   Hits:
-    rst: The total number of L1I requests that hit on a previously loaded cache  line,
+    rst: The total number of L1I requests that hit on a previously loaded cache line,
       per :ref:`normalization-unit <normalization-units>`.
     unit: Requests per normalization unit
   Instruction Fetch Latency:
-    rst: The average number of cycles spent to fetch instructions to a  :doc:`CU <compute-unit>`.
+    rst: The average number of cycles spent to fetch instructions to a :doc:`CU <compute-unit>`.
     unit: Cycles
   Misses - Duplicated:
-    rst: The total number of L1I requests that missed on a cache line that *were*  already
-      pending due to another request, per  :ref:`normalization-unit <normalization-units>`.
-      See note in  :ref:`desc-l1i-sol` for more detail.
+    rst: The total number of L1I requests that missed on a cache line that *were*
+      already pending due to another request, per :ref:`normalization-unit <normalization-units>`.
+      See note in :ref:`desc-l1i-sol` for more detail.
     unit: Requests per normalization unit
   Misses - Non Duplicated:
-    rst: The total number of L1I requests that missed on a cache line that  *were
-      not* already pending due to another request, per  :ref:`normalization-unit <normalization-units>`.
-      See note in  :ref:`desc-l1i-sol` for more detail.
+    rst: The total number of L1I requests that missed on a cache line that *were not*
+      already pending due to another request, per :ref:`normalization-unit <normalization-units>`.
+      See note in :ref:`desc-l1i-sol` for more detail.
     unit: Requests per normalization unit
   Req:
     rst: The total number of requests made to the L1I per normalization-unit
@@ -1251,29 +1276,30 @@ Workgroup manager utilizations:
     rst: The total number of workgroups forming this kernel launch.
     unit: Workgroups
   SGPR Writes:
-    rst: The average number of cycles spent initializing :ref:`SGPRs <desc-salu>`  at
-      wave creation.
+    rst: The average number of cycles spent initializing :ref:`SGPRs <desc-salu>`
+      at wave creation.
     unit: Cycles/wave
   SIMD Utilization:
-    rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel  where
-      any :ref:`SIMD <desc-valu>` on a CU was actively doing any work,  summed over
-      all CUs. Low values (less than 100%) indicate that the  accelerator was not
-      fully saturated by the kernel, or a potential  load-imbalance issue.
+    rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel
+      where any :ref:`SIMD <desc-valu>` on a CU was actively doing any work, summed
+      over all CUs. Low values (less than 100%) indicate that the accelerator was
+      not fully saturated by the kernel, or a potential load-imbalance issue.
     unit: Percent
   Scheduler-Pipe Utilization:
-    rst: 'The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>` in  the
-      kernel where the scheduler-pipes were actively doing any work. Note:  this value
-      is expected to range between 0% and 25%. See :ref:`desc-spi`.'
+    rst: |-
+      The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
+      in the kernel where the scheduler-pipes were actively doing any work. Note: this
+      value is expected to range between 0% and 25%. See :ref:`desc-spi`.
     unit: Percent
   Shader Engine Utilization:
-    rst: The percent of :ref:`total shader engine cycles <total-se-cycles>` in the  kernel
-      where any CU in a shader-engine was actively doing any work,  normalized over
-      all shader-engines. Low values (e.g., << 100%) indicate  that the accelerator
-      was not fully saturated by the kernel, or a  potential load-imbalance issue.
+    rst: The percent of :ref:`total shader engine cycles <total-se-cycles>` in the
+      kernel where any CU in a shader-engine was actively doing any work, normalized
+      over all shader-engines. Low values (e.g., << 100%) indicate that the accelerator
+      was not fully saturated by the kernel, or a potential load-imbalance issue.
     unit: Percent
   VGPR Writes:
-    rst: The average number of cycles spent initializing :ref:`VGPRs <desc-valu>`  at
-      wave creation.
+    rst: The average number of cycles spent initializing :ref:`VGPRs <desc-valu>`
+      at wave creation.
     unit: Cycles/wave
   Workgroup Manager Utilization:
     rst: The percent of cycles in the kernel where the workgroup manager was actively
@@ -1281,70 +1307,75 @@ Workgroup manager utilizations:
     unit: Percent
 Workgroup Manager - Resource Allocation:
   Insufficient CU Barriers:
-    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel  where
-      a workgroup could not be scheduled to a :doc:`CU <compute-unit>`  due to lack
+    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
+      a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to lack
       of available :ref:`barriers <desc-barrier>`.
     unit: Percent
   Insufficient CU LDS:
-    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel  where
-      a workgroup could not be scheduled to a :doc:`CU <compute-unit>`  due to lack
+    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
+      a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to lack
       of available :doc:`LDS <local-data-share>`.
     unit: Percent
   Insufficient SIMD SGPRs:
-    rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel  where
-      a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>`  due to lack
-      of available :ref:`SGPRs <desc-salu>`.
+    rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel
+      where a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to
+      lack of available :ref:`SGPRs <desc-salu>`.
     unit: Percent
   Insufficient SIMD VGPRs:
-    rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel  where
-      a workgroup could not be scheduled to a  :ref:`SIMD <desc-valu>`  due to lack
-      of available :ref:`VGPRs <desc-valu>`.
+    rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel
+      where a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to
+      lack of available :ref:`VGPRs <desc-valu>`.
     unit: Percent
   Insufficient SIMD Waveslots:
-    rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel  where
-      a workgroup could not be scheduled to a  :ref:`SIMD <desc-valu>`  due to lack
-      of available :ref:`waveslots <desc-valu>`.
+    rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel
+      where a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to
+      lack of available :ref:`waveslots <desc-valu>`.
     unit: Percent
   Not-scheduled Rate (Scheduler-Pipe):
-    rst: 'The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>` in  the
-      kernel where a workgroup could not be scheduled to a  :doc:`CU <compute-unit>`
-      due to a bottleneck within the scheduler-pipes  rather than a lack of a CU or
-      :ref:`SIMD <desc-valu>` with sufficient  resources. Note: this value is expected
-      to range between 0-25%, see note  in :ref:`workgroup manager <desc-spi>` description.'
+    rst: |-
+      The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
+      in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
+      due to a bottleneck within the scheduler-pipes rather than a lack of a CU
+      or :ref:`SIMD <desc-valu>` with sufficient resources. Note: this value is
+      expected to range between 0-25%, see note in :ref:`workgroup manager <desc-spi>`
+      description.
     unit: Percent
   Not-scheduled Rate (Workgroup Manager):
-    rst: 'The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>` in  the
-      kernel where a workgroup could not be scheduled to a  :doc:`CU <compute-unit>`
-      due to a bottleneck within the workgroup manager  rather than a lack of a CU
-      or :ref:`SIMD <desc-valu>` with sufficient  resources. Note: this value is expected
-      to range between 0-25%. See note  in :ref:`workgroup manager <desc-spi>` description.'
+    rst: |-
+      The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
+      in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
+      due to a bottleneck within the workgroup manager rather than a lack of a
+      CU or :ref:`SIMD <desc-valu>` with sufficient resources. Note: this value
+      is expected to range between 0-25%. See note in :ref:`workgroup manager <desc-spi>`
+      description.
     unit: Percent
   Reached CU Wavefront Limit:
-    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel  where
-      a wavefront could not be scheduled to a :doc:`CU <compute-unit>`  due to limits
-      within the workgroup manager.  This is expected to be  always be zero on CDNA2
-      or newer accelerators (and small for previous  accelerators).
+    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
+      a wavefront could not be scheduled to a :doc:`CU <compute-unit>` due to limits
+      within the workgroup manager. This is expected to always be zero on CDNA2
+      or newer accelerators (and small for previous accelerators).
     unit: Percent
   Reached CU Workgroup Limit:
-    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel  where
-      a workgroup could not be scheduled to a :doc:`CU <compute-unit>`  due to limits
-      within the workgroup manager.  This is expected to be  always be zero on CDNA2
-      or newer accelerators (and small for previous  accelerators).
+    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
+      a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to limits
+      within the workgroup manager. This is expected to always be zero on CDNA2
+      or newer accelerators (and small for previous accelerators).
     unit: Percent
   Scheduler-Pipe Stall Rate:
-    rst: 'The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>` in  the
-      kernel where a workgroup could not be scheduled to a  :doc:`CU <compute-unit>`
-      due to occupancy limitations (like a lack of a  CU or :ref:`SIMD <desc-valu>`
-      with sufficient resources). Note: this  value is expected to range between 0-25%,
-      see note in  :ref:`workgroup manager <desc-spi>` description.'
+    rst: |-
+      The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
+      in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
+      due to occupancy limitations (like a lack of a CU or :ref:`SIMD <desc-valu>`
+      with sufficient resources). Note: this value is expected to range between
+      0-25%, see note in :ref:`workgroup manager <desc-spi>` description.
     unit: Percent
   Scratch Stall Rate:
-    rst: The percent of :ref:`total shader-engine cycles <total-se-cycles>` in the  kernel
-      where a workgroup could not be scheduled to a  :doc:`CU <compute-unit>` due
-      to lack of  :ref:`private (a.k.a., scratch) memory <memory-type>` slots. While
-      this  can reach up to 100%, note that the actual occupancy limitations on a  kernel
-      using private memory are typically quite small (for example, less  than 1% of
-      the total number of waves that can be scheduled to an  accelerator).
+    rst: The percent of :ref:`total shader-engine cycles <total-se-cycles>` in the
+      kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
+      due to lack of :ref:`private (a.k.a., scratch) memory <memory-type>` slots.
+      While this can reach up to 100%, note that the actual occupancy limitations
+      on a kernel using private memory are typically quite small (for example, less
+      than 1% of the total number of waves that can be scheduled to an accelerator).
     unit: Percent
 Command processor fetcher (CPF):
   CPF Stall:
@@ -1359,9 +1390,9 @@ Command processor fetcher (CPF):
       was stalled for any reason.
     unit: Percent
   CPF-L2 Utilization:
-    rst: Percent of total cycles counted by the CPF-:doc:`L2 <l2-cache>` interface  where
-      the CPF-L2 interface was active doing any work. The ratio of CPF-L2  busy cycles
-      over total cycles counted by the CPF-L2.
+    rst: Percent of total cycles counted by the CPF-:doc:`L2 <l2-cache>` interface
+      where the CPF-L2 interface was active doing any work. The ratio of CPF-L2 busy
+      cycles over total cycles counted by the CPF-L2.
     unit: Percent
   CPF-UTCL1 Stall:
     rst: Percent of CPF busy cycles where the CPF was stalled by address translation.
@@ -1378,18 +1409,18 @@ Command processor packet processor (CPC):
       ratio of CPC busy cycles over total cycles counted by the CPC.
     unit: Percent
   CPC-L2 Utilization:
-    rst: Percent of total cycles counted by the CPC-:doc:`L2 <l2-cache>` interface  where
-      the CPC-L2 interface was active doing any work.
+    rst: Percent of total cycles counted by the CPC-:doc:`L2 <l2-cache>` interface
+      where the CPC-L2 interface was active doing any work.
     unit: Percent
   CPC-UTCL1 Stall:
     rst: Percent of CPC busy cycles where the CPC was stalled by address translation
     unit: Percent
   CPC-UTCL2 Utilization:
-    rst: Percent of total cycles counted by the CPC's :doc:`L2 <l2-cache>` address  translation
-      interface where the CPC was busy doing address translation  work.
+    rst: Percent of total cycles counted by the CPC's :doc:`L2 <l2-cache>` address
+      translation interface where the CPC was busy doing address translation work.
     unit: Percent
   CPC-Workgroup Manager Utilization:
-    rst: Percent of CPC busy cycles spent dispatching workgroups to the  :ref:`workgroup
+    rst: Percent of CPC busy cycles spent dispatching workgroups to the :ref:`workgroup
       manager <desc-spi>`.
     unit: Percent
 System Speed-of-Light:
@@ -1408,91 +1439,103 @@ System Speed-of-Light:
       over the :ref:`total active CU cycles <total-active-cu-cycles>`.
     unit: Instructions per-cycle
   L1I BW:
-    rst: The number of bytes looked up in the L1I cache per unit time. This is  also
-      presented as a percent of the peak theoretical bandwidth achievable  on the
-      specific accelerator.
+    rst: The number of bytes looked up in the L1I cache per unit time. This is also
+      presented as a percent of the peak theoretical bandwidth achievable on the specific
+      accelerator.
     unit: Percent
   L1I Fetch Latency:
-    rst: The average number of cycles spent to fetch instructions to a  :doc:`CU <compute-unit>`.
+    rst: The average number of cycles spent to fetch instructions to a :doc:`CU <compute-unit>`.
     unit: Cycles
   L1I Hit Rate:
-    rst: The percent of L1I requests that hit on a previously loaded line the  cache.
-      Calculated as the ratio of the number of L1I requests that hit  over the number
+    rst: The percent of L1I requests that hit on a previously loaded line in the cache.
+      Calculated as the ratio of the number of L1I requests that hit over the number
       of all L1I requests.
     unit: GB/s
   L2 Cache BW:
-    rst: The number of bytes looked up in the L2 cache per unit time.  The number  of
-      bytes is calculated as the number of cache lines requested multiplied  by the
-      cache line size. This value does not consider partial requests, so  e.g., if
-      only a single value is requested in a cache line, the data  movement will still
-      be counted as a full cache line. This is also  presented as a percent of the
-      peak theoretical bandwidth achievable on  the specific accelerator.
+    rst: The number of bytes looked up in the L2 cache per unit time. The number of
+      bytes is calculated as the number of cache lines requested multiplied by the
+      cache line size. This value does not consider partial requests, so e.g., if
+      only a single value is requested in a cache line, the data movement will still
+      be counted as a full cache line. This is also presented as a percent of the
+      peak theoretical bandwidth achievable on the specific accelerator.
     unit: GB/s
   L2 Cache Hit Rate:
-    rst: The ratio of the number of L2 cache line requests that hit in the L2  cache
-      over the total number of incoming cache line requests to the L2  cache.
+    rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
+      over the total number of incoming cache line requests to the L2 cache.
     unit: Percent
   L2-Fabric Read BW:
-    rst: "The number of bytes read by the L2 over the  :ref:`Infinity Fabric\u2122\
-      \ interface <l2-fabric>` per unit time. This is also  presented as a percent\
-      \ of the peak theoretical bandwidth achievable on  the specific accelerator."
+    rst: |-
+      The number of bytes read by the L2 over the :ref:`Infinity Fabric™
+      interface <l2-fabric>` per unit time. This is also presented as a percent
+      of the peak theoretical bandwidth achievable on the specific accelerator.
     unit: GB/s
   L2-Fabric Read Latency:
-    rst: The time-averaged number of cycles read requests spent in Infinity Fabric  before
-      data was returned to the L2.
+    rst: The time-averaged number of cycles read requests spent in Infinity Fabric
+      before data was returned to the L2.
     unit: Cycles
   L2-Fabric Write BW:
-    rst: The number of bytes sent by the L2 over the  :ref:`Infinity Fabric interface
-      <l2-fabric>` by write and atomic  operations per unit time. This is also presented
-      as a percent of the peak  theoretical bandwidth achievable on the specific accelerator.
+    rst: The number of bytes sent by the L2 over the :ref:`Infinity Fabric interface
+      <l2-fabric>` by write and atomic operations per unit time. This is also presented
+      as a percent of the peak theoretical bandwidth achievable on the specific accelerator.
     unit: GB/s
   L2-Fabric Write Latency:
-    rst: The time-averaged number of cycles write requests spent in Infinity  Fabric
+    rst: The time-averaged number of cycles write requests spent in Infinity Fabric
       before a completion acknowledgement was returned to the L2.
     unit: Cycles
   LDS Bank Conflicts/Access:
-    rst: The ratio of the number of cycles spent in the  :doc:`LDS scheduler <local-data-share>`
-      due to bank conflicts (as  determined by the conflict resolution hardware) to
-      the base number of  cycles that would be spent in the LDS scheduler in a completely  uncontended
-      case. This is also presented in normalized form (i.e., the  Bank Conflict Rate).
+    rst: The ratio of the number of cycles spent in the :doc:`LDS scheduler <local-data-share>`
+      due to bank conflicts (as determined by the conflict resolution hardware) to
+      the base number of cycles that would be spent in the LDS scheduler in a completely
+      uncontended case. This is also presented in normalized form (i.e., the Bank
+      Conflict Rate).
     unit: Conflicts/Access
   MFMA FLOPs (BF16):
-    rst: 'The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
+    rst: |-
+      The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
       operations executed per second. Note: this does not include any 16-bit brain
-      floating point operations from :ref:`VALU <desc-valu>` instructions. This is
-      also presented as a percent of the peak theoretical BF16 MFMA operations achievable
-      on the specific accelerator.'
+      floating point operations from :ref:`VALU <desc-valu>` instructions. This
+      is also presented as a percent of the peak theoretical BF16 MFMA operations
+      achievable on the specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F16):
-    rst: 'The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
-      executed per second. Note: this does not include any 16-bit floating point operations
-      from :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
-      of the peak theoretical F16 MFMA operations achievable on the specific accelerator.'
+    rst: |-
+      The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 16-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F16 MFMA operations achievable on the
+      specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F32):
-    rst: 'The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
-      executed per second. Note: this does not include any 32-bit floating point operations
-      from :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
-      of the peak theoretical F32 MFMA operations achievable on the specific accelerator.'
+    rst: |-
+      The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 32-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F32 MFMA operations achievable on the
+      specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F64):
-    rst: 'The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
-      executed per second. Note: this does not include any 64-bit floating point operations
-      from :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
-      of the peak theoretical F64 MFMA operations achievable on the specific accelerator.'
+    rst: |-
+      The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 64-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F64 MFMA operations achievable on the
+      specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F8):
-    rst: 'The total number of 8-bit brain floating point :ref:`MFMA <desc-mfma>` operations
-      executed per second. Note: this does not include any 16-bit brain floating point
-      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
-      as a percent of the peak theoretical F8 MFMA operations achievable on the specific
-      accelerator. It is supported on AMD Instinct MI300 series and later only.'
+    rst: |-
+      The total number of 8-bit brain floating point :ref:`MFMA <desc-mfma>`
+      operations executed per second. Note: this does not include any 8-bit brain
+      floating point operations from :ref:`VALU <desc-valu>` instructions. This
+      is also presented as a percent of the peak theoretical F8 MFMA operations
+      achievable on the specific accelerator. It is supported on AMD Instinct MI300
+      series and later only.
     unit: GFLOPs
   MFMA IOPs (Int8):
-    rst: 'The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
-      per second. Note: this does not include any 8-bit integer operations from :ref:`VALU
-      <desc-valu>` instructions. This is also presented as a percent of the peak theoretical
-      INT8 MFMA operations achievable on the specific accelerator.'
+    rst: |-
+      The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
+      per second. Note: this does not include any 8-bit integer operations from
+      :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
+      of the peak theoretical INT8 MFMA operations achievable on the specific accelerator.
     unit: GIOPs
   MFMA Utilization:
     rst: Indicates what percent of the kernel's duration the :ref:`MFMA <desc-mfma>`
@@ -1519,16 +1562,18 @@ System Speed-of-Light:
       time-averaged over all VALU instructions run on all wavefronts in the kernel.
     unit: Work-items
   VALU FLOPs:
-    rst: 'The total floating-point operations executed per second on the :ref:`VALU
-      <desc-valu>`. This is also presented as a percent of the peak theoretical FLOPs
-      achievable on the specific accelerator. Note: this does not include any floating-point
-      operations from :ref:`MFMA <desc-mfma>` instructions.'
+    rst: |-
+      The total floating-point operations executed per second on the :ref:`VALU
+      <desc-valu>`. This is also presented as a percent of the peak theoretical
+      FLOPs achievable on the specific accelerator. Note: this does not include
+      any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.
     unit: GFLOPs
   VALU IOPs:
-    rst: 'The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
+    rst: |-
+      The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
       This is also presented as a percent of the peak theoretical IOPs achievable
       on the specific accelerator. Note: this does not include any integer operations
-      from :ref:`MFMA <desc-mfma>` instructions.'
+      from :ref:`MFMA <desc-mfma>` instructions.
-    unit: GOIPs
+    unit: GIOPs
   VALU Utilization:
     rst: Indicates what percent of the kernel's duration the :ref:`VALU <desc-valu>`
@@ -1540,36 +1585,37 @@ System Speed-of-Light:
     rst: Indicates what percent of the kernel's duration the :ref:`VMEM <desc-vmem>`
       unit was busy executing instructions, including both global/generic and spill/scratch
       operations (see the :ref:`VMEM instruction count metrics <ta-instruction-counts>`
-      for more detail).  Does not include :ref:`VALU <desc-valu>` operations. Computed
+      for more detail). Does not include :ref:`VALU <desc-valu>` operations. Computed
       as the ratio of the total number of cycles spent by the :ref:`scheduler <desc-scheduler>`
       issuing VMEM instructions over the :ref:`total CU cycles <total-cu-cycles>`.
     unit: Percent
   Wavefront Occupancy:
-    rst: 'The time-averaged number of wavefronts resident on the accelerator over
+    rst: |-
+      The time-averaged number of wavefronts resident on the accelerator over
       the lifetime of the kernel. Note: this metric may be inaccurate for short-running
       kernels (less than 1ms). This is also presented as a percent of the peak theoretical
-      occupancy achievable on the specific accelerator.'
+      occupancy achievable on the specific accelerator.
     unit: Wavefronts
   sL1D Cache BW:
-    rst: The number of bytes looked up in the sL1D cache per unit time. This is  also
-      presented as a percent of the peak theoretical bandwidth achievable  on the
-      specific accelerator.
+    rst: The number of bytes looked up in the sL1D cache per unit time. This is also
+      presented as a percent of the peak theoretical bandwidth achievable on the specific
+      accelerator.
     unit: GB/s
   sL1D Cache Hit Rate:
-    rst: The percent of sL1D requests that hit on a previously loaded line the  cache.
-      Calculated as the ratio of the number of sL1D requests that hit  over the number
+    rst: The percent of sL1D requests that hit on a previously loaded line in the cache.
+      Calculated as the ratio of the number of sL1D requests that hit over the number
       of all sL1D requests.
     unit: Percent
   vL1D Cache BW:
-    rst: The number of bytes looked up in the vL1D cache as a result of  :ref:`VMEM
-      <desc-vmem>` instructions per unit time. The number of bytes  is calculated
-      as the number of cache lines requested multiplied by the  cache line size. This
-      value does not consider partial requests, so e.g.,  if only a single value is
-      requested in a cache line, the data movement  will still be counted as a full
-      cache line. This is also presented as a  percent of the peak theoretical bandwidth
-      achievable on the specific  accelerator.
+    rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
+      <desc-vmem>` instructions per unit time. The number of bytes is calculated as
+      the number of cache lines requested multiplied by the cache line size. This
+      value does not consider partial requests, so e.g., if only a single value is
+      requested in a cache line, the data movement will still be counted as a full
+      cache line. This is also presented as a percent of the peak theoretical bandwidth
+      achievable on the specific accelerator.
     unit: GB/s
   vL1D Cache Hit Rate:
-    rst: The ratio of the number of vL1D cache line requests that hit in vL1D  cache
-      over the total number of cache line requests to the  :ref:`vL1D cache RAM <desc-tc>`.
+    rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache
+      over the total number of cache line requests to the :ref:`vL1D cache RAM <desc-tc>`.
     unit: Percent
diff --git a/projects/rocprofiler-compute/docs/how-to/analyze/cli.rst b/projects/rocprofiler-compute/docs/how-to/analyze/cli.rst
index 6b358182b1..c6013455d3 100644
--- a/projects/rocprofiler-compute/docs/how-to/analyze/cli.rst
+++ b/projects/rocprofiler-compute/docs/how-to/analyze/cli.rst
@@ -19,7 +19,7 @@ This section provides an overview of ROCm Compute Profiler's CLI analysis featur
 * :ref:`Filtering <cli-analysis-options>`: Hone in on a particular kernel,
   GPU ID, or dispatch ID via post-process filtering.
 
-* :ref:`Per-kernel roofline analysis <per-kernel-roofline>`: Detailed arithmetic 
+* :ref:`Per-kernel roofline analysis <per-kernel-roofline>`: Detailed arithmetic
    intensity and performance analysis for individual kernels.
 
 Run ``rocprof-compute analyze -h`` for more details.
@@ -214,6 +214,90 @@ There are three high-level GPU analysis views:
       │ 2.1.28  │ Instr Fetch Latency       │ 21.729248046875       │ Cycles           │                    │                        │
       ╘═════════╧═══════════════════════════╧═══════════════════════╧══════════════════╧════════════════════╧════════════════════════╛
 
+   Alternatively, use the option ``-b`` (or ``--block``) with block alias(es).
+   The following snippet shows how to generate a report containing only metric 2 (System Speed-of-Light) by passing its block alias, ``sol``:
+
+   .. code-block:: shell-session
+
+      $ rocprof-compute analyze -p workloads/vcopy/MI200/ -b sol
+
+      --------
+      Analyze
+      --------
+
+      --------------------------------------------------------------------------------
+      1. Top Stat
+      ╒════╤══════════════════════════════════════════╤═════════╤═══════════╤════════════╤══════════════╤════════╕
+      │    │ KernelName                               │   Count │   Sum(ns) │   Mean(ns) │   Median(ns) │    Pct │
+      ╞════╪══════════════════════════════════════════╪═════════╪═══════════╪════════════╪══════════════╪════════╡
+      │  0 │ vecCopy(double*, double*, double*, int,  │       1 │  20000.00 │   20000.00 │     20000.00 │ 100.00 │
+      │    │ int) [clone .kd]                         │         │           │            │              │        │
+      ╘════╧══════════════════════════════════════════╧═════════╧═══════════╧════════════╧══════════════╧════════╛
+
+
+      --------------------------------------------------------------------------------
+      2. System Speed-of-Light
+      ╒═════════╤═══════════════════════════╤═══════════════════════╤══════════════════╤════════════════════╤════════════════════════╕
+      │ Index   │ Metric                    │ Value                 │ Unit             │ Peak               │ PoP                    │
+      ╞═════════╪═══════════════════════════╪═══════════════════════╪══════════════════╪════════════════════╪════════════════════════╡
+      │ 2.1.0   │ VALU FLOPs                │ 0.0                   │ Gflop            │ 22630.4            │ 0.0                    │
+      ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤
+      │ 2.1.1   │ VALU IOPs                 │ 367.0016              │ Giop             │ 22630.4            │ 1.6217194570135745     │
+      ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤
+      │ 2.1.2   │ MFMA FLOPs (BF16)         │ 0.0                   │ Gflop            │ 90521.6            │ 0.0                    │
+      ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤
+      │ 2.1.3   │ MFMA FLOPs (F16)          │ 0.0                   │ Gflop            │ 181043.2           │ 0.0                    │
+      ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤
+      │ 2.1.4   │ MFMA FLOPs (F32)          │ 0.0                   │ Gflop            │ 45260.8            │ 0.0                    │
+      ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤
+      │ 2.1.5   │ MFMA FLOPs (F64)          │ 0.0                   │ Gflop            │ 45260.8            │ 0.0                    │
+      ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤
+      │ 2.1.6   │ MFMA IOPs (Int8)          │ 0.0                   │ Giop             │ 181043.2           │ 0.0                    │
+      ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤
+      │ 2.1.7   │ Active CUs                │ 74                    │ Cus              │ 104                │ 71.15384615384616      │
+      ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤
+      │ 2.1.8   │ SALU Util                 │ 4.016057506716307     │ Pct              │ 100                │ 4.016057506716307      │
+      ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤
+      │ 2.1.9   │ VALU Util                 │ 5.737225009594725     │ Pct              │ 100                │ 5.737225009594725      │
+      ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤
+      │ 2.1.10  │ MFMA Util                 │ 0.0                   │ Pct              │ 100                │ 0.0                    │
+      ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤
+      │ 2.1.11  │ VALU Active Threads/Wave  │ 64.0                  │ Threads          │ 64                 │ 100.0                  │
+      ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤
+      │ 2.1.12  │ IPC - Issue               │ 1.0                   │ Instr/cycle      │ 5                  │ 20.0                   │
+      ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤
+      │ 2.1.13  │ LDS BW                    │ 0.0                   │ Gb/sec           │ 22630.4            │ 0.0                    │
+      ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤
+      │ 2.1.14  │ LDS Bank Conflict         │                       │ Conflicts/access │ 32                 │                        │
+      ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤
+      │ 2.1.15  │ Instr Cache Hit Rate      │ 99.91306912556854     │ Pct              │ 100                │ 99.91306912556854      │
+      ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤
+      │ 2.1.16  │ Instr Cache BW            │ 209.7152              │ Gb/s             │ 6092.8             │ 3.442016806722689      │
+      ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤
+      │ 2.1.17  │ Scalar L1D Cache Hit Rate │ 99.81986908342313     │ Pct              │ 100                │ 99.81986908342313      │
+      ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤
+      │ 2.1.18  │ Scalar L1D Cache BW       │ 209.7152              │ Gb/s             │ 6092.8             │ 3.442016806722689      │
+      ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤
+      │ 2.1.19  │ Vector L1D Cache Hit Rate │ 50.0                  │ Pct              │ 100                │ 50.0                   │
+      ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤
+      │ 2.1.20  │ Vector L1D Cache BW       │ 1677.7216             │ Gb/s             │ 11315.199999999999 │ 14.82714932126697      │
+      ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤
+      │ 2.1.21  │ L2 Cache Hit Rate         │ 35.55067615693325     │ Pct              │ 100                │ 35.55067615693325      │
+      ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤
+      │ 2.1.22  │ L2-Fabric Read BW         │ 419.8496              │ Gb/s             │ 1638.4             │ 25.6255859375          │
+      ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤
+      │ 2.1.23  │ L2-Fabric Write BW        │ 293.9456              │ Gb/s             │ 1638.4             │ 17.941015625           │
+      ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤
+      │ 2.1.24  │ L2-Fabric Read Latency    │ 256.6482321288385     │ Cycles           │                    │                        │
+      ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤
+      │ 2.1.25  │ L2-Fabric Write Latency   │ 317.2264255699014     │ Cycles           │                    │                        │
+      ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤
+      │ 2.1.26  │ Wave Occupancy            │ 1821.723057333852     │ Wavefronts       │ 3328               │ 54.73927455931046      │
+      ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤
+      │ 2.1.27  │ Instr Fetch BW            │ 4.174722306564298e-08 │ Gb/s             │ 3046.4             │ 1.3703789084047721e-09 │
+      ├─────────┼───────────────────────────┼───────────────────────┼──────────────────┼────────────────────┼────────────────────────┤
+      │ 2.1.28  │ Instr Fetch Latency       │ 21.729248046875       │ Cycles           │                    │                        │
+      ╘═════════╧═══════════════════════════╧═══════════════════════╧══════════════════╧════════════════════╧════════════════════════╛
    .. note::
 
       Some cells may be blank indicating a missing or unavailable hardware
@@ -245,6 +329,11 @@ List metrics
 
      $ rocprof-compute analyze -p workloads/vcopy/MI200/  --list-metrics gfx90a
 
+List IP blocks
+  .. code-block:: shell
+
+     $ rocprof-compute analyze -p workloads/vcopy/MI200/  --list-blocks gfx90a
+
 Show Description column which is excluded by default in cli output
   .. code-block:: shell
 
diff --git a/projects/rocprofiler-compute/docs/how-to/profile/mode.rst b/projects/rocprofiler-compute/docs/how-to/profile/mode.rst
index c3296c51d9..925779c4fc 100644
--- a/projects/rocprofiler-compute/docs/how-to/profile/mode.rst
+++ b/projects/rocprofiler-compute/docs/how-to/profile/mode.rst
@@ -261,7 +261,7 @@ detailed description of profiling filters available when using ROCm Compute Prof
 Filtering options
 -----------------
 
-``-b``, ``--block <block-name>``
+``-b``, ``--block <block-id|block-alias|metric-id>``
    Allows system profiling on one or more selected analysis report blocks to speed
    up the profiling process. See :ref:`profiling-hw-component-filtering`.
    Note that this option cannot be used with ``--roof-only`` or ``--set``.
diff --git a/projects/rocprofiler-compute/docs/how-to/use.rst b/projects/rocprofiler-compute/docs/how-to/use.rst
index 614be6a53b..0154a06b6d 100644
--- a/projects/rocprofiler-compute/docs/how-to/use.rst
+++ b/projects/rocprofiler-compute/docs/how-to/use.rst
@@ -70,6 +70,13 @@ to view the metrics for current system architecture:
    $ rocprof-compute --list-metrics <sys_arch>
    $ rocprof-compute profile --list-available-metrics
 
+To view available aliases by hardware block, use the ``--list-blocks``
+option with a system architecture argument:
+
+.. code-block:: shell
+
+   $ rocprof-compute --list-blocks <sys_arch>
+
 .. _basic-analyze-cli:
 
 Analyze in the command line
diff --git a/projects/rocprofiler-compute/src/argparser.py b/projects/rocprofiler-compute/src/argparser.py
index e0fdf367a1..0c42b0d110 100644
--- a/projects/rocprofiler-compute/src/argparser.py
+++ b/projects/rocprofiler-compute/src/argparser.py
@@ -25,13 +25,30 @@
 
 import argparse
 import os
-import re
 from pathlib import Path
 from typing import Optional
 
+from utils.utils import METRIC_ID_RE
 
-def print_avail_arch(avail_arch: list[str]) -> str:
-    ret_str = "List all available metrics for analysis on specified arch:"
+
+def validate_block(value: str) -> str:  # value must match METRIC_ID_RE
+    if not METRIC_ID_RE.match(value):
+        raise argparse.ArgumentTypeError(f"Invalid metric id: {value}")
+    return value
+
+
+def block_token_or_alias(s: str) -> str:
+    try:
+        token = validate_block(s)  # well-formed metric ids pass through as-is
+    except argparse.ArgumentTypeError:
+        token = (s or "").strip()  # anything else is kept as a candidate alias
+        if not token:
+            raise argparse.ArgumentTypeError("empty token for --block")
+    return token
+
+
+def print_avail_arch(avail_arch: list[str], args: str) -> str:
+    ret_str = f"List all available {args} for analysis on specified arch:"
     for arch in avail_arch:
         ret_str += f"\n   {arch}"
     return ret_str
@@ -66,7 +83,14 @@ def add_general_group(
         dest="list_metrics",
         metavar="",
         choices=supported_archs.keys(),  # ["gfx908", "gfx90a"],
-        help=print_avail_arch(list(supported_archs.keys())),
+        help=print_avail_arch(list(supported_archs.keys()), "metrics"),
+    )
+    general_group.add_argument(
+        "--list-blocks",
+        dest="list_blocks",
+        metavar="",
+        choices=supported_archs.keys(),  # ["gfx908", "gfx90a"],
+        help=print_avail_arch(list(supported_archs.keys()), "blocks"),
     )
     general_group.add_argument(
         "--config-dir",
@@ -234,12 +258,6 @@ Examples:
         help="\t\t\tDispatch ID filtering.",
     )
 
-    def validate_block(value: str) -> str:
-        # Metric id is of the form I or I.I or I.I.I where I is two digit number.
-        if re.compile(r"^\d{1,2}(?:\.\d{1,2}){0,2}$").match(value):
-            return value
-        raise argparse.ArgumentTypeError(f"Invalid metric id: {value}")
-
     profile_group.add_argument(
         "--list-available-metrics",
         dest="list_available_metrics",
@@ -249,15 +267,19 @@ Examples:
     profile_group.add_argument(
         "-b",
         "--block",
-        type=validate_block,
         dest="filter_blocks",
         metavar="",
         nargs="+",
+        type=block_token_or_alias,
         required=False,
         default=[],
         help=(
             "\t\t\tSpecify metric id(s) from --list-metrics for filtering "
             "(e.g. 12, 12.1, 12.1.1).\n"
+            "\t\t\tAlternatively, specify block id(s) for filtering "
+            "(e.g. 12, 13, 14).\n"
+            "\t\t\tAlternatively, specify block alias(es) for filtering "
+            "(e.g. lds, l1i, sl1d).\n"
             "\t\t\tCan provide multiple space separated arguments.\n"
             "\t\t\tCannot be used with --set or --roof-only"
         ),
@@ -656,6 +678,7 @@ Examples:
         dest="filter_metrics",
         metavar="",
         nargs="+",
+        type=block_token_or_alias,
         help="\t\tSpecify metric id(s) from --list-metrics for filtering.",
     )
     analyze_group.add_argument(
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_base.py b/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_base.py
index 7c5dd5d1cb..464edf8cd4 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_base.py
+++ b/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_base.py
@@ -45,7 +45,12 @@ from utils.logger import (
     console_warning,
     demarcate,
 )
-from utils.utils import get_uuid, is_workload_empty, merge_counters_spatial_multiplex
+from utils.utils import (
+    get_panel_alias,
+    get_uuid,
+    is_workload_empty,
+    merge_counters_spatial_multiplex,
+)
 
 # the build-in config to list kernel names purpose only
 TOP_STATS_BUILD_IN_CONFIG: OrderedDict[int, dict[str, Any]] = OrderedDict([
@@ -160,21 +165,41 @@ class OmniAnalyze_Base:
         }
         for key, value in self._arch_configs[arch].metric_list.items():
             dot_count = str(key).count(".")
-            if dot_count == 0:
-                prefix = ""
-            elif dot_count == 1:
-                prefix = "\t"
-            else:
-                prefix = "\t\t"
+            indent = "\t" * min(dot_count, 2)
 
-            description = metric_descriptions.get(key, "") if dot_count > 1 else ""
+            print(f"{indent}{key} -> {value}\n")
 
-            print(f"{prefix}{key} -> {value}\n")
-            if description:
-                formatted_desc = f"\n{prefix}".join(
-                    textwrap.wrap(description, width=40)
-                )
-                print(f"{prefix}{formatted_desc}\n")
+            if dot_count > 1:
+                description = metric_descriptions.get(key, "")
+                if description:
+                    wrapped = textwrap.wrap(description, width=40)
+                    print(f"{indent}" + f"\n{indent}".join(wrapped) + "\n")
+
+        sys.exit(0)
+
+    @demarcate
+    def list_blocks(self) -> None:
+        """Print each top-level block's index, alias and name, then exit."""
+        args = self.get_args()
+        arch = args.list_blocks
+
+        if arch not in self.__supported_archs:
+            console_error("analysis", "Unsupported arch")
+        if arch not in self._arch_configs:
+            sys_info = file_io.load_sys_info(f"{args.path[0][0]}/sysinfo.csv")
+            self.generate_configs(
+                arch,
+                args.config_dir,
+                args.list_stats,
+                args.filter_metrics,
+                sys_info.iloc[0],
+            )
+
+        print(f"{'INDEX':<8} {'BLOCK ALIAS':<16} {'BLOCK NAME'}")
+        panel_alias_dict = get_panel_alias()  # loop-invariant: fetch once
+        for key, value in self._arch_configs[arch].metric_list.items():
+            if "." not in key:  # skip dotted (nested) keys
+                print(f"{key:<8} {panel_alias_dict[value]:<16} {value}")
 
         sys.exit(0)
 
@@ -208,6 +233,9 @@ class OmniAnalyze_Base:
         if args.list_metrics:
             self.list_metrics()
 
+        if args.list_blocks:
+            self.list_blocks()
+
         def get_sysinfo_path(data_path: str) -> Optional[str]:
             return (
                 data_path
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_base.py b/projects/rocprofiler-compute/src/rocprof_compute_base.py
index 275f29c367..0457f10b6b 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_base.py
+++ b/projects/rocprofiler-compute/src/rocprof_compute_base.py
@@ -49,6 +49,7 @@ from utils.mi_gpu_spec import mi_gpu_specs
 from utils.specs import MachineSpecs, generate_machine_specs
 from utils.utils import (
     detect_rocprof,
+    get_panel_alias,
     get_submodules,
     get_version,
     get_version_display,
@@ -142,6 +143,8 @@ class RocProfCompute:
 
         if self.__args.list_metrics is not None and block:
             console_error("Cannot use --list-metrics with --blocks")
+        if self.__args.list_blocks is not None and block:
+            console_error("Cannot use --list-blocks with --blocks")
         if (
             hasattr(self.__args, "list_available_metrics")
             and self.__args.list_available_metrics
@@ -194,6 +197,9 @@ class RocProfCompute:
             elif self.__args.list_metrics is not None:
                 self.list_metrics()
                 sys.exit(0)
+            elif self.__args.list_blocks is not None:
+                self.list_blocks()
+                sys.exit(0)
             elif self.__args.config_dir:
                 parser.print_help(sys.stderr)
                 console_error(
@@ -250,6 +256,34 @@ class RocProfCompute:
         else:
             console_error("Unsupported arch")
 
+    @demarcate
+    def list_blocks(self) -> None:
+        """Print each top-level block's index, alias and name, then exit."""
+        for_current_arch = getattr(self.__args, "list_available_metrics", False)
+        requested = self.__args.list_blocks
+
+        # Use the machine spec's arch when no arch was requested explicitly.
+        use_current = for_current_arch or requested is None
+        arch = self.__mspec.gpu_arch if use_current else requested
+        if arch in self.__supported_archs:
+            ac = schema.ArchConfig()
+            ac.panel_configs = file_io.load_panel_configs([
+                str(Path(self.__args.config_dir) / arch)
+            ])
+            sys_info = (
+                self.__mspec.get_class_members().iloc[0] if for_current_arch else None
+            )
+            parser.build_dfs(arch_configs=ac, filter_metrics=[], sys_info=sys_info)
+
+            print(f"{'INDEX':<8} {'BLOCK ALIAS':<16} {'BLOCK NAME'}")
+            panel_alias = get_panel_alias()  # loop-invariant: fetch once
+            for key, value in ac.metric_list.items():
+                if "." not in key:  # skip dotted (nested) keys
+                    print(f"{key:<8} {panel_alias[value]:<16} {value}")
+            sys.exit(0)
+        else:
+            console_error("Unsupported arch")
+
     @demarcate
     def list_sets(self) -> None:
         sets_info = parse_sets_yaml(self.__mspec.gpu_arch)
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py b/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py
index ee9719b001..93e1d8057d 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py
+++ b/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py
@@ -505,6 +505,7 @@ class RocProfCompute_Base:
         # PC sampling data is only collected when block "21" is specified
         if not (
-            "21" in args.filter_blocks
+            # either the block id or the pc_sampling token enables PC sampling
+            ("21" in args.filter_blocks or "pc_sampling" in args.filter_blocks)
             and self.__profiler in ("rocprofv3", "rocprofiler-sdk")
         ):
             return
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0000_top_stats.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0000_top_stats.yaml
index 55c6f6bb24..5ce5aeeb28 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0000_top_stats.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0000_top_stats.yaml
@@ -2,7 +2,6 @@
 Panel Config:
   id: 0
   title: Top Stats
-  metrics_description: {}
   data source:
   - raw_csv_table:
       id: 1
@@ -12,3 +11,4 @@ Panel Config:
       id: 2
       title: Dispatch List
       source: pmc_dispatch_info.csv
+  metrics_description: {}
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0100_system_info.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0100_system_info.yaml
index 23d024fde3..8b48c2253b 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0100_system_info.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0100_system_info.yaml
@@ -2,10 +2,10 @@
 Panel Config:
   id: 100
   title: System Info
-  metrics_description: {}
   data source:
   - raw_csv_table:
       id: 101
       title: System Info
       source: sysinfo.csv
       columnwise: true
+  metrics_description: {}
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml
index b48fd0b677..ae059bc0cb 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml
@@ -2,124 +2,6 @@
 Panel Config:
   id: 200
   title: System Speed-of-Light
-  metrics_description:
-    VALU FLOPs: 'The total floating-point operations executed per second on the VALU.
-      This is also presented as a percent of the peak theoretical FLOPs achievable
-      on the specific accelerator. Note: this does not include any floating-point
-      operations from MFMA instructions.'
-    VALU IOPs: 'The total integer operations executed per second on the VALU. This
-      is also presented as a percent of the peak theoretical IOPs achievable on the
-      specific accelerator. Note: this does not include any integer operations from
-      MFMA instructions.'
-    MFMA FLOPs (F8): The total number of 8-bit brain floating point MFMA operations
-      executed per second. This does not include any 16-bit brain floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F8 MFMA operations achievable on the specific accelerator. It is supported on
-      AMD Instinct MI300 series and later only.
-    MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations
-      executed per second. Note: this does not include any 16-bit brain floating point
-      operations from VALU instructions. This is also presented as a percent of the
-      peak theoretical BF16 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed
-      per second. Note: this does not include any 16-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F16 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed
-      per second. Note: this does not include any 32-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F32 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed
-      per second. Note: this does not include any 64-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F64 MFMA operations achievable on the specific accelerator.'
-    MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed
-      per second. Note: this does not include any 8-bit integer operations from VALU
-      instructions. This is also presented as a percent of the peak theoretical INT8
-      MFMA operations achievable on the specific accelerator.'
-    Active CUs: Total number of active compute units (CUs) on the accelerator during
-      the kernel execution.
-    SALU Utilization: Indicates what percent of the kernel's duration the SALU was
-      busy executing instructions. Computed as the ratio of the total number of cycles
-      spent by the scheduler issuing SALU or SMEM instructions over the total CU cycles.
-    VALU Utilization: Indicates what percent of the kernel's duration the VALU was
-      busy executing instructions. Does not include VMEM operations. Computed as the
-      ratio of the total number of cycles spent by the scheduler issuing VALU instructions
-      over the total CU cycles.
-    MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit
-      was busy executing instructions. Computed as the ratio of the total number of
-      cycles the MFMA was busy over the total CU cycles.
-    VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit
-      was busy executing instructions, including both global/generic and spill/scratch
-      operations (see the VMEM instruction count metrics) for more detail). Does not
-      include VALU operations. Computed as the ratio of the total number of cycles
-      spent by the scheduler issuing VMEM instructions over the total CU cycles.
-    Branch Utilization: Indicates what percent of the kernel's duration the branch
-      unit was busy executing instructions. Computed as the ratio of the total number
-      of cycles spent by the scheduler issuing branch instructions over the total
-      CU cycles
-    VALU Active Threads: Indicates the average level of divergence within a wavefront
-      over the lifetime of the kernel. The number of work-items that were active in
-      a wavefront during execution of each VALU instruction, time-averaged over all
-      VALU instructions run on all wavefronts in the kernel.
-    IPC: The ratio of the total number of instructions executed on the CU over the
-      total active CU cycles. This is also presented as a percent of the peak theoretical
-      bandwidth achievable on the specific accelerator.
-    Wavefront Occupancy: 'The time-averaged number of wavefronts resident on the accelerator
-      over the lifetime of the kernel. Note: this metric may be inaccurate for short-running
-      kernels (less than 1ms). This is also presented as a percent of the peak theoretical
-      occupancy achievable on the specific accelerator.'
-    Theoretical LDS Bandwidth: Indicates the maximum amount of bytes that could have
-      been loaded from, stored to, or atomically updated in the LDS per unit time
-      (see LDS Bandwidth example for more detail). This is also presented as a percent
-      of the peak theoretical F64 MFMA operations achievable on the specific accelerator.
-    LDS Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS
-      scheduler due to bank conflicts (as determined by the conflict resolution hardware)
-      to the base number of cycles that would be spent in the LDS scheduler in a completely
-      uncontended case. This is also presented in normalized form (i.e., the Bank
-      Conflict Rate).
-    vL1D Cache Hit Rate: The ratio of the number of vL1D cache line requests that
-      hit in vL1D cache over the total number of cache line requests to the vL1D cache
-      RAM.
-    vL1D Cache BW: The number of bytes looked up in the vL1D cache as a result of
-      VMEM instructions per unit time. The number of bytes is calculated as the number
-      of cache lines requested multiplied by the cache line size. This value does
-      not consider partial requests, so e.g., if only a single value is requested
-      in a cache line, the data movement will still be counted as a full cache line.
-      This is also presented as a percent of the peak theoretical bandwidth achievable
-      on the specific accelerator.
-    L2 Cache Hit Rate: The ratio of the number of L2 cache line requests that hit
-      in the L2 cache over the total number of incoming cache line requests to the
-      L2 cache.
-    L2 Cache BW: The number of bytes looked up in the L2 cache per unit time. The
-      number of bytes is calculated as the number of cache lines requested multiplied
-      by the cache line size. This value does not consider partial requests, so e.g.,
-      if only a single value is requested in a cache line, the data movement will
-      still be counted as a full cache line. This is also presented as a percent of
-      the peak theoretical bandwidth achievable on the specific accelerator.
-    L2-Fabric Read BW: "The number of bytes read by the L2 over the Infinity Fabric\u2122\
-      \ interface per unit time. This is also presented as a percent of the peak theoretical\
-      \ bandwidth achievable on the specific accelerator."
-    L2-Fabric Write BW: The number of bytes sent by the L2 over the Infinity Fabric
-      interface by write and atomic operations per unit time. This is also presented
-      as a percent of the peak theoretical bandwidth achievable on the specific accelerator.
-    L2-Fabric Read Latency: The time-averaged number of cycles read requests spent
-      in Infinity Fabric before data was returned to the L2.
-    L2-Fabric Write Latency: The time-averaged number of cycles write requests spent
-      in Infinity Fabric before a completion acknowledgement was returned to the L2.
-    sL1D Cache Hit Rate: The percent of sL1D requests that hit on a previously loaded
-      line the cache. Calculated as the ratio of the number of sL1D requests that
-      hit over the number of all sL1D requests.
-    sL1D Cache BW: The number of bytes looked up in the sL1D cache per unit time.
-      This is also presented as a percent of the peak theoretical bandwidth achievable
-      on the specific accelerator.
-    L1I Hit Rate: The number of bytes looked up in the L1I cache per unit time. This
-      is also presented as a percent of the peak theoretical bandwidth achievable
-      on the specific accelerator.
-    L1I BW: The percent of L1I requests that hit on a previously loaded line the cache.
-      Calculated as the ratio of the number of L1I requests that hit over the number
-      of all L1I requests.
-    L1I Fetch Latency: The average number of cycles spent to fetch instructions to
-      a CU.
   data source:
   - metric_table:
       id: 201
@@ -317,3 +199,125 @@ Panel Config:
           peak: None
           pop: None
           coll_level: SQ_IFETCH_LEVEL
+  metrics_description:
+    VALU FLOPs: |-
+      The total floating-point operations executed per second on the VALU.
+      This is also presented as a percent of the peak theoretical FLOPs achievable
+      on the specific accelerator. Note: this does not include any floating-point
+      operations from MFMA instructions.
+    VALU IOPs: |-
+      The total integer operations executed per second on the VALU. This is
+      also presented as a percent of the peak theoretical IOPs achievable on the
+      specific accelerator. Note: this does not include any integer operations from
+      MFMA instructions.
+    MFMA FLOPs (BF16): |-
+      The total number of 16-bit brain floating point MFMA operations executed
+      per second. Note: this does not include any 16-bit brain floating point operations
+      from VALU instructions. This is also presented as a percent of the peak theoretical
+      BF16 MFMA operations achievable on the specific accelerator.
+    MFMA FLOPs (F16): |-
+      The total number of 16-bit floating point MFMA operations executed per
+      second. Note: this does not include any 16-bit floating point operations from
+      VALU instructions. This is also presented as a percent of the peak theoretical
+      F16 MFMA operations achievable on the specific accelerator.
+    MFMA FLOPs (F32): |-
+      The total number of 32-bit floating point MFMA operations executed per
+      second. Note: this does not include any 32-bit floating point operations from
+      VALU instructions. This is also presented as a percent of the peak theoretical
+      F32 MFMA operations achievable on the specific accelerator.
+    MFMA FLOPs (F64): |-
+      The total number of 64-bit floating point MFMA operations executed per
+      second. Note: this does not include any 64-bit floating point operations from
+      VALU instructions. This is also presented as a percent of the peak theoretical
+      F64 MFMA operations achievable on the specific accelerator.
+    MFMA IOPs (Int8): |-
+      The total number of 8-bit integer MFMA operations executed per second.
+      Note: this does not include any 8-bit integer operations from VALU instructions.
+      This is also presented as a percent of the peak theoretical INT8 MFMA operations
+      achievable on the specific accelerator.
+    Active CUs: Total number of active compute units (CUs) on the accelerator during
+      the kernel execution.
+    SALU Utilization: Indicates what percent of the kernel's duration the SALU was
+      busy executing instructions. Computed as the ratio of the total number of cycles
+      spent by the scheduler issuing SALU or SMEM instructions over the total CU cycles.
+    VALU Utilization: Indicates what percent of the kernel's duration the VALU was
+      busy executing instructions. Does not include VMEM operations. Computed as the
+      ratio of the total number of cycles spent by the scheduler issuing VALU instructions
+      over the total CU cycles.
+    MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit
+      was busy executing instructions. Computed as the ratio of the total number of
+      cycles the MFMA was busy over the total CU cycles.
+    VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit
+      was busy executing instructions, including both global/generic and spill/scratch
+      operations (see the VMEM instruction count metrics for more detail). Does not
+      include VALU operations. Computed as the ratio of the total number of cycles
+      spent by the scheduler issuing VMEM instructions over the total CU cycles.
+    Branch Utilization: Indicates what percent of the kernel's duration the branch
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles spent by the scheduler issuing branch instructions over the total
+      CU cycles.
+    VALU Active Threads: Indicates the average level of divergence within a wavefront
+      over the lifetime of the kernel. The number of work-items that were active in
+      a wavefront during execution of each VALU instruction, time-averaged over all
+      VALU instructions run on all wavefronts in the kernel.
+    IPC: The ratio of the total number of instructions executed on the CU over the
+      total active CU cycles. This is also presented as a percent of the peak theoretical
+      bandwidth achievable on the specific accelerator.
+    Wavefront Occupancy: |-
+      The time-averaged number of wavefronts resident on the accelerator over
+      the lifetime of the kernel. Note: this metric may be inaccurate for short-running
+      kernels (less than 1ms). This is also presented as a percent of the peak theoretical
+      occupancy achievable on the specific accelerator.
+    Theoretical LDS Bandwidth: Indicates the maximum amount of bytes that could have
+      been loaded from, stored to, or atomically updated in the LDS per unit time
+      (see LDS Bandwidth example for more detail). This is also presented as a percent
+      of the peak theoretical bandwidth achievable on the specific accelerator.
+    LDS Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS
+      scheduler due to bank conflicts (as determined by the conflict resolution hardware)
+      to the base number of cycles that would be spent in the LDS scheduler in a completely
+      uncontended case. This is also presented in normalized form (i.e., the Bank
+      Conflict Rate).
+    vL1D Cache Hit Rate: The ratio of the number of vL1D cache line requests that
+      hit in vL1D cache over the total number of cache line requests to the vL1D cache
+      RAM.
+    vL1D Cache BW: The number of bytes looked up in the vL1D cache as a result of
+      VMEM instructions per unit time. The number of bytes is calculated as the number
+      of cache lines requested multiplied by the cache line size. This value does
+      not consider partial requests, so e.g., if only a single value is requested
+      in a cache line, the data movement will still be counted as a full cache line.
+      This is also presented as a percent of the peak theoretical bandwidth achievable
+      on the specific accelerator.
+    L2 Cache Hit Rate: The ratio of the number of L2 cache line requests that hit
+      in the L2 cache over the total number of incoming cache line requests to the
+      L2 cache.
+    L2 Cache BW: The number of bytes looked up in the L2 cache per unit time. The
+      number of bytes is calculated as the number of cache lines requested multiplied
+      by the cache line size. This value does not consider partial requests, so e.g.,
+      if only a single value is requested in a cache line, the data movement will
+      still be counted as a full cache line. This is also presented as a percent of
+      the peak theoretical bandwidth achievable on the specific accelerator.
+    L2-Fabric Read BW: |-
+      The number of bytes read by the L2 over the Infinity Fabric™ interface
+      per unit time. This is also presented as a percent of the peak theoretical
+      bandwidth achievable on the specific accelerator.
+    L2-Fabric Write BW: The number of bytes sent by the L2 over the Infinity Fabric
+      interface by write and atomic operations per unit time. This is also presented
+      as a percent of the peak theoretical bandwidth achievable on the specific accelerator.
+    L2-Fabric Read Latency: The time-averaged number of cycles read requests spent
+      in Infinity Fabric before data was returned to the L2.
+    L2-Fabric Write Latency: The time-averaged number of cycles write requests spent
+      in Infinity Fabric before a completion acknowledgement was returned to the L2.
+    sL1D Cache Hit Rate: The percent of sL1D requests that hit on a previously loaded
+      line in the cache. Calculated as the ratio of the number of sL1D requests that
+      hit over the number of all sL1D requests.
+    sL1D Cache BW: The number of bytes looked up in the sL1D cache per unit time.
+      This is also presented as a percent of the peak theoretical bandwidth achievable
+      on the specific accelerator.
+    L1I Hit Rate: The percent of L1I requests that hit on a previously loaded line
+      in the cache. Calculated as the ratio of the number of L1I requests that hit
+      over the number of all L1I requests.
+    L1I BW: The number of bytes looked up in the L1I cache per unit time. This is
+      also presented as a percent of the peak theoretical bandwidth achievable on
+      the specific accelerator.
+    L1I Fetch Latency: The average number of cycles spent to fetch instructions to
+      a CU.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml
index 06e680802e..d817e3e02f 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml
@@ -2,122 +2,6 @@
 Panel Config:
   id: 300
   title: Memory Chart
-  metrics_description:
-    Wavefront Occupancy: Wavefronts per active CU.
-    Wave Life: Average number of cycles executing a wave.
-    SALU: Total Number of SALU (Scalar ALU) instructions issued per normalization
-      unit.
-    SMEM: Total number of SMEM (Scalar Memory Read) instructions issued normalization
-      unit.
-    VALU: The number of VALU (Vector ALU) instructions issued per normalization unit.
-    MFMA: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued per
-      normalization unit.
-    VMEM: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch
-      memory) per normalization unit.
-    LDS: The total number of LDS instructions (including, but not limited to, read/write/atomics
-      and HIP's __shfl instructions) executed per normalization unit.
-    GWS: Total number of GDS (global data sync) instructions issued per normalization
-      unit.
-    BR: Total number of BRANCH instructions issued per normalization unit.
-    Active CUs: Total number of active compute units (CUs) on the accelerator during
-      the kernel execution.
-    Num CUs: Total number of compute units (CUs) on the accelerator.
-    VGPR: 'The number of architected vector general-purpose registers allocated for
-      the kernel, see VALU. Note: this may not exactly match the number of VGPRs requested
-      by the compiler due to allocation granularity.'
-    SGPR: 'The number of scalar general-purpose registers allocated for the kernel,
-      see SALU. Note: this may not exactly match the number of SGPRs requested by
-      the compiler due to allocation granularity.'
-    LDS Allocation: 'The number of bytes of LDS memory (or, shared memory) allocated
-      for this kernel. Note: This may also be larger than what was requested at compile
-      time due to both allocation granularity and dynamic per-dispatch LDS allocations.'
-    Scratch Allocation: The number of bytes of scratch memory requested per work-item
-      for this kernel. Scratch memory is used for stack memory on the accelerator,
-      as well as for register spills and restores.
-    Wavefronts: The total number of wavefronts, summed over all workgroups, forming
-      this kernel launch.
-    Workgroups: The total number of workgroups forming this kernel launch.
-    LDS Req: The total number of LDS instructions (including, but not limited to,
-      read/write/atomics and HIP's __shfl instructions) executed per normalization
-      unit.
-    LDS Util: Indicates what percent of the kernel's duration the LDS was actively
-      executing instructions (including, but not limited to, load, store, atomic and
-      HIP's __shfl operations). Calculated as the ratio of the total number of cycles
-      LDS was active over the total CU cycles.
-    LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return
-      / acknowledgment) required for an LDS instruction to complete.
-    VL1 Rd: The total number of incoming read requests from the address processing
-      unit after coalescing per normalization unit
-    VL1 Wr: The total number of incoming write requests from the address processing
-      unit after coalescing per normalization unit
-    VL1 Atomic: The total number of incoming atomic requests from the address processing
-      unit after coalescing per normalization unit
-    VL1 Hit: The ratio of the number of vL1D cache line requests that hit in vL1D
-      cache over the total number of cache line requests to the vL1D Cache RAM.
-    VL1 Lat: Calculated as the average number of cycles that a vL1D cache line request
-      spent in the vL1D cache pipeline.
-    VL1 Coalesce: Indicates how well memory instructions were coalesced by the address
-      processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated
-      as the average number of thread-requests generated per instruction divided by
-      the ideal number of thread-requests per instruction.
-    VL1 Stall: The ratio of the number of cycles where the vL1D is stalled waiting
-      to issue a request for data to the L2 cache divided by the number of cycles
-      where the vL1D is active.
-    VL1_L2 Rd: The number of read requests for a vL1D cache line that were not satisfied
-      by the vL1D and must be retrieved from the to the L2 Cache per normalization
-      unit.
-    VL1_L2 Wr: The number of write requests to a vL1D cache line that were sent through
-      the vL1D to the L2 cache, per normalization unit.
-    VL1_L2 Atomic: The number of atomic requests that are sent through the vL1D to
-      the L2 cache, per normalization unit. This includes requests for atomics with,
-      and without return.
-    sL1D Rd: The total number of requests, of any size or type, made to the sL1D per
-      normalization unit.
-    sL1D Hit: The total number of sL1D requests that hit on a previously loaded cache
-      line, per normalization unit.
-    sL1D_L2 Rd: The total number of read requests from sL1D to the L2, per normalization
-      unit.
-    sL1D_L2 Wr: The total number of write requests from sL1D to the L2, per normalization
-      unit. Typically unused on current CDNA accelerators.
-    sL1D_L2 Atomic: The total number of atomic requests from sL1D to the L2, per normalization
-      unit. Typically unused on current CDNA accelerators.
-    IL1 Fetch: The total number of requests made to the L1I per normalization-unit.
-    IL1 Hit: The percent of L1I requests that hit on a previously loaded line the
-      cache. Calculated as the ratio of the number of L1I requests that hit over the
-      number of all L1I requests.
-    IL1 Lat: The average number of cycles spent to fetch instructions to a CU.
-    IL1_L2 Rd: The total number of requests across the L1I - L2 interface per normalization-unit.
-    L2 Rd: The total number of read requests to the L2 from all clients.
-    L2 Wr: The total number of write requests to the L2 from all clients.
-    L2 Atomic: The total number of atomic requests (with and without return) to the
-      L2 from all clients.
-    L2 Hit: The ratio of the number of L2 cache line requests that hit in the L2 cache
-      over the total number of incoming cache line requests to the L2 cache.
-    L2 Rd Lat: Calculated as the average number of cycles that the vL1D cache took
-      to issue and receive read requests from the L2 Cache. This number also includes
-      requests for atomics with return values.
-    L2 Wr Lat: Calculated as the average number of cycles that the vL1D cache took
-      to issue and receive acknowledgement of a write request to the L2 Cache. This
-      number also includes requests for atomics without return values.
-    Fabric_L2 Rd: Number of L2 cache - Infinity Fabric read requests (either 32-byte
-      or 64-byte) summed over TCC instances per normalization unit.
-    Fabric_L2 Wr: Number of L2 cache - Infinity Fabric write requests (either 32-byte
-      or 64-byte) summed over TCC instances per normalization unit.
-    Fabric_L2 Atomic: Number of L2 cache - Infinity Fabric write requests (either
-      32-byte or 64-byte) that are actually atomic requests summed over TCC instances
-      per normalization unit.
-    Fabric Rd Lat: The time-averaged number of cycles read requests spent in Infinity
-      Fabric before data was returned to the L2.
-    Fabric Wr Lat: The time-averaged number of cycles write requests spent in Infinity
-      Fabric before a completion acknowledgement was returned to the L2.
-    Fabric Atomic Lat: The time-averaged number of cycles atomic requests spent in
-      Infinity Fabric before a completion acknowledgement (atomic without return value)
-      or data (atomic with return value) was returned to the L2.
-    HBM Rd: The total number of L2 requests to Infinity Fabric to read 32B or 64B
-      of data from the accelerator's local HBM, per normalization unit.
-    HBM Wr: 'The total number of L2 requests to Infinity Fabric to write or atomically
-      update 32B or 64B of data in the accelerator''s local HBM, per normalization
-      unit. '
   data source:
   - metric_table:
       id: 301
@@ -252,13 +136,13 @@ Panel Config:
           value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0)
         Fabric Rd Lat:
           value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-            != 0) else  0)), 0)
+            != 0) else 0)), 0)
         Fabric Wr Lat:
           value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-            != 0) else  0)), 0)
+            != 0) else 0)), 0)
         Fabric Atomic Lat:
           value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
-            != 0) else  0)), 0)
+            != 0) else 0)), 0)
         HBM Rd:
           value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0)
         HBM Wr:
@@ -266,3 +150,123 @@ Panel Config:
       comparable: false
       cli_style: mem_chart
       tui_style: mem_chart
+  metrics_description:
+    Wavefront Occupancy: Wavefronts per active CU.
+    Wave Life: Average number of cycles executing a wave.
+    SALU: Total Number of SALU (Scalar ALU) instructions issued per normalization
+      unit.
+    SMEM: Total number of SMEM (Scalar Memory Read) instructions issued per normalization
+      unit.
+    VALU: The number of VALU (Vector ALU) instructions issued per normalization unit.
+    MFMA: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued per
+      normalization unit.
+    VMEM: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch
+      memory) per normalization unit.
+    LDS: The total number of LDS instructions (including, but not limited to, read/write/atomics
+      and HIP's __shfl instructions) executed per normalization unit.
+    GWS: Total number of GDS (global data sync) instructions issued per normalization
+      unit.
+    BR: Total number of BRANCH instructions issued per normalization unit.
+    Active CUs: Total number of active compute units (CUs) on the accelerator during
+      the kernel execution.
+    Num CUs: Total number of compute units (CUs) on the accelerator.
+    VGPR: |-
+      The number of architected vector general-purpose registers allocated
+      for the kernel, see VALU. Note: this may not exactly match the number of VGPRs
+      requested by the compiler due to allocation granularity.
+    SGPR: |-
+      The number of scalar general-purpose registers allocated for the kernel,
+      see SALU. Note: this may not exactly match the number of SGPRs requested by
+      the compiler due to allocation granularity.
+    LDS Allocation: |-
+      The number of bytes of LDS memory (or, shared memory) allocated for
+      this kernel. Note: This may also be larger than what was requested at compile
+      time due to both allocation granularity and dynamic per-dispatch LDS allocations.
+    Scratch Allocation: The number of bytes of scratch memory requested per work-item
+      for this kernel. Scratch memory is used for stack memory on the accelerator,
+      as well as for register spills and restores.
+    Wavefronts: The total number of wavefronts, summed over all workgroups, forming
+      this kernel launch.
+    Workgroups: The total number of workgroups forming this kernel launch.
+    LDS Req: The total number of LDS instructions (including, but not limited to,
+      read/write/atomics and HIP's __shfl instructions) executed per normalization
+      unit.
+    LDS Util: Indicates what percent of the kernel's duration the LDS was actively
+      executing instructions (including, but not limited to, load, store, atomic and
+      HIP's __shfl operations). Calculated as the ratio of the total number of cycles
+      LDS was active over the total CU cycles.
+    LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return
+      / acknowledgment) required for an LDS instruction to complete.
+    VL1 Rd: The total number of incoming read requests from the address processing
+      unit after coalescing per normalization unit.
+    VL1 Wr: The total number of incoming write requests from the address processing
+      unit after coalescing per normalization unit.
+    VL1 Atomic: The total number of incoming atomic requests from the address processing
+      unit after coalescing per normalization unit.
+    VL1 Hit: The ratio of the number of vL1D cache line requests that hit in vL1D
+      cache over the total number of cache line requests to the vL1D Cache RAM.
+    VL1 Lat: Calculated as the average number of cycles that a vL1D cache line request
+      spent in the vL1D cache pipeline.
+    VL1 Coalesce: Indicates how well memory instructions were coalesced by the address
+      processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated
+      as the average number of thread-requests generated per instruction divided by
+      the ideal number of thread-requests per instruction.
+    VL1 Stall: The ratio of the number of cycles where the vL1D is stalled waiting
+      to issue a request for data to the L2 cache divided by the number of cycles
+      where the vL1D is active.
+    VL1_L2 Rd: The number of read requests for a vL1D cache line that were not satisfied
+      by the vL1D and must be retrieved from the L2 Cache per normalization
+      unit.
+    VL1_L2 Wr: The number of write requests to a vL1D cache line that were sent through
+      the vL1D to the L2 cache, per normalization unit.
+    VL1_L2 Atomic: The number of atomic requests that are sent through the vL1D to
+      the L2 cache, per normalization unit. This includes requests for atomics with,
+      and without return.
+    sL1D Rd: The total number of requests, of any size or type, made to the sL1D per
+      normalization unit.
+    sL1D Hit: The total number of sL1D requests that hit on a previously loaded cache
+      line, per normalization unit.
+    sL1D_L2 Rd: The total number of read requests from sL1D to the L2, per normalization
+      unit.
+    sL1D_L2 Wr: The total number of write requests from sL1D to the L2, per normalization
+      unit. Typically unused on current CDNA accelerators.
+    sL1D_L2 Atomic: The total number of atomic requests from sL1D to the L2, per normalization
+      unit. Typically unused on current CDNA accelerators.
+    IL1 Fetch: The total number of requests made to the L1I per normalization-unit.
+    IL1 Hit: The percent of L1I requests that hit on a previously loaded line in the
+      cache. Calculated as the ratio of the number of L1I requests that hit over the
+      number of all L1I requests.
+    IL1 Lat: The average number of cycles spent to fetch instructions to a CU.
+    IL1_L2 Rd: The total number of requests across the L1I - L2 interface per normalization-unit.
+    L2 Rd: The total number of read requests to the L2 from all clients.
+    L2 Wr: The total number of write requests to the L2 from all clients.
+    L2 Atomic: The total number of atomic requests (with and without return) to the
+      L2 from all clients.
+    L2 Hit: The ratio of the number of L2 cache line requests that hit in the L2 cache
+      over the total number of incoming cache line requests to the L2 cache.
+    L2 Rd Lat: Calculated as the average number of cycles that the vL1D cache took
+      to issue and receive read requests from the L2 Cache. This number also includes
+      requests for atomics with return values.
+    L2 Wr Lat: Calculated as the average number of cycles that the vL1D cache took
+      to issue and receive acknowledgement of a write request to the L2 Cache. This
+      number also includes requests for atomics without return values.
+    Fabric_L2 Rd: Number of L2 cache - Infinity Fabric read requests (either 32-byte
+      or 64-byte) summed over TCC instances per normalization unit.
+    Fabric_L2 Wr: Number of L2 cache - Infinity Fabric write requests (either 32-byte
+      or 64-byte) summed over TCC instances per normalization unit.
+    Fabric_L2 Atomic: Number of L2 cache - Infinity Fabric write requests (either
+      32-byte or 64-byte) that are actually atomic requests summed over TCC instances
+      per normalization unit.
+    Fabric Rd Lat: The time-averaged number of cycles read requests spent in Infinity
+      Fabric before data was returned to the L2.
+    Fabric Wr Lat: The time-averaged number of cycles write requests spent in Infinity
+      Fabric before a completion acknowledgement was returned to the L2.
+    Fabric Atomic Lat: The time-averaged number of cycles atomic requests spent in
+      Infinity Fabric before a completion acknowledgement (atomic without return value)
+      or data (atomic with return value) was returned to the L2.
+    HBM Rd: The total number of L2 requests to Infinity Fabric to read 32B or 64B
+      of data from the accelerator's local HBM, per normalization unit.
+    HBM Wr: |-
+      The total number of L2 requests to Infinity Fabric to write or atomically
+      update 32B or 64B of data in the accelerator's local HBM, per normalization
+      unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0400_roofline.yaml
index c8ac74a94b..6cf7344c4f 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0400_roofline.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0400_roofline.yaml
@@ -2,85 +2,6 @@
 Panel Config:
   id: 400
   title: Roofline
-  metrics_description:
-    VALU FLOPs (F16): 'The total 16-bit floating-point operations executed per second
-      on the VALU. This is presented with the value of the peak empirical F16 FLOPs
-      achievable on the specific accelerator. Note: this does not include any F16
-      operations from MFMA instructions.'
-    VALU FLOPs (F32): 'The total 32-bit floating-point operations executed per second
-      on the VALU. This is presented with the value of the peak empirical F32 FLOPs
-      achievable on the specific accelerator. Note: this does not include any F32
-      operations from MFMA instructions.'
-    VALU FLOPs (F64): 'The total 64-bit floating-point operations executed per second
-      on the VALU. This is presented with the value of the peak empirical F64 FLOPs
-      achievable on the specific accelerator. Note: this does not include any F64
-      operations from MFMA instructions.'
-    MFMA FLOPs (F8): The total number of 8-bit brain floating point MFMA operations
-      executed per second. This does not include any 16-bit brain floating point operations
-      from VALU instructions. The peak empirically measured F8 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison. It is supported
-      on AMD Instinct MI300 series and later only.
-    MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations
-      executed per second. Note: this does not include any 16-bit brain floating point
-      operations from VALU instructions. The peak empirically measured BF16 MFMA operations
-      achievable on the specific accelerator is displayed alongside for comparison.'
-    MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed
-      per second. Note: this does not include any 16-bit floating point operations
-      from VALU instructions. The peak empirically measured F16 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison.'
-    MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed
-      per second. Note: this does not include any 32-bit floating point operations
-      from VALU instructions. The peak empirically measured F32 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison.'
-    MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed
-      per second. Note: this does not include any 64-bit floating point operations
-      from VALU instructions. The peak empirically measured F64 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison.'
-    MFMA FLOPs (F6F4): 'The total number of 4-bit and 6-bit floating point MFMA operations
-      executed per second. Note: this does not include any floating point operations
-      from VALU instructions. The peak empirically measured F6F4 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison. It is supported
-      on AMD Instinct MI350 series (gfx950) and later only.'
-    MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed
-      per second. Note: this does not include any 8-bit integer operations from VALU
-      instructions. The peak empirically measured INT8 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison.'
-    HBM Bandwidth: The total number of bytes read from and written to High-Bandwidth
-      Memory (HBM) per second. The peak empirically measured bandwidth achievable
-      on the specific accelerator is displayed alongside for comparison.
-    L2 Cache Bandwidth: The number of bytes looked up in the L2 cache per unit time.
-      The number of bytes is calculated as the number of cache lines requested multiplied
-      by the cache line size. This value does not consider partial requests, so e.g.,
-      if only a single value is requested in a cache line, the data movement will
-      still be counted as a full cache line. The peak empirically measured bandwidth
-      achievable on the specific accelerator is displayed alongside for comparison.
-    L1 Cache Bandwidth: The number of bytes looked up in the vL1D cache as a result
-      of VMEM instructions per unit time. The number of bytes is calculated as the
-      number of cache lines requested multiplied by the cache line size. This value
-      does not consider partial requests, so e.g., if only a single value is requested
-      in a cache line, the data movement will still be counted as a full cache line.
-      The peak empirically measured bandwidth achievable on the specific accelerator
-      is displayed alongside for comparison.
-    LDS Bandwidth: Indicates the maximum amount of bytes that could have been loaded
-      from, stored to, or atomically updated in the LDS per unit time (see LDS Bandwidth
-      example for more detail). The peak empirically measured LDS bandwidth achievable
-      on the specific accelerator is displayed alongside for comparison.
-    AI L1: The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
-      of total floating-point operations (FLOPs) to total bytes transferred between
-      the L1 cache and the processing units. This value is used as the x-coordinate
-      for the L1 roofline.
-    AI L2: The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
-      of total floating-point operations (FLOPs) to total bytes transferred between
-      the L2 cache and the L1 cache. This value is used as the x-coordinate for the
-      L2 roofline.
-    AI HBM: The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
-      It is the ratio of total floating-point operations (FLOPs) to total bytes transferred
-      between HBM and the L2 cache. This value is used as the x-coordinate for the
-      HBM roofline.
-    Performance (GFLOPs): The overall achieved performance, measured in GigaFLOPs
-      per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
-      operations divided by the total execution time. This value is used as the y-coordinate
-      for the kernel's point on the Roofline plot.
   data source:
   - metric_table:
       id: 401
@@ -212,3 +133,86 @@ Panel Config:
             512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) ) / (SUM(End_Timestamp - Start_Timestamp)
             / 1e9) ) / 1e9
           unit: GFLOP/s
+  metrics_description:
+    VALU FLOPs (F16): |-
+      The total 16-bit floating-point operations executed per second on the VALU.
+      This is presented with the value of the peak empirical F16 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F16 operations
+      from MFMA instructions.
+    VALU FLOPs (F32): |-
+      The total 32-bit floating-point operations executed per second on the VALU.
+      This is presented with the value of the peak empirical F32 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F32 operations
+      from MFMA instructions.
+    VALU FLOPs (F64): |-
+      The total 64-bit floating-point operations executed per second on the VALU.
+      This is presented with the value of the peak empirical F64 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F64 operations
+      from MFMA instructions.
+    MFMA FLOPs (BF16): |-
+      The total number of 16-bit brain floating point MFMA operations executed
+      per second. Note: this does not include any 16-bit brain floating point
+      operations from VALU instructions. The peak empirically measured BF16 MFMA
+      operations achievable on the specific accelerator is displayed alongside
+      for comparison.
+    MFMA FLOPs (F16): |-
+      The total number of 16-bit floating point MFMA operations executed per
+      second. Note: this does not include any 16-bit floating point operations from
+      VALU instructions. The peak empirically measured F16 MFMA operations
+      achievable on the specific accelerator is displayed alongside for comparison.
+    MFMA FLOPs (F32): |-
+      The total number of 32-bit floating point MFMA operations executed per
+      second. Note: this does not include any 32-bit floating point operations from
+      VALU instructions. The peak empirically measured F32 MFMA operations
+      achievable on the specific accelerator is displayed alongside for comparison.
+    MFMA FLOPs (F64): |-
+      The total number of 64-bit floating point MFMA operations executed per
+      second. Note: this does not include any 64-bit floating point operations from
+      VALU instructions. The peak empirically measured F64 MFMA operations
+      achievable on the specific accelerator is displayed alongside for comparison.
+    MFMA IOPs (Int8): |-
+      The total number of 8-bit integer MFMA operations executed per second.
+      Note: this does not include any 8-bit integer operations from VALU instructions.
+      The peak empirically measured INT8 MFMA operations achievable on the specific
+      accelerator is displayed alongside for comparison.
+    HBM Bandwidth: |-
+      The total number of bytes read from and written to High-Bandwidth
+      Memory (HBM) per second. The peak empirically measured bandwidth achievable
+      on the specific accelerator is displayed alongside for comparison.
+    L2 Cache Bandwidth: The number of bytes looked up in the L2 cache per unit time.
+      The number of bytes is calculated as the number of cache lines requested multiplied
+      by the cache line size. This value does not consider partial requests, so e.g.,
+      if only a single value is requested in a cache line, the data movement will
+      still be counted as a full cache line. The peak empirically measured bandwidth
+      achievable on the specific accelerator is displayed alongside for comparison.
+    L1 Cache Bandwidth: The number of bytes looked up in the vL1D cache as a result
+      of VMEM instructions per unit time. The number of bytes is calculated as the
+      number of cache lines requested multiplied by the cache line size. This value
+      does not consider partial requests, so e.g., if only a single value is requested
+      in a cache line, the data movement will still be counted as a full cache line.
+      The peak empirically measured bandwidth achievable on the specific accelerator
+      is displayed alongside for comparison.
+    LDS Bandwidth: Indicates the maximum amount of bytes that could have been loaded
+      from, stored to, or atomically updated in the LDS per unit time (see LDS Bandwidth
+      example for more detail). The peak empirically measured LDS bandwidth achievable
+      on the specific accelerator is displayed alongside for comparison.
+    AI L1: |-
+      The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
+      of total floating-point operations (FLOPs) to total bytes transferred between
+      the L1 cache and the processing units. This value is used as the x-coordinate
+      for the L1 roofline.
+    AI L2: |-
+      The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
+      of total floating-point operations (FLOPs) to total bytes transferred between
+      the L2 cache and the L1 cache. This value is used as the x-coordinate for
+      the L2 roofline.
+    AI HBM: |-
+      The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
+      It is the ratio of total floating-point operations (FLOPs) to total bytes
+      transferred between HBM and the L2 cache. This value is used as the x-coordinate
+      for the HBM roofline.
+    Performance (GFLOPs): |-
+      The overall achieved performance, measured in GigaFLOPs
+      per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
+      operations divided by the total execution time. This value is used as the y-coordinate
+      for the kernel's point on the Roofline plot.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0500_command_processor_cpc_cpf.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0500_command_processor_cpc_cpf.yaml
index c4d2cabf52..118ce18331 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0500_command_processor_cpc_cpf.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0500_command_processor_cpc_cpf.yaml
@@ -2,30 +2,6 @@
 Panel Config:
   id: 500
   title: Command Processor (CPC/CPF)
-  metrics_description:
-    CPF Utilization: Percent of total cycles where the CPF was busy actively doing
-      any work. The ratio of CPF busy cycles over total cycles counted by the CPF.
-    CPF Stall: Percent of CPF busy cycles where the CPF was stalled for any reason.
-    CPF-L2 Utilization: Percent of total cycles counted by the CPF-L2 interface where
-      the CPF-L2 interface was active doing any work. The ratio of CPF-L2 busy cycles
-      over total cycles counted by the CPF-L2.
-    CPF-L2 Stall: Percent of CPF-L2 L2 busy cycles where the CPF-L2 interface was
-      stalled for any reason.
-    CPF-UTCL1 Stall: Percent of CPF busy cycles where the CPF was stalled by address
-      translation.
-    CPC Utilization: Percent of total cycles where the CPC was busy actively doing
-      any work. The ratio of CPC busy cycles over total cycles counted by the CPC.
-    CPC Stall Rate: Percent of CPC busy cycles where the CPC was stalled for any reason.
-    CPC Packet Decoding Utilization: Percent of CPC busy cycles spent decoding commands
-      for processing.
-    CPC-Workgroup Manager Utilization: Percent of CPC busy cycles spent dispatching
-      workgroups to the workgroup manager.
-    CPC-L2 Utilization: Percent of total cycles counted by the CPC-L2 interface where
-      the CPC-L2 interface was active doing any work.
-    CPC-UTCL1 Stall: Percent of CPC busy cycles where the CPC was stalled by address
-      translation
-    CPC-UTCL2 Utilization: 'Percent of total cycles counted by the CPC''s L2 address
-      translation interface where the CPC was busy doing address translation work.  '
   data source:
   - metric_table:
       id: 501
@@ -143,3 +119,28 @@ Panel Config:
           max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
             if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
           unit: pct
+  metrics_description:
+    CPF Utilization: Percent of total cycles where the CPF was busy actively doing
+      any work. The ratio of CPF busy cycles over total cycles counted by the CPF.
+    CPF Stall: Percent of CPF busy cycles where the CPF was stalled for any reason.
+    CPF-L2 Utilization: Percent of total cycles counted by the CPF-L2 interface where
+      the CPF-L2 interface was active doing any work. The ratio of CPF-L2 busy cycles
+      over total cycles counted by the CPF-L2.
+    CPF-L2 Stall: Percent of CPF-L2 busy cycles where the CPF-L2 interface was
+      stalled for any reason.
+    CPF-UTCL1 Stall: Percent of CPF busy cycles where the CPF was stalled by address
+      translation.
+    CPC Utilization: Percent of total cycles where the CPC was busy actively doing
+      any work. The ratio of CPC busy cycles over total cycles counted by the CPC.
+    CPC Stall Rate: Percent of CPC busy cycles where the CPC was stalled for any reason.
+    CPC Packet Decoding Utilization: Percent of CPC busy cycles spent decoding commands
+      for processing.
+    CPC-Workgroup Manager Utilization: Percent of CPC busy cycles spent dispatching
+      workgroups to the workgroup manager.
+    CPC-L2 Utilization: Percent of total cycles counted by the CPC-L2 interface where
+      the CPC-L2 interface was active doing any work.
+    CPC-UTCL1 Stall: Percent of CPC busy cycles where the CPC was stalled by address
+      translation.
+    CPC-UTCL2 Utilization: |-
+      Percent of total cycles counted by the CPC's L2 address translation
+      interface where the CPC was busy doing address translation work.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0600_workgroup_manager_spi.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0600_workgroup_manager_spi.yaml
index f6bf13d8b8..eb9845aa82 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0600_workgroup_manager_spi.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0600_workgroup_manager_spi.yaml
@@ -2,61 +2,6 @@
 Panel Config:
   id: 600
   title: Workgroup Manager (SPI)
-  metrics_description:
-    Accelerator Utilization: The percent of cycles in the kernel where the accelerator
-      was actively doing any work.
-    Scheduler-Pipe Utilization: The percent of total scheduler-pipe cycles in the
-      kernel where the scheduler-pipes were actively doing any work.
-    Workgroup Manager Utilization: The percent of cycles in the kernel where the workgroup
-      manager was actively doing any work.
-    Shader Engine Utilization: The percent of total shader engine cycles in the kernel
-      where any CU in a shader-engine was actively doing any work, normalized over
-      all shader-engines. Low values (e.g., << 100%) indicate that the accelerator
-      was not fully saturated by the kernel, or a potential load-imbalance issue.
-    SIMD Utilization: The percent of total SIMD cycles in the kernel where any SIMD
-      on a CU was actively doing any work, summed over all CUs. Low values (less than
-      100%) indicate that the accelerator was not fully saturated by the kernel, or
-      a potential load-imbalance issue.
-    Dispatched Workgroups: The total number of workgroups forming this kernel launch.
-    Dispatched Wavefronts: The total number of wavefronts, summed over all workgroups,
-      forming this kernel launch.
-    VGPR Writes: The average number of cycles spent initializing VGPRs at wave creation.
-    SGPR Writes: The average number of cycles spent initializing SGPRs at wave creation.
-    Not-scheduled Rate (Workgroup Manager): The percent of total scheduler-pipe cycles
-      in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck
-      within the workgroup manager rather than a lack of a CU or SIMD with sufficient
-      resources.
-    Not-scheduled Rate (Scheduler-Pipe): 'The percent of total scheduler-pipe cycles
-      in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck
-      within the scheduler-pipes rather than a lack of a CU or SIMD with sufficient
-      resources. '
-    Scheduler-Pipe Stall Rate: The percent of total scheduler-pipe cycles in the kernel
-      where a workgroup could not be scheduled to a CU due to occupancy limitations
-      (like a lack of a CU or SIMD with sufficient resources).
-    Scratch Stall Rate: The percent of total shader-engine cycles in the kernel where
-      a workgroup could not be scheduled to a CU due to lack of private (a.k.a., scratch)
-      memory slots. While this can reach up to 100%, note that the actual occupancy
-      limitations on a kernel using private memory are typically quite small (for
-      example, less than 1% of the total number of waves that can be scheduled to
-      an accelerator).
-    Insufficient SIMD Waveslots: The percent of total SIMD cycles in the kernel where
-      a workgroup could not be scheduled to a SIMD due to lack of available waveslots.
-    Insufficient SIMD VGPRs: The percent of total SIMD cycles in the kernel where
-      a workgroup could not be scheduled to a SIMD due to lack of available VGPRs.
-    Insufficient SIMD SGPRs: The percent of total SIMD cycles in the kernel where
-      a workgroup could not be scheduled to a SIMD due to lack of available SGPRs.
-    Insufficient CU LDS: The percent of total CU cycles in the kernel where a workgroup
-      could not be scheduled to a CU due to lack of available LDS.
-    Insufficient CU Barriers: The percent of total CU cycles in the kernel where a
-      workgroup could not be scheduled to a CU due to lack of available barriers.
-    Reached CU Workgroup Limit: The percent of total CU cycles in the kernel where
-      a workgroup could not be scheduled to a CU due to limits within the workgroup
-      manager. This is expected to be always be zero on CDNA2 or newer accelerators
-      (and small for previous accelerators).
-    Reached CU Wavefront Limit: The percent of total CU cycles in the kernel where
-      a wavefront could not be scheduled to a CU due to limits within the workgroup
-      manager. This is expected to be always be zero on CDNA2 or newer accelerators
-      (and small for previous accelerators).
   data source:
   - metric_table:
       id: 601
@@ -199,3 +144,58 @@ Panel Config:
           min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
           max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
           unit: Pct
+  metrics_description:
+    Accelerator Utilization: The percent of cycles in the kernel where the accelerator
+      was actively doing any work.
+    Scheduler-Pipe Utilization: The percent of total scheduler-pipe cycles in the
+      kernel where the scheduler-pipes were actively doing any work.
+    Workgroup Manager Utilization: The percent of cycles in the kernel where the workgroup
+      manager was actively doing any work.
+    Shader Engine Utilization: The percent of total shader engine cycles in the kernel
+      where any CU in a shader-engine was actively doing any work, normalized over
+      all shader-engines. Low values (e.g., << 100%) indicate that the accelerator
+      was not fully saturated by the kernel, or a potential load-imbalance issue.
+    SIMD Utilization: The percent of total SIMD cycles in the kernel where any SIMD
+      on a CU was actively doing any work, summed over all CUs. Low values (less than
+      100%) indicate that the accelerator was not fully saturated by the kernel, or
+      a potential load-imbalance issue.
+    Dispatched Workgroups: The total number of workgroups forming this kernel launch.
+    Dispatched Wavefronts: The total number of wavefronts, summed over all workgroups,
+      forming this kernel launch.
+    VGPR Writes: The average number of cycles spent initializing VGPRs at wave creation.
+    SGPR Writes: The average number of cycles spent initializing SGPRs at wave creation.
+    Not-scheduled Rate (Workgroup Manager): The percent of total scheduler-pipe cycles
+      in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck
+      within the workgroup manager rather than a lack of a CU or SIMD with sufficient
+      resources.
+    Not-scheduled Rate (Scheduler-Pipe): |-
+      The percent of total scheduler-pipe cycles in the kernel where a workgroup
+      could not be scheduled to a CU due to a bottleneck within the scheduler-pipes
+      rather than a lack of a CU or SIMD with sufficient resources.
+    Scheduler-Pipe Stall Rate: The percent of total scheduler-pipe cycles in the kernel
+      where a workgroup could not be scheduled to a CU due to occupancy limitations
+      (like a lack of a CU or SIMD with sufficient resources).
+    Scratch Stall Rate: The percent of total shader-engine cycles in the kernel where
+      a workgroup could not be scheduled to a CU due to lack of private (a.k.a., scratch)
+      memory slots. While this can reach up to 100%, note that the actual occupancy
+      limitations on a kernel using private memory are typically quite small (for
+      example, less than 1% of the total number of waves that can be scheduled to
+      an accelerator).
+    Insufficient SIMD Waveslots: The percent of total SIMD cycles in the kernel where
+      a workgroup could not be scheduled to a SIMD due to lack of available waveslots.
+    Insufficient SIMD VGPRs: The percent of total SIMD cycles in the kernel where
+      a workgroup could not be scheduled to a SIMD due to lack of available VGPRs.
+    Insufficient SIMD SGPRs: The percent of total SIMD cycles in the kernel where
+      a workgroup could not be scheduled to a SIMD due to lack of available SGPRs.
+    Insufficient CU LDS: The percent of total CU cycles in the kernel where a workgroup
+      could not be scheduled to a CU due to lack of available LDS.
+    Insufficient CU Barriers: The percent of total CU cycles in the kernel where a
+      workgroup could not be scheduled to a CU due to lack of available barriers.
+    Reached CU Workgroup Limit: The percent of total CU cycles in the kernel where
+      a workgroup could not be scheduled to a CU due to limits within the workgroup
+      manager. This is expected to always be zero on CDNA2 or newer accelerators
+      (and small for previous accelerators).
+    Reached CU Wavefront Limit: The percent of total CU cycles in the kernel where
+      a wavefront could not be scheduled to a CU due to limits within the workgroup
+      manager. This is expected to always be zero on CDNA2 or newer accelerators
+      (and small for previous accelerators).
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0700_wavefront.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0700_wavefront.yaml
index 5e332c0b8f..e9e9407cfc 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0700_wavefront.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0700_wavefront.yaml
@@ -2,63 +2,6 @@
 Panel Config:
   id: 700
   title: Wavefront
-  metrics_description:
-    Grid Size: The total number of work-items (or, threads) launched as a part of
-      the kernel dispatch. In HIP, this is equivalent to the total grid size multiplied
-      by the total workgroup (or, block) size.
-    Workgroup Size: The total number of work-items (or, threads) in each workgroup
-      (or, block) launched as part of the kernel dispatch. In HIP, this is equivalent
-      to the total block size.
-    Total Wavefronts: "The total number of wavefronts launched as part of the kernel\
-      \ dispatch. On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs,\
-      \ the wavefront size is always 64 work-items. Thus, the total number of wavefronts\
-      \ should be equivalent to the ceiling of grid size divided by 64."
-    Saved Wavefronts: The total number of wavefronts saved at a context-save.
-    Restored Wavefronts: The total number of wavefronts restored from a context-save.
-    VGPRs: 'The number of architected vector general-purpose registers allocated for
-      the kernel, see VALU. Note: this may not exactly match the number of VGPRs requested
-      by the compiler due to allocation granularity.'
-    AGPRs: 'The number of accumulation vector general-purpose registers allocated
-      for the kernel, see AGPRs. Note: this may not exactly match the number of AGPRs
-      requested by the compiler due to allocation granularity.'
-    SGPRs: 'The number of scalar general-purpose registers allocated for the kernel,
-      see SALU. Note: this may not exactly match the number of SGPRs requested by
-      the compiler due to allocation granularity.'
-    LDS Allocation: 'The number of bytes of LDS memory (or, shared memory) allocated
-      for this kernel. Note: This may also be larger than what was requested at compile
-      time due to both allocation granularity and dynamic per-dispatch LDS allocations.'
-    Scratch Allocation: The number of bytes of scratch memory requested per work-item
-      for this kernel. Scratch memory is used for stack memory on the accelerator,
-      as well as for register spills and restores.
-    Kernel Time: The total duration of the executed kernel.
-    Kernel Time (Cycles): The total duration of the executed kernel in cycles.
-    Instructions per wavefront: The average number of instructions (of all types)
-      executed per wavefront. This is averaged over all wavefronts in a kernel dispatch.
-    Wave Cycles: The number of cycles a wavefront in the kernel dispatch spent resident
-      on a compute unit per normalization unit. This is averaged over all wavefronts
-      in a kernel dispatch.
-    Dependency Wait Cycles: The number of cycles a wavefront in the kernel dispatch
-      spent resident on a compute unit per normalization unit. This is averaged over
-      all wavefronts in a kernel dispatch.
-    Issue Wait Cycles: The number of cycles a wavefront in the kernel dispatch was
-      unable to issue an instruction for any reason (e.g., execution pipe back-pressure,
-      arbitration loss, etc.) per normalization unit. This counter is incremented
-      at every cycle by all wavefronts on a CU unable to issue an instruction. As
-      such, it is most useful to get a sense of how waves were spending their time,
-      rather than identification of a precise limiter because another wave could be
-      actively executing while a wave is issue stalled. The sum of this metric, Dependency
-      Wait Cycles and Active Cycles should be equal to the total Wave Cycles metric.
-    Active Cycles: The average number of cycles a wavefront in the kernel dispatch
-      was actively executing instructions per normalization unit. This measurement
-      is made on a per-wavefront basis, and may include cycles that another wavefront
-      spent actively executing (on another execution unit, for example) or was stalled.
-      As such, it is most useful to get a sense of how waves were spending their time,
-      rather than identification of a precise limiter. The sum of this metric, Issue
-      Wait Cycles and Active Wait Cycles should be equal to the total Wave Cycles
-      metric.
-    Wavefront Occupancy: 'The time-averaged number of wavefronts resident on the accelerator
-      over the lifetime of the kernel. Note: this metric may be inaccurate for short-running
-      kernels (less than 1ms).'
   data source:
   - metric_table:
       id: 701
@@ -171,3 +114,66 @@ Panel Config:
           max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
           unit: Wavefronts
           coll_level: SQ_LEVEL_WAVES
+  metrics_description:
+    Grid Size: The total number of work-items (or, threads) launched as a part of
+      the kernel dispatch. In HIP, this is equivalent to the total grid size multiplied
+      by the total workgroup (or, block) size.
+    Workgroup Size: The total number of work-items (or, threads) in each workgroup
+      (or, block) launched as part of the kernel dispatch. In HIP, this is equivalent
+      to the total block size.
+    Total Wavefronts: |-
+      The total number of wavefronts launched as part of the kernel dispatch.
+      On AMD Instinct™ CDNA™ accelerators and GCN™ GPUs, the wavefront
+      size is always 64 work-items. Thus, the total number of wavefronts should
+      be equivalent to the ceiling of grid size divided by 64.
+    Saved Wavefronts: The total number of wavefronts saved at a context-save.
+    Restored Wavefronts: The total number of wavefronts restored from a context-save.
+    VGPRs: |-
+      The number of architected vector general-purpose registers allocated
+      for the kernel, see VALU. Note: this may not exactly match the number of VGPRs
+      requested by the compiler due to allocation granularity.
+    AGPRs: |-
+      The number of accumulation vector general-purpose registers allocated
+      for the kernel, see AGPRs. Note: this may not exactly match the number of
+      AGPRs requested by the compiler due to allocation granularity.
+    SGPRs: |-
+      The number of scalar general-purpose registers allocated for the kernel,
+      see SALU. Note: this may not exactly match the number of SGPRs requested by
+      the compiler due to allocation granularity.
+    LDS Allocation: |-
+      The number of bytes of LDS memory (or, shared memory) allocated for
+      this kernel. Note: This may also be larger than what was requested at compile
+      time due to both allocation granularity and dynamic per-dispatch LDS allocations.
+    Scratch Allocation: The number of bytes of scratch memory requested per work-item
+      for this kernel. Scratch memory is used for stack memory on the accelerator,
+      as well as for register spills and restores.
+    Kernel Time: The total duration of the executed kernel.
+    Kernel Time (Cycles): The total duration of the executed kernel in cycles.
+    Instructions per wavefront: The average number of instructions (of all types)
+      executed per wavefront. This is averaged over all wavefronts in a kernel dispatch.
+    Wave Cycles: The number of cycles a wavefront in the kernel dispatch spent resident
+      on a compute unit per normalization unit. This is averaged over all wavefronts
+      in a kernel dispatch.
+    Dependency Wait Cycles: The number of cycles a wavefront in the kernel dispatch
+      stalled waiting on memory of any kind (e.g., instruction fetch, vector or scalar
+      memory) per normalization unit. This is averaged over all wavefronts in a kernel dispatch.
+    Issue Wait Cycles: The number of cycles a wavefront in the kernel dispatch was
+      unable to issue an instruction for any reason (e.g., execution pipe back-pressure,
+      arbitration loss, etc.) per normalization unit. This counter is incremented
+      at every cycle by all wavefronts on a CU unable to issue an instruction. As
+      such, it is most useful to get a sense of how waves were spending their time,
+      rather than identification of a precise limiter because another wave could be
+      actively executing while a wave is issue stalled. The sum of this metric, Dependency
+      Wait Cycles and Active Cycles should be equal to the total Wave Cycles metric.
+    Active Cycles: The average number of cycles a wavefront in the kernel dispatch
+      was actively executing instructions per normalization unit. This measurement
+      is made on a per-wavefront basis, and may include cycles that another wavefront
+      spent actively executing (on another execution unit, for example) or was stalled.
+      As such, it is most useful to get a sense of how waves were spending their time,
+      rather than identification of a precise limiter. The sum of this metric, Issue
+      Wait Cycles and Dependency Wait Cycles should be equal to the total Wave Cycles
+      metric.
+    Wavefront Occupancy: |-
+      The time-averaged number of wavefronts resident on the accelerator over
+      the lifetime of the kernel. Note: this metric may be inaccurate for short-running
+      kernels (less than 1ms).
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1000_compute_units_instruction_mix.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1000_compute_units_instruction_mix.yaml
index b820b8de60..3a6c8c25f0 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1000_compute_units_instruction_mix.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1000_compute_units_instruction_mix.yaml
@@ -2,90 +2,6 @@
 Panel Config:
   id: 1000
   title: Compute Units - Instruction Mix
-  metrics_description:
-    VALU: The total number of vector arithmetic logic unit (VALU) operations issued.
-      These are the workhorses of the compute unit, and are used to execute a wide
-      range of instruction types including floating point operations, non-uniform
-      address calculations, transcendental operations, integer operations, shifts,
-      conditional evaluation, etc.
-    VMEM: The total number of vector memory operations issued. These include most
-      loads, stores and atomic operations and all accesses to generic, global, private
-      and texture memory.
-    LDS: The total number of LDS (also known as shared memory) operations issued.
-      These include loads, stores, atomics, and HIP's __shfl operations.
-    MFMA: The total number of matrix fused multiply-add instructions issued.
-    SALU: The total number of scalar arithmetic logic unit (SALU) operations issued.
-      Typically these are used for address calculations, literal constants, and other
-      operations that are provably uniform across a wavefront. Although scalar memory
-      (SMEM) operations are issued by the SALU, they are counted separately in this
-      section.
-    SMEM: The total number of scalar memory (SMEM) operations issued. These are typically
-      used for loading kernel arguments, base-pointers and loads from HIP's __constant__
-      memory.
-    Branch: The total number of branch operations issued. These typically consist
-      of jump or branch operations and are used to implement control flow.
-    INT32: The total number of instructions operating on 32-bit integer operands issued
-      to the VALU per normalization unit.
-    INT64: The total number of instructions operating on 64-bit integer operands issued
-      to the VALU per normalization unit.
-    F16-ADD: The total number of addition instructions operating on 16-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F16-MUL: The total number of multiplication instructions operating on 16-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F16-FMA: The total number of fused multiply-add instructions operating on 16-bit
-      floating-point operands issued to the VALU per normalization unit.
-    F16-Trans: The total number of transcendental instructions (e.g., sqrt) operating
-      on 16-bit floating-point operands issued to the VALU per normalization unit.
-    F32-ADD: The total number of addition instructions operating on 32-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F32-MUL: The total number of multiplication instructions operating on 32-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F32-FMA: The total number of fused multiply-add instructions operating on 32-bit
-      floating-point operands issued to the VALU per normalization unit.
-    F32-Trans: The total number of transcendental instructions (such as sqrt) operating
-      on 32-bit floating-point operands issued to the VALU per normalization unit.
-    F64-ADD: The total number of addition instructions operating on 64-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F64-MUL: The total number of multiplication instructions operating on 64-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F64-FMA: The total number of fused multiply-add instructions operating on 64-bit
-      floating-point operands issued to the VALU per normalization unit.
-    F64-Trans: The total number of transcendental instructions (such as sqrt) operating
-      on 64-bit floating-point operands issued to the VALU per normalization unit.
-    Conversion: "The total number of type conversion instructions (such as converting\
-      \ data to or from F32\u2194F64) issued to the VALU per normalization unit."
-    Global/Generic Instr: The total number of global & generic memory instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Global/Generic Read: The total number of global & generic memory read instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Global/Generic Write: The total number of global & generic memory write instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Global/Generic Atomic: The total number of global & generic memory atomic (with
-      and without return) instructions executed on all compute units on the accelerator,
-      per normalization unit.
-    Spill/Stack Instr: The total number of spill/stack memory instructions executed
-      on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Read: The total number of spill/stack memory read instructions executed
-      on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Write: The total number of spill/stack memory write instructions executed
-      on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Atomic: The total number of spill/stack memory atomic (with and without
-      return) instructions executed on all compute units on the accelerator, per normalization
-      unit. Typically unused as these memory operations are typically used to implement
-      thread-local storage.
-    MFMA-I8: The total number of 8-bit integer MFMA instructions issued per normalization
-      unit.
-    MFMA-F8: The total number of 8-bit floating point MFMA instructions issued per
-      normalization unit. This is supported in AMD Instinct MI300 series and later
-      only.
-    MFMA-F16: The total number of 16-bit floating point MFMA instructions issued per
-      normalization unit.
-    MFMA-BF16: The total number of 16-bit brain floating point MFMA instructions issued
-      per normalization unit.
-    MFMA-F32: The total number of 32-bit floating-point MFMA instructions issued per
-      normalization unit.
-    MFMA-F64: The total number of 64-bit floating-point MFMA instructions issued per
-      normalization unit.
   data source:
   - metric_table:
       id: 1001
@@ -187,3 +103,35 @@ Panel Config:
         max: Max
         unit: Unit
       metric: {}
+  metrics_description:
+    LDS: The total number of LDS (also known as shared memory) operations issued.
+      These include loads, stores, atomics, and HIP's __shfl operations.
+    SALU: The total number of scalar arithmetic logic unit (SALU) operations issued.
+      Typically these are used for address calculations, literal constants, and other
+      operations that are provably uniform across a wavefront. Although scalar memory
+      (SMEM) operations are issued by the SALU, they are counted separately in this
+      section.
+    SMEM: The total number of scalar memory (SMEM) operations issued. These are typically
+      used for loading kernel arguments, base-pointers and loads from HIP's __constant__
+      memory.
+    Branch: The total number of branch operations issued. These typically consist
+      of jump or branch operations and are used to implement control flow.
+    Global/Generic Instr: The total number of global & generic memory instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Global/Generic Read: The total number of global & generic memory read instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Global/Generic Write: The total number of global & generic memory write instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Global/Generic Atomic: The total number of global & generic memory atomic (with
+      and without return) instructions executed on all compute units on the accelerator,
+      per normalization unit.
+    Spill/Stack Instr: The total number of spill/stack memory instructions executed
+      on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Read: The total number of spill/stack memory read instructions executed
+      on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Write: The total number of spill/stack memory write instructions executed
+      on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Atomic: The total number of spill/stack memory atomic (with and without
+      return) instructions executed on all compute units on the accelerator, per normalization
+      unit. Typically unused, as these memory operations are generally used to implement
+      thread-local storage.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1100_compute_units_compute_pipeline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1100_compute_units_compute_pipeline.yaml
index 9dd3dc97c4..7e4f357916 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1100_compute_units_compute_pipeline.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1100_compute_units_compute_pipeline.yaml
@@ -2,84 +2,6 @@
 Panel Config:
   id: 1100
   title: Compute Units - Compute Pipeline
-  metrics_description:
-    VALU FLOPs: 'The total floating-point operations executed per second on the VALU.
-      This is also presented as a percent of the peak theoretical FLOPs achievable
-      on the specific accelerator. Note: this does not include any floating-point
-      operations from MFMA instructions.'
-    VALU IOPs: 'The total integer operations executed per second on the VALU. This
-      is also presented as a percent of the peak theoretical IOPs achievable on the
-      specific accelerator. Note: this does not include any integer operations from
-      MFMA instructions.'
-    MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations
-      executed per second. Note: this does not include any 16-bit brain floating point
-      operations from VALU instructions. This is also presented as a percent of the
-      peak theoretical BF16 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed
-      per second. Note: this does not include any 16-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F16 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed
-      per second. Note: this does not include any 32-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F32 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed
-      per second. Note: this does not include any 64-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F64 MFMA operations achievable on the specific accelerator.'
-    MFMA IOPs (INT8): 'The total number of 8-bit integer MFMA operations executed
-      per second. Note: this does not include any 8-bit integer operations from VALU
-      instructions. This is also presented as a percent of the peak theoretical INT8
-      MFMA operations achievable on the specific accelerator.'
-    IPC: The ratio of the total number of instructions executed on the CU over the
-      total active CU cycles.
-    IPC (Issued): The ratio of the total number of (non-internal) instructions issued
-      over the number of cycles where the scheduler was actively working on issuing
-      instructions.
-    SALU Utilization: Indicates what percent of the kernel's duration the SALU was
-      busy executing instructions. Computed as the ratio of the total number of cycles
-      spent by the scheduler issuing SALU / SMEM instructions over the total CU cycles.
-    VALU Utilization: Indicates what percent of the kernel's duration the VALU was
-      busy executing instructions. Does not include VMEM operations. Computed as the
-      ratio of the total number of cycles spent by the scheduler issuing VALU instructions
-      over the total CU cycles.
-    VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit
-      was busy executing instructions, including both global/generic and spill/scratch
-      operations (see the VMEM instruction count metrics for more detail). Does not
-      include VALU operations. Computed as the ratio of the total number of cycles
-      spent by the scheduler issuing VMEM instructions over the total CU cycles.
-    Branch Utilization: Indicates what percent of the kernel's duration the branch
-      unit was busy executing instructions. Computed as the ratio of the total number
-      of cycles spent by the scheduler issuing branch instructions over the total
-      CU cycles.
-    VALU Active Threads: Indicates the average level of divergence within a wavefront
-      over the lifetime of the kernel. The number of work-items that were active in
-      a wavefront during execution of each VALU instruction, time-averaged over all
-      VALU instructions run on all wavefronts in the kernel
-    MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit
-      was busy executing instructions. Computed as the ratio of the total number of
-      cycles spent by the MFMA was busy over the total CU cycles.
-    MFMA Instruction Cycles: The average duration of MFMA instructions in this kernel
-      in cycles. Computed as the ratio of the total number of cycles the MFMA unit
-      was busy over the total number of MFMA instructions.
-    VMEM Latency: The average number of round-trip cycles (that is, from issue to
-      data return / acknowledgment) required for a VMEM instruction to complete.
-    SMEM Latency: The average number of round-trip cycles (that is, from issue to
-      data return / acknowledgment) required for a SMEM instruction to complete.
-    FLOPs (Total): The total number of floating-point operations executed on either
-      the VALU or MFMA units, per normalization unit.
-    IOPs (Total): The total number of integer operations executed on either the VALU
-      or MFMA units, per normalization unit.
-    F16 OPs: The total number of 16-bit floating-point operations executed on either
-      the VALU or MFMA units, per normalization unit.
-    BF16 OPs: The total number of 16-bit brain floating-point operations executed
-      on either the VALU or MFMA units, per normalization unit.
-    F32 OPs: The total number of 32-bit floating-point operations executed on either
-      the VALU or MFMA units, per normalization unit.
-    F64 OPs: The total number of 64-bit floating-point operations executed on either
-      the VALU or MFMA units, per normalization unit.
-    INT8 OPs: The total number of 8-bit integer operations executed on either the
-      VALU or MFMA units, per normalization unit.
   data source:
   - metric_table:
       id: 1101
@@ -108,13 +30,13 @@ Panel Config:
           unit: Instr/cycle
         IPC (Issued):
           avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-            + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED  + SQ_INSTS_LDS)
+            + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
             / SQ_ACTIVE_INST_ANY))
           min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
             + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
             / SQ_ACTIVE_INST_ANY))
           max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-            + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED  + SQ_INSTS_LDS)
+            + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
             / SQ_ACTIVE_INST_ANY))
           unit: Instr/cycle
         SALU Utilization:
@@ -145,3 +67,20 @@ Panel Config:
         max: Max
         unit: Unit
       metric: {}
+  metrics_description:
+    IPC: The ratio of the total number of instructions executed on the CU over the
+      total active CU cycles.
+    IPC (Issued): The ratio of the total number of (non-internal) instructions issued
+      over the number of cycles where the scheduler was actively working on issuing
+      instructions.
+    SALU Utilization: Indicates what percent of the kernel's duration the SALU was
+      busy executing instructions. Computed as the ratio of the total number of cycles
+      spent by the scheduler issuing SALU / SMEM instructions over the total CU cycles.
+    VALU Utilization: Indicates what percent of the kernel's duration the VALU was
+      busy executing instructions. Does not include VMEM operations. Computed as the
+      ratio of the total number of cycles spent by the scheduler issuing VALU instructions
+      over the total CU cycles.
+    VALU Active Threads: Indicates the average level of divergence within a wavefront
+      over the lifetime of the kernel. The number of work-items that were active in
+      a wavefront during execution of each VALU instruction, time-averaged over all
+      VALU instructions run on all wavefronts in the kernel.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1200_local_data_share_lds.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1200_local_data_share_lds.yaml
index 2718654ad4..b7767fea16 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1200_local_data_share_lds.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1200_local_data_share_lds.yaml
@@ -2,51 +2,6 @@
 Panel Config:
   id: 1200
   title: Local Data Share (LDS)
-  metrics_description:
-    Utilization: Indicates what percent of the kernel's duration the LDS was actively
-      executing instructions (including, but not limited to, load, store, atomic and
-      HIP's __shfl operations). Calculated as the ratio of the total number of cycles
-      LDS was active over the total CU cycles.
-    Access Rate: Indicates the percentage of SIMDs in the VALU actively issuing LDS
-      instructions, averaged over the lifetime of the kernel. Calculated as the ratio
-      of the total number of cycles spent by the scheduler issuing LDS instructions
-      over the total CU cycles.
-    Theoretical Bandwidth Utilization: Indicates the maximum amount of bytes that
-      could have been loaded from, stored to, or atomically updated in the LDS divided
-      as percentage of theoretical peak. Does not take into account the execution
-      mask of the wavefront when the instruction was executed.
-    Theoretical Bandwidth: Indicates the maximum amount of bytes that could have been
-      loaded from, stored to, or atomically updated in the LDS divided by total duration.
-      Does not take into account the execution mask of the wavefront when the instruction
-      was executed.
-    Bank Conflict Rate: Indicates the percentage of active LDS cycles that were spent
-      servicing bank conflicts. Calculated as the ratio of LDS cycles spent servicing
-      bank conflicts over the number of LDS cycles that would have been required to
-      move the same amount of data in an uncontended access.
-    LDS Instructions: The total number of LDS instructions (including, but not limited
-      to, read/write/atomics and HIP's __shfl instructions) executed per normalization
-      unit.
-    LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return
-      / acknowledgment) required for an LDS instruction to complete.
-    Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS scheduler
-      due to bank conflicts (as determined by the conflict resolution hardware) to
-      the base number of cycles that would be spent in the LDS scheduler in a completely
-      uncontended case. This is the unnormalized form of the Bank Conflict Rate.
-    Index Accesses: The total number of cycles spent in the LDS scheduler over all
-      operations per normalization unit.
-    Atomic Return Cycles: The total number of cycles spent on LDS atomics with return
-      per normalization unit.
-    Bank Conflict: The total number of cycles spent in the LDS scheduler due to bank
-      conflicts (as determined by the conflict resolution hardware) per normalization
-      unit.
-    Addr Conflict: The total number of cycles spent in the LDS scheduler due to address
-      conflicts (as determined by the conflict resolution hardware) per normalization
-      unit.
-    Unaligned Stall: The total number of cycles spent in the LDS scheduler due to
-      stalls from non-dword aligned addresses per normalization unit.
-    Mem Violations: "The total number of out-of-bounds accesses made to the LDS, per\
-      \ normalization unit. This is unused and expected to be zero in most configurations\
-      \ for modern CDNA\u2122 accelerators."
   data source:
   - metric_table:
       id: 1201
@@ -87,7 +42,7 @@ Panel Config:
           avg: AVG((SQ_INSTS_LDS / $denom))
           min: MIN((SQ_INSTS_LDS / $denom))
           max: MAX((SQ_INSTS_LDS / $denom))
-          unit: (Instr  + $normUnit)
+          unit: (Instr + $normUnit)
         Theoretical Bandwidth:
           avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
             / (End_Timestamp - Start_Timestamp)))
@@ -117,29 +72,75 @@ Panel Config:
           avg: AVG((SQ_LDS_IDX_ACTIVE / $denom))
           min: MIN((SQ_LDS_IDX_ACTIVE / $denom))
           max: MAX((SQ_LDS_IDX_ACTIVE / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Atomic Return Cycles:
           avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom))
           min: MIN((SQ_LDS_ATOMIC_RETURN / $denom))
           max: MAX((SQ_LDS_ATOMIC_RETURN / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Bank Conflict:
           avg: AVG((SQ_LDS_BANK_CONFLICT / $denom))
           min: MIN((SQ_LDS_BANK_CONFLICT / $denom))
           max: MAX((SQ_LDS_BANK_CONFLICT / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Addr Conflict:
           avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom))
           min: MIN((SQ_LDS_ADDR_CONFLICT / $denom))
           max: MAX((SQ_LDS_ADDR_CONFLICT / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Unaligned Stall:
           avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom))
           min: MIN((SQ_LDS_UNALIGNED_STALL / $denom))
           max: MAX((SQ_LDS_UNALIGNED_STALL / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Mem Violations:
           avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom))
           min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom))
           max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom))
           unit: (Accesses + $normUnit)
+  metrics_description:
+    Utilization: Indicates what percent of the kernel's duration the LDS was actively
+      executing instructions (including, but not limited to, load, store, atomic and
+      HIP's __shfl operations). Calculated as the ratio of the total number of cycles
+      LDS was active over the total CU cycles.
+    Access Rate: Indicates the percentage of SIMDs in the VALU actively issuing LDS
+      instructions, averaged over the lifetime of the kernel. Calculated as the ratio
+      of the total number of cycles spent by the scheduler issuing LDS instructions
+      over the total CU cycles.
+    Theoretical Bandwidth Utilization: Indicates the maximum amount of bytes that
+      could have been loaded from, stored to, or atomically updated in the LDS, expressed
+      as a percentage of the theoretical peak. Does not take into account the execution
+      mask of the wavefront when the instruction was executed.
+    Theoretical Bandwidth: Indicates the maximum amount of bytes that could have been
+      loaded from, stored to, or atomically updated in the LDS divided by total duration.
+      Does not take into account the execution mask of the wavefront when the instruction
+      was executed.
+    Bank Conflict Rate: Indicates the percentage of active LDS cycles that were spent
+      servicing bank conflicts. Calculated as the ratio of LDS cycles spent servicing
+      bank conflicts over the number of LDS cycles that would have been required to
+      move the same amount of data in an uncontended access.
+    LDS Instructions: The total number of LDS instructions (including, but not limited
+      to, read/write/atomics and HIP's __shfl instructions) executed per normalization
+      unit.
+    LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return
+      / acknowledgment) required for an LDS instruction to complete.
+    Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS scheduler
+      due to bank conflicts (as determined by the conflict resolution hardware) to
+      the base number of cycles that would be spent in the LDS scheduler in a completely
+      uncontended case. This is the unnormalized form of the Bank Conflict Rate.
+    Index Accesses: The total number of cycles spent in the LDS scheduler over all
+      operations per normalization unit.
+    Atomic Return Cycles: The total number of cycles spent on LDS atomics with return
+      per normalization unit.
+    Bank Conflict: The total number of cycles spent in the LDS scheduler due to bank
+      conflicts (as determined by the conflict resolution hardware) per normalization
+      unit.
+    Addr Conflict: The total number of cycles spent in the LDS scheduler due to address
+      conflicts (as determined by the conflict resolution hardware) per normalization
+      unit.
+    Unaligned Stall: The total number of cycles spent in the LDS scheduler due to
+      stalls from non-dword aligned addresses per normalization unit.
+    Mem Violations: |-
+      The total number of out-of-bounds accesses made to the LDS, per normalization
+      unit. This is unused and expected to be zero in most configurations for
+      modern CDNA™ accelerators.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1300_instruction_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1300_instruction_cache.yaml
index aeda9bc6c7..35808d9d96 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1300_instruction_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1300_instruction_cache.yaml
@@ -2,28 +2,6 @@
 Panel Config:
   id: 1300
   title: Instruction Cache
-  metrics_description:
-    Bandwidth Utilization: The number of bytes looked up in the L1I cache, as a percent
-      of the peak theoretical bandwidth. Calculated as the ratio of L1I requests over
-      the total L1I cycles.
-    Cache Hit Rate: The percent of L1I requests that hit [#l1i-cache]_ on a previously
-      loaded line the cache. Calculated as the ratio of the number of L1I requests
-      that hit over the number of all L1I requests.
-    L1I-L2 Bandwidth Utilization: "The percent of the peak theoretical L1I \u2192\
-      \ L2 cache request bandwidth achieved. Calculated as the ratio of the total\
-      \ number of requests from the L1I to the L2 cache over the total L1I-L2 interface\
-      \ cycles."
-    L1I-L2 Bandwidth: Total number of bytes transferred across L1I - L2 interface
-      divided by total duration.
-    Req: The total number of requests made to the L1I per normalization-unit
-    Hits: The total number of L1I requests that hit on a previously loaded cache line,
-      per normalization-unit.
-    Misses - Non Duplicated: The total number of L1I requests that missed on a cache
-      line that were not already pending due to another request, per normalization-unit.
-    Misses - Duplicated: The total number of L1I requests that missed on a cache line
-      that were already pending due to another request, per normalization-unit.
-    Instruction Fetch Latency: The average number of cycles spent to fetch instructions
-      to a CU.
   data source:
   - metric_table:
       id: 1301
@@ -62,22 +40,22 @@ Panel Config:
           avg: AVG((SQC_ICACHE_REQ / $denom))
           min: MIN((SQC_ICACHE_REQ / $denom))
           max: MAX((SQC_ICACHE_REQ / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Hits:
           avg: AVG((SQC_ICACHE_HITS / $denom))
           min: MIN((SQC_ICACHE_HITS / $denom))
           max: MAX((SQC_ICACHE_HITS / $denom))
-          unit: (Hits  + $normUnit)
+          unit: (Hits + $normUnit)
         Misses - Non Duplicated:
           avg: AVG((SQC_ICACHE_MISSES / $denom))
           min: MIN((SQC_ICACHE_MISSES / $denom))
           max: MAX((SQC_ICACHE_MISSES / $denom))
-          unit: (Misses  + $normUnit)
+          unit: (Misses + $normUnit)
         Misses - Duplicated:
           avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom))
           min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom))
           max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom))
-          unit: (Misses  + $normUnit)
+          unit: (Misses + $normUnit)
         Cache Hit Rate:
           avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
             + SQC_ICACHE_MISSES_DUPLICATE)))
@@ -107,3 +85,25 @@ Panel Config:
           min: MIN(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
           max: MAX(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
           unit: Gbps
+  metrics_description:
+    Bandwidth Utilization: The number of bytes looked up in the L1I cache, as a percent
+      of the peak theoretical bandwidth. Calculated as the ratio of L1I requests over
+      the total L1I cycles.
+    Cache Hit Rate: The percent of L1I requests that hit [#l1i-cache]_ on a previously
+      loaded line in the cache. Calculated as the ratio of the number of L1I requests
+      that hit over the number of all L1I requests.
+    L1I-L2 Bandwidth Utilization: |-
+      The percent of the peak theoretical L1I → L2 cache request bandwidth
+      achieved. Calculated as the ratio of the total number of requests from the
+      L1I to the L2 cache over the total L1I-L2 interface cycles.
+    L1I-L2 Bandwidth: Total number of bytes transferred across L1I - L2 interface
+      divided by total duration.
+    Req: The total number of requests made to the L1I per normalization-unit.
+    Hits: The total number of L1I requests that hit on a previously loaded cache line,
+      per normalization-unit.
+    Misses - Non Duplicated: The total number of L1I requests that missed on a cache
+      line that were not already pending due to another request, per normalization-unit.
+    Misses - Duplicated: The total number of L1I requests that missed on a cache line
+      that were already pending due to another request, per normalization-unit.
+    Instruction Fetch Latency: The average number of cycles spent to fetch instructions
+      to a CU.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1400_scalar_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1400_scalar_l1_data_cache.yaml
index 282b97ad1f..6b73164848 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1400_scalar_l1_data_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1400_scalar_l1_data_cache.yaml
@@ -2,49 +2,6 @@
 Panel Config:
   id: 1400
   title: Scalar L1 Data Cache
-  metrics_description:
-    Bandwidth Utilization: The number of bytes looked up in the sL1D cache, as a percent
-      of the peak theoretical bandwidth. Calculated as the ratio of sL1D requests
-      over the total sL1D cycles.
-    Cache Hit Rate: Indicates the percent of sL1D requests that hit on a previously
-      loaded line the cache. The ratio of the number of sL1D requests that hit over
-      the number of all sL1D requests.
-    sL1D-L2 BW Utilization: The percentage of the peak theoretical sL1D - L2 interface
-      bandwidth acheived.\ \ Caclulated as total number of bytes read from, written
-      to, or atomically updated\ \ across the sL1D - L2 interface.
-    sL1D-L2 BW: "The total number of bytes read from, written to, or atomically updated\
-      \ across the sL1D\u2194L2 interface, divided by total duration. Note that sL1D\
-      \ writes and atomics are typically unused on current CDNA accelerators, so in\
-      \ the majority of cases this can be interpreted as an sL1D\u2192L2 read bandwidth."
-    Req: The total number of requests, of any size or type, made to the sL1D per normalization
-      unit.
-    Hits: The total number of sL1D requests that hit on a previously loaded cache
-      line, per normalization unit.
-    Misses - Non Duplicated: 'The total number of sL1D requests that missed on a cache
-      line that was not already pending due to another request, per normalization
-      unit. '
-    Misses- Duplicated: The total number of sL1D requests that missed on a cache line
-      that was already pending due to another request, per normalization unit.
-    Read Req (Total): The total number of sL1D read requests of any size, per normalization
-      unit.
-    Atomic Req: The total number of atomic requests from sL1D to the L2, per normalization
-      unit. Typically unused on current CDNA accelerators.
-    Read Req (1 DWord): The total number of sL1D read requests made for a single dword
-      of data (4B), per normalization unit.
-    Read Req (2 DWord): The total number of sL1D read requests made for a two dwords
-      of data (8B), per normalization unit.
-    Read Req (4 DWord): The total number of sL1D read requests made for a four dwords
-      of data (16B), per normalization unit.
-    Read Req (8 DWord): The total number of sL1D read requests made for a eight dwords
-      of data (32B), per normalization unit.
-    Read Req (16 DWord): The total number of sL1D read requests made for a sixteen
-      dwords of data (64B), per normalization unit.
-    Read Req: The total number of read requests from sL1D to the L2 per normalization
-      unit.
-    Write Req: The total number of write requests from sL1D to the L2, per normalization
-      unit. Typically unused on current CDNA accelerators.
-    Stall Cycles: "The total number of cycles the sL1D\u2194L2 interface was stalled,\
-      \ per normalization unit."
   data source:
   - metric_table:
       id: 1401
@@ -84,22 +41,22 @@ Panel Config:
           avg: AVG((SQC_DCACHE_REQ / $denom))
           min: MIN((SQC_DCACHE_REQ / $denom))
           max: MAX((SQC_DCACHE_REQ / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Hits:
           avg: AVG((SQC_DCACHE_HITS / $denom))
           min: MIN((SQC_DCACHE_HITS / $denom))
           max: MAX((SQC_DCACHE_HITS / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Misses - Non Duplicated:
           avg: AVG((SQC_DCACHE_MISSES / $denom))
           min: MIN((SQC_DCACHE_MISSES / $denom))
           max: MAX((SQC_DCACHE_MISSES / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Misses- Duplicated:
           avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom))
           min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom))
           max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Cache Hit Rate:
           avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
             + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
@@ -118,37 +75,37 @@ Panel Config:
             + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
           max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
             + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Atomic Req:
           avg: AVG((SQC_DCACHE_ATOMIC / $denom))
           min: MIN((SQC_DCACHE_ATOMIC / $denom))
           max: MAX((SQC_DCACHE_ATOMIC / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req (1 DWord):
           avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom))
           min: MIN((SQC_DCACHE_REQ_READ_1 / $denom))
           max: MAX((SQC_DCACHE_REQ_READ_1 / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req (2 DWord):
           avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom))
           min: MIN((SQC_DCACHE_REQ_READ_2 / $denom))
           max: MAX((SQC_DCACHE_REQ_READ_2 / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req (4 DWord):
           avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom))
           min: MIN((SQC_DCACHE_REQ_READ_4 / $denom))
           max: MAX((SQC_DCACHE_REQ_READ_4 / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req (8 DWord):
           avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom))
           min: MIN((SQC_DCACHE_REQ_READ_8 / $denom))
           max: MAX((SQC_DCACHE_REQ_READ_8 / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req (16 DWord):
           avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom))
           min: MIN((SQC_DCACHE_REQ_READ_16 / $denom))
           max: MAX((SQC_DCACHE_REQ_READ_16 / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
   - metric_table:
       id: 1403
       title: Scalar L1D Cache - L2 Interface
@@ -171,19 +128,65 @@ Panel Config:
           avg: AVG((SQC_TC_DATA_READ_REQ / $denom))
           min: MIN((SQC_TC_DATA_READ_REQ / $denom))
           max: MAX((SQC_TC_DATA_READ_REQ / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Write Req:
           avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom))
           min: MIN((SQC_TC_DATA_WRITE_REQ / $denom))
           max: MAX((SQC_TC_DATA_WRITE_REQ / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Atomic Req:
           avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom))
           min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom))
           max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Stall Cycles:
           avg: AVG((SQC_TC_STALL / $denom))
           min: MIN((SQC_TC_STALL / $denom))
           max: MAX((SQC_TC_STALL / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
+  metrics_description:
+    Bandwidth Utilization: The number of bytes looked up in the sL1D cache, as a percent
+      of the peak theoretical bandwidth. Calculated as the ratio of sL1D requests
+      over the total sL1D cycles.
+    Cache Hit Rate: Indicates the percent of sL1D requests that hit on a previously
+      loaded line in the cache. The ratio of the number of sL1D requests that hit over
+      the number of all sL1D requests.
+    sL1D-L2 BW Utilization: The percentage of the peak theoretical sL1D - L2 interface
+      bandwidth achieved. Calculated as total number of bytes read from, written to,
+      or atomically updated across the sL1D - L2 interface.
+    sL1D-L2 BW: |-
+      The total number of bytes read from, written to, or atomically updated
+      across the sL1D↔L2 interface, divided by total duration. Note that sL1D
+      writes and atomics are typically unused on current CDNA accelerators, so
+      in the majority of cases this can be interpreted as an sL1D→L2 read
+      bandwidth.
+    Req: The total number of requests, of any size or type, made to the sL1D per normalization
+      unit.
+    Hits: The total number of sL1D requests that hit on a previously loaded cache
+      line, per normalization unit.
+    Misses - Non Duplicated: |-
+      The total number of sL1D requests that missed on a cache line that was
+      not already pending due to another request, per normalization unit.
+    Misses- Duplicated: The total number of sL1D requests that missed on a cache line
+      that was already pending due to another request, per normalization unit.
+    Read Req (Total): The total number of sL1D read requests of any size, per normalization
+      unit.
+    Atomic Req: The total number of atomic requests from sL1D to the L2, per normalization
+      unit. Typically unused on current CDNA accelerators.
+    Read Req (1 DWord): The total number of sL1D read requests made for a single dword
+      of data (4B), per normalization unit.
+    Read Req (2 DWord): The total number of sL1D read requests made for two dwords
+      of data (8B), per normalization unit.
+    Read Req (4 DWord): The total number of sL1D read requests made for four dwords
+      of data (16B), per normalization unit.
+    Read Req (8 DWord): The total number of sL1D read requests made for eight dwords
+      of data (32B), per normalization unit.
+    Read Req (16 DWord): The total number of sL1D read requests made for sixteen
+      dwords of data (64B), per normalization unit.
+    Read Req: The total number of read requests from sL1D to the L2 per normalization
+      unit.
+    Write Req: The total number of write requests from sL1D to the L2, per normalization
+      unit. Typically unused on current CDNA accelerators.
+    Stall Cycles: |-
+      The total number of cycles the sL1D↔L2 interface was stalled, per
+      normalization unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1500_address_processing_unit_and_data_return_path_ta_td.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1500_address_processing_unit_and_data_return_path_ta_td.yaml
index 4c615fb0d5..61a1df00de 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1500_address_processing_unit_and_data_return_path_ta_td.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1500_address_processing_unit_and_data_return_path_ta_td.yaml
@@ -2,70 +2,6 @@
 Panel Config:
   id: 1500
   title: Address Processing Unit and Data Return Path (TA/TD)
-  metrics_description:
-    Address Processing Unit Busy: Percent of the total CU cycles the address processor
-      was busy
-    Address Stall: Percent of the total CU cycles the address processor was stalled
-      from sending address requests further into the vL1D pipeline.
-    Data Stall: Percent of the total CU cycles the address processor was stalled from
-      sending write/atomic data further into the vL1D pipeline.
-    "Data-Processor \u2192 Address Stall": Percent of total CU cycles the address
-      processor was stalled waiting to send command data to the data processor.
-    Total Instructions: The total number of memory instructions executed by the address
-      processer over all compute units on the accelerator, per normalization unit.
-    Global/Generic Instructions: The total number of global & generic memory instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Global/Generic Read Instructions: The total number of global & generic memory
-      read instructions executed on all compute units on the accelerator, per normalization
-      unit.
-    Global/Generic Write Instructions: The total number of global & generic memory
-      write instructions executed on all compute units on the accelerator, per normalization
-      unit.
-    Global/Generic Atomic Instructions: The total number of global & generic memory
-      atomic (with and without return) instructions executed on all compute units
-      on the accelerator, per normalization unit.
-    Spill/Stack Instructions: The total number of spill/stack memory instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Read Instructions: The total number of spill/stack memory read instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Write Instructions: The total number of spill/stack memory write instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Atomic Instructions: The total number of spill/stack memory atomic
-      (with and without return) instructions executed on all compute units on the
-      accelerator, per normalization unit. Typically unused as these memory operations
-      are typically used to implement thread-local storage.
-    Spill/Stack Total Cycles: The number of cycles the address processing unit spent
-      working on spill/stack instructions, per normalization unit.
-    Spill/Stack Coalesced Read: The number of cycles the address processing unit spent
-      working on coalesced spill/stack read instructions, per normalization unit.
-    Spill/Stack Coalesced Write: The number of cycles the address processing unit
-      spent working on coalesced spill/stack write instructions, per normalization
-      unit.
-    Data-Return Busy: Percent of the total CU cycles the data-return unit was busy
-      processing or waiting on data to return to the CU.
-    "Cache RAM \u2192 Data-Return Stall": Percent of the total CU cycles the data-return
-      unit was stalled on data to be returned from the vL1D Cache RAM.
-    "Workgroup manager \u2192 Data-Return Stall": Percent of the total CU cycles the
-      data-return unit was stalled by the workgroup manager due to initialization
-      of registers as a part of launching new workgroups.
-    Coalescable Instructions: The number of instructions submitted to the data-return
-      unit by the address processor that were found to be coalescable, per normalization
-      unit.
-    Read Instructions: The number of read instructions submitted to the data-return
-      unit by the address processor summed over all compute units on the accelerator,
-      per normalization unit. This is expected to be the sum of global/generic and
-      spill/stack reads in the address processor.
-    Write Instructions: The number of store instructions submitted to the data-return
-      unit by the address processor summed over all compute units on the accelerator,
-      per normalization unit. This is expected to be the sum of global/generic and
-      spill/stack stores in the address processor.
-    Atomic Instructions: The number of atomic instructions submitted to the data-return
-      unit by the address processor summed over all compute units on the accelerator,
-      per normalization unit. This is expected to be the sum of global/generic and
-      spill/stack atomics in the address processor.
-    Write Ack Instructions: The total number of write acknowledgements submitted by
-      data-return unit to SQ, summed over all compute units on the accelerator, per
-      normalization unit.
   data source:
   - metric_table:
       id: 1501
@@ -120,47 +56,47 @@ Panel Config:
           avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom))
           min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom))
           max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Global/Generic Instructions:
           avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
           min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
           max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Global/Generic Read Instructions:
           avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
           min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
           max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Global/Generic Write Instructions:
           avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
           min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
           max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Global/Generic Atomic Instructions:
           avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
           min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
           max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Spill/Stack Instructions:
           avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
           min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
           max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Spill/Stack Read Instructions:
           avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
           min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
           max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Spill/Stack Write Instructions:
           avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
           min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
           max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Spill/Stack Atomic Instructions:
           avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
           min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
           max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
   - metric_table:
       id: 1503
       title: Spill and stack metrics
@@ -175,17 +111,17 @@ Panel Config:
           avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
           min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
           max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Spill/Stack Coalesced Read:
           avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
           min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
           max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Spill/Stack Coalesced Write:
           avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
           min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
           max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
   - metric_table:
       id: 1504
       title: Vector L1 data-return path or Texture Data (TD)
@@ -210,7 +146,7 @@ Panel Config:
           avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom))
           min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom))
           max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Read Instructions:
           avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
             / $denom))
@@ -218,14 +154,72 @@ Panel Config:
             / $denom))
           max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
             / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Write Instructions:
           avg: AVG((TD_STORE_WAVEFRONT_sum / $denom))
           min: MIN((TD_STORE_WAVEFRONT_sum / $denom))
           max: MAX((TD_STORE_WAVEFRONT_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Atomic Instructions:
           avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom))
           min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom))
           max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
+  metrics_description:
+    Address Processing Unit Busy: Percent of the total CU cycles the address processor
+      was busy
+    Address Stall: Percent of the total CU cycles the address processor was stalled
+      from sending address requests further into the vL1D pipeline.
+    Data Stall: Percent of the total CU cycles the address processor was stalled from
+      sending write/atomic data further into the vL1D pipeline.
+    "Data-Processor \u2192 Address Stall": Percent of total CU cycles the address
+      processor was stalled waiting to send command data to the data processor.
+    Total Instructions: The total number of memory instructions executed by the address
+      processor over all compute units on the accelerator, per normalization unit.
+    Global/Generic Instructions: The total number of global & generic memory instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Global/Generic Read Instructions: The total number of global & generic memory
+      read instructions executed on all compute units on the accelerator, per normalization
+      unit.
+    Global/Generic Write Instructions: The total number of global & generic memory
+      write instructions executed on all compute units on the accelerator, per normalization
+      unit.
+    Global/Generic Atomic Instructions: The total number of global & generic memory
+      atomic (with and without return) instructions executed on all compute units
+      on the accelerator, per normalization unit.
+    Spill/Stack Instructions: The total number of spill/stack memory instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Read Instructions: The total number of spill/stack memory read instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Write Instructions: The total number of spill/stack memory write instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Atomic Instructions: The total number of spill/stack memory atomic
+      (with and without return) instructions executed on all compute units on the
+      accelerator, per normalization unit. Typically unused, as these memory operations
+      are mainly used to implement thread-local storage.
+    Spill/Stack Total Cycles: The number of cycles the address processing unit spent
+      working on spill/stack instructions, per normalization unit.
+    Spill/Stack Coalesced Read: The number of cycles the address processing unit spent
+      working on coalesced spill/stack read instructions, per normalization unit.
+    Spill/Stack Coalesced Write: The number of cycles the address processing unit
+      spent working on coalesced spill/stack write instructions, per normalization
+      unit.
+    Data-Return Busy: Percent of the total CU cycles the data-return unit was busy
+      processing or waiting on data to return to the CU.
+    "Cache RAM \u2192 Data-Return Stall": Percent of the total CU cycles the data-return
+      unit was stalled on data to be returned from the vL1D Cache RAM.
+    Coalescable Instructions: The number of instructions submitted to the data-return
+      unit by the address processor that were found to be coalescable, per normalization
+      unit.
+    Read Instructions: The number of read instructions submitted to the data-return
+      unit by the address processor summed over all compute units on the accelerator,
+      per normalization unit. This is expected to be the sum of global/generic and
+      spill/stack reads in the address processor.
+    Write Instructions: The number of store instructions submitted to the data-return
+      unit by the address processor summed over all compute units on the accelerator,
+      per normalization unit. This is expected to be the sum of global/generic and
+      spill/stack stores in the address processor.
+    Atomic Instructions: The number of atomic instructions submitted to the data-return
+      unit by the address processor summed over all compute units on the accelerator,
+      per normalization unit. This is expected to be the sum of global/generic and
+      spill/stack atomics in the address processor.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1600_vector_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1600_vector_l1_data_cache.yaml
index b374ea9466..2be99f875f 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1600_vector_l1_data_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1600_vector_l1_data_cache.yaml
@@ -2,117 +2,6 @@
 Panel Config:
   id: 1600
   title: Vector L1 Data Cache
-  metrics_description:
-    Hit rate: The ratio of the number of vL1D cache line requests that hit in vL1D
-      cache over the total number of cache line requests to the vL1D Cache RAM.
-    Bandwidth Utilization: The number of bytes looked up in the vL1D cache as a result
-      of VMEM instructions, as a percent of the peak theoretical bandwidth achievable
-      on the specific accelerator. The number of bytes is calculated as the number
-      of cache lines requested multiplied by the cache line size. This value does
-      not consider partial requests, so for instance, if only a single value is requested
-      in a cache line, the data movement will still be counted as a full cache line.
-    Utilization: Indicates how busy the vL1D Cache RAM was during the kernel execution.
-      The number of cycles where the vL1D Cache RAM is actively processing any request
-      divided by the number of cycles where the vL1D is active.
-    Coalescing: Indicates how well memory instructions were coalesced by the address
-      processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated
-      as the average number of thread-requests generated per instruction divided by
-      the ideal number of thread-requests per instruction.
-    Stalled on L2 Data: The ratio of the number of cycles where the vL1D is stalled
-      waiting for requested data to return from the L2 cache divided by the number
-      of cycles where the vL1D is active.
-    Stalled on L2 Req: The ratio of the number of cycles where the vL1D is stalled
-      waiting to issue a request for data to the L2 cache divided by the number of
-      cycles where the vL1D is active.
-    Tag RAM Stall (Read): The ratio of the number of cycles where the vL1D is stalled
-      due to Read requests with conflicting tags being looked up concurrently, divided
-      by the number of cycles where the vL1D is active.
-    Tag RAM Stall (Write): The ratio of the number of cycles where the vL1D is stalled
-      due to Write requests with conflicting tags being looked up concurrently, divided
-      by the number of cycles where the vL1D is active.
-    Tag RAM Stall (Atomic): The ratio of the number of cycles where the vL1D is stalled
-      due to Atomic requests with conflicting tags being looked up concurrently, divided
-      by the number of cycles where the vL1D is active.
-    Total Req: The total number of incoming requests from the address processing unit
-      after coalescing.
-    Read Req: The total number of incoming read requests from the address processing
-      unit after coalescing per normalization unit.
-    Write Req: The total number of incoming write requests from the address processing
-      unit after coalescing per normalization unit.
-    Atomic Req: The total number of incoming atomic requests from the address processing
-      unit after coalescing per normalization unit.
-    Cache BW: The number of bytes looked up in the vL1D cache as a result of VMEM
-      instructions divided by total duration. The number of bytes is calculated as
-      the number of cache lines requested multiplied by the cache line size.  This
-      value does not consider partial requests, so for instance, if only a single
-      value is requested in a cache line, the data movement will still be counted
-      as a full cache line.
-    Cache Hit Rate: The ratio of the number of vL1D cache line requests that hit in
-      vL1D cache over the total number of cache line requests to the vL1D Cache RAM.
-    Cache Accesses: The total number of cache line lookups in the vL1D.
-    Cache Hits: The number of cache accesses minus the number of outgoing requests
-      to the L2 cache, that is, the number of cache line requests serviced by the
-      vL1D Cache RAM per normalization unit.
-    Invalidations: The number of times the vL1D was issued a write-back invalidate
-      command during the kernel's execution per normalization unit. This may be triggered
-      by, for instance, the buffer_wbinvl1 instruction.
-    L1-L2 BW: The number of bytes transferred across the vL1D-L2 interface as a result
-      of VMEM instructions, divided by total duration. The number of bytes is calculated
-      as the number of cache lines requested multiplied by the cache line size. This
-      value does not consider partial requests, so for instance, if only a single
-      value is requested in a cache line, the data movement will still be counted
-      as a full cache line.
-    L1-L2 Read: The number of read requests for a vL1D cache line that were not satisfied
-      by the vL1D and must be retrieved from the to the L2 Cache per normalization
-      unit.
-    L1-L2 Write: The number of write requests to a vL1D cache line that were sent
-      through the vL1D to the L2 cache, per normalization unit.
-    L1-L2 Atomic: The number of atomic requests that are sent through the vL1D to
-      the L2 cache, per normalization unit. This includes requests for atomics with,
-      and without return.
-    L1 Access Latency: Calculated as the average number of cycles that a vL1D cache
-      line request spent in the vL1D cache pipeline.
-    L1-L2 Read Latency: Calculated as the average number of cycles that the vL1D cache
-      took to issue and receive read requests from the L2 Cache. This number also
-      includes requests for atomics with return values.
-    L1-L2 Write Latency: Calculated as the average number of cycles that the vL1D
-      cache took to issue and receive acknowledgement of a write request to the L2
-      Cache. This number also includes requests for atomics without return values.
-    NC - Read: Total read requests with NC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    UC - Read: Total read requests with UC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    CC - Read: Total read requests with CC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    RW - Read: Total read requests with RW mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    RW - Write: Total write requests with RW mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    NC - Write: Total write requests with NC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    UC - Write: Total write requests with UC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    CC - Write: Total write requests with CC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    NC - Atomic: Total atomic requests with NC mtype from this TCP to all TCCs Sum
-      over TCP instances per normalization unit.
-    UC - Atomic: Total atomic requests with UC mtype from this TCP to all TCCs Sum
-      over TCP instances per normalization unit.
-    CC - Atomic: Total atomic requests with CC mtype from this TCP to all TCCs Sum
-      over TCP instances per normalization unit.
-    RW - Atomic: Total atomic requests with RW mtype from this TCP to all TCCs Sum
-      over TCP instances per normalization unit.
-    Req: The number of translation requests made to the UTCL1 per normalization unit.
-    Hit Ratio: The ratio of the number of translation requests that hit in the UTCL1
-      divided by the total number of translation requests made to the UTCL1.
-    Hits: The number of translation requests that hit in the UTCL1, and could be reused,
-      per normalization unit.
-    Translation Misses: The total number of translation requests that missed in the
-      UTCL1 due to  translation not being present in the cache, per normalization
-      unit.
-    Permission Misses: "The total number of translation requests that missed in the\
-      \ UTCL1 due to a permission error, per normalization unit. This is unused and\
-      \ expected to be zero in most configurations for modern CDNA\u2122 accelerators."
   data source:
   - metric_table:
       id: 1601
@@ -181,17 +70,17 @@ Panel Config:
           avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom))
           min: MIN((TCP_TOTAL_ACCESSES_sum / $denom))
           max: MAX((TCP_TOTAL_ACCESSES_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req:
           avg: AVG((TCP_TOTAL_READ_sum / $denom))
           min: MIN((TCP_TOTAL_READ_sum / $denom))
           max: MAX((TCP_TOTAL_READ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Write Req:
           avg: AVG((TCP_TOTAL_WRITE_sum / $denom))
           min: MIN((TCP_TOTAL_WRITE_sum / $denom))
           max: MAX((TCP_TOTAL_WRITE_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Atomic Req:
           avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
             / $denom))
@@ -199,7 +88,7 @@ Panel Config:
             / $denom))
           max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
             / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Cache BW:
           avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp)))
           min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp)))
@@ -223,7 +112,7 @@ Panel Config:
           avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
           min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
           max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Cache Hits:
           avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
             + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
@@ -234,7 +123,7 @@ Panel Config:
           max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
             + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
             / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Invalidations:
           avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
           min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
@@ -252,12 +141,12 @@ Panel Config:
           avg: AVG((TCP_TCC_READ_REQ_sum / $denom))
           min: MIN((TCP_TCC_READ_REQ_sum / $denom))
           max: MAX((TCP_TCC_READ_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         L1-L2 Write:
           avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom))
           min: MIN((TCP_TCC_WRITE_REQ_sum / $denom))
           max: MAX((TCP_TCC_WRITE_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         L1-L2 Atomic:
           avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
             / $denom))
@@ -265,7 +154,7 @@ Panel Config:
             / $denom))
           max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
             / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         L1 Access Latency:
           avg: AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum
             != 0) else None))
@@ -314,84 +203,84 @@ Panel Config:
           avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom))
           min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom))
           max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         UC - Read:
           xfer: Read
           coherency: UC
           avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom))
           min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom))
           max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         CC - Read:
           xfer: Read
           coherency: CC
           avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom))
           min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom))
           max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         RW - Read:
           xfer: Read
           coherency: RW
           avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom))
           min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom))
           max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         RW - Write:
           xfer: Write
           coherency: RW
           avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom))
           min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom))
           max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         NC - Write:
           xfer: Write
           coherency: NC
           avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom))
           min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom))
           max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         UC - Write:
           xfer: Write
           coherency: UC
           avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom))
           min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom))
           max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         CC - Write:
           xfer: Write
           coherency: CC
           avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom))
           min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom))
           max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         NC - Atomic:
           xfer: Atomic
           coherency: NC
           avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
           min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
           max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         UC - Atomic:
           xfer: Atomic
           coherency: UC
           avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
           min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
           max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         CC - Atomic:
           xfer: Atomic
           coherency: CC
           avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
           min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
           max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         RW - Atomic:
           xfer: Atomic
           coherency: RW
           avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
           min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
           max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
   - metric_table:
       id: 1605
       title: L1 Unified Translation Cache (UTCL1)
@@ -440,3 +329,114 @@ Panel Config:
         max: Max
         units: Unit
       metric: {}
+  metrics_description:
+    Hit rate: The ratio of the number of vL1D cache line requests that hit in vL1D
+      cache over the total number of cache line requests to the vL1D Cache RAM.
+    Bandwidth Utilization: The number of bytes looked up in the vL1D cache as a result
+      of VMEM instructions, as a percent of the peak theoretical bandwidth achievable
+      on the specific accelerator. The number of bytes is calculated as the number
+      of cache lines requested multiplied by the cache line size. This value does
+      not consider partial requests, so for instance, if only a single value is requested
+      in a cache line, the data movement will still be counted as a full cache line.
+    Utilization: Indicates how busy the vL1D Cache RAM was during the kernel execution.
+      The number of cycles where the vL1D Cache RAM is actively processing any request
+      divided by the number of cycles where the vL1D is active.
+    Coalescing: Indicates how well memory instructions were coalesced by the address
+      processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated
+      as the average number of thread-requests generated per instruction divided by
+      the ideal number of thread-requests per instruction.
+    Stalled on L2 Data: The ratio of the number of cycles where the vL1D is stalled
+      waiting for requested data to return from the L2 cache divided by the number
+      of cycles where the vL1D is active.
+    Stalled on L2 Req: The ratio of the number of cycles where the vL1D is stalled
+      waiting to issue a request for data to the L2 cache divided by the number of
+      cycles where the vL1D is active.
+    Tag RAM Stall (Read): The ratio of the number of cycles where the vL1D is stalled
+      due to Read requests with conflicting tags being looked up concurrently, divided
+      by the number of cycles where the vL1D is active.
+    Tag RAM Stall (Write): The ratio of the number of cycles where the vL1D is stalled
+      due to Write requests with conflicting tags being looked up concurrently, divided
+      by the number of cycles where the vL1D is active.
+    Tag RAM Stall (Atomic): The ratio of the number of cycles where the vL1D is stalled
+      due to Atomic requests with conflicting tags being looked up concurrently, divided
+      by the number of cycles where the vL1D is active.
+    Total Req: The total number of incoming requests from the address processing unit
+      after coalescing.
+    Read Req: The total number of incoming read requests from the address processing
+      unit after coalescing per normalization unit.
+    Write Req: The total number of incoming write requests from the address processing
+      unit after coalescing per normalization unit.
+    Atomic Req: The total number of incoming atomic requests from the address processing
+      unit after coalescing per normalization unit.
+    Cache BW: The number of bytes looked up in the vL1D cache as a result of VMEM
+      instructions divided by total duration. The number of bytes is calculated as
+      the number of cache lines requested multiplied by the cache line size. This
+      value does not consider partial requests, so for instance, if only a single
+      value is requested in a cache line, the data movement will still be counted
+      as a full cache line.
+    Cache Hit Rate: The ratio of the number of vL1D cache line requests that hit in
+      vL1D cache over the total number of cache line requests to the vL1D Cache RAM.
+    Cache Accesses: The total number of cache line lookups in the vL1D.
+    Cache Hits: The number of cache accesses minus the number of outgoing requests
+      to the L2 cache, that is, the number of cache line requests serviced by the
+      vL1D Cache RAM per normalization unit.
+    Invalidations: The number of times the vL1D was issued a write-back invalidate
+      command during the kernel's execution per normalization unit. This may be triggered
+      by, for instance, the buffer_wbinvl1 instruction.
+    L1-L2 BW: The number of bytes transferred across the vL1D-L2 interface as a result
+      of VMEM instructions, divided by total duration. The number of bytes is calculated
+      as the number of cache lines requested multiplied by the cache line size. This
+      value does not consider partial requests, so for instance, if only a single
+      value is requested in a cache line, the data movement will still be counted
+      as a full cache line.
+    L1-L2 Read: The number of read requests for a vL1D cache line that were not satisfied
+      by the vL1D and must be retrieved from the L2 cache, per normalization
+      unit.
+    L1-L2 Write: The number of write requests to a vL1D cache line that were sent
+      through the vL1D to the L2 cache, per normalization unit.
+    L1-L2 Atomic: The number of atomic requests that are sent through the vL1D to
+      the L2 cache, per normalization unit. This includes requests for atomics with
+      and without return.
+    L1 Access Latency: Calculated as the average number of cycles that a vL1D cache
+      line request spent in the vL1D cache pipeline.
+    L1-L2 Read Latency: Calculated as the average number of cycles that the vL1D cache
+      took to issue and receive read requests from the L2 Cache. This number also
+      includes requests for atomics with return values.
+    L1-L2 Write Latency: Calculated as the average number of cycles that the vL1D
+      cache took to issue and receive acknowledgement of a write request to the L2
+      Cache. This number also includes requests for atomics without return values.
+    NC - Read: Total read requests with NC mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    UC - Read: Total read requests with UC mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    CC - Read: Total read requests with CC mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    RW - Read: Total read requests with RW mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    RW - Write: Total write requests with RW mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    NC - Write: Total write requests with NC mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    UC - Write: Total write requests with UC mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    CC - Write: Total write requests with CC mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    NC - Atomic: Total atomic requests with NC mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    UC - Atomic: Total atomic requests with UC mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    CC - Atomic: Total atomic requests with CC mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    RW - Atomic: Total atomic requests with RW mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    Req: The number of translation requests made to the UTCL1 per normalization unit.
+    Hit Ratio: The ratio of the number of translation requests that hit in the UTCL1
+      divided by the total number of translation requests made to the UTCL1.
+    Hits: The number of translation requests that hit in the UTCL1, and could be reused,
+      per normalization unit.
+    Translation Misses: The total number of translation requests that missed in the
+      UTCL1 due to translation not being present in the cache, per normalization unit.
+    Permission Misses: "The total number of translation requests that missed in
+      the UTCL1 due to a permission error, per normalization unit. This is unused
+      and expected to be zero in most configurations for modern CDNA\u2122
+      accelerators."
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1700_l2_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1700_l2_cache.yaml
index adc12c83d3..b949115fd8 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1700_l2_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1700_l2_cache.yaml
@@ -2,6 +2,350 @@
 Panel Config:
   id: 1700
   title: L2 Cache
+  data source:
+  - metric_table:
+      id: 1701
+      title: L2 Speed-of-Light
+      header:
+        metric: Metric
+        value: Avg
+        unit: Unit
+      metric:
+        Utilization:
+          value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD)))
+          unit: pct
+        Peak Bandwidth:
+          value: ((100 * AVG(((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp))))
+            / ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan)))
+          unit: pct
+        Hit Rate:
+          value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
+            + TCC_MISS_sum) != 0) else 0))
+          unit: pct
+        L2-Fabric Read BW:
+          value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
+            * 64)) / (End_Timestamp - Start_Timestamp)))
+          unit: GB/s
+        L2-Fabric Write and Atomic BW:
+          value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
+            * 32)) / (End_Timestamp - Start_Timestamp)))
+          unit: GB/s
+        HBM Bandwidth:
+          value: $hbmBandwidth
+          unit: GB/s
+  - metric_table:
+      id: 1702
+      title: L2-Fabric interface metrics
+      header:
+        metric: Metric
+        avg: Avg
+        min: Min
+        max: Max
+        unit: Unit
+      metric:
+        Read BW:
+          avg: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
+            * 64)) / $denom))
+          min: MIN((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
+            * 64)) / $denom))
+          max: MAX((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
+            * 64)) / $denom))
+          unit: (Bytes + $normUnit)
+        HBM Read Traffic:
+          avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
+            != 0) else None))
+          min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
+            != 0) else None))
+          max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
+            != 0) else None))
+          unit: pct
+        Remote Read Traffic:
+          avg: AVG((100 * ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum) / TCC_EA0_RDREQ_sum)
+            if (TCC_EA0_RDREQ_sum != 0) else None))
+          min: MIN((100 * ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum) / TCC_EA0_RDREQ_sum)
+            if (TCC_EA0_RDREQ_sum != 0) else None))
+          max: MAX((100 * ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum) / TCC_EA0_RDREQ_sum)
+            if (TCC_EA0_RDREQ_sum != 0) else None))
+          unit: pct
+        Uncached Read Traffic:
+          avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
+            != 0) else None))
+          min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
+            != 0) else None))
+          max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
+            != 0) else None))
+          unit: pct
+        Write and Atomic BW:
+          avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
+            * 32)) / $denom))
+          min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
+            * 32)) / $denom))
+          max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
+            * 32)) / $denom))
+          unit: (Bytes + $normUnit)
+        HBM Write and Atomic Traffic:
+          avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
+            != 0) else None))
+          min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
+            != 0) else None))
+          max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
+            != 0) else None))
+          unit: pct
+        Remote Write and Atomic Traffic:
+          avg: AVG((100 * ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum) / TCC_EA0_WRREQ_sum)
+            if (TCC_EA0_WRREQ_sum != 0) else None))
+          min: MIN((100 * ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum) / TCC_EA0_WRREQ_sum)
+            if (TCC_EA0_WRREQ_sum != 0) else None))
+          max: MAX((100 * ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum) / TCC_EA0_WRREQ_sum)
+            if (TCC_EA0_WRREQ_sum != 0) else None))
+          unit: pct
+        Atomic Traffic:
+          avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
+            != 0) else None))
+          min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
+            != 0) else None))
+          max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
+            != 0) else None))
+          unit: pct
+        Uncached Write and Atomic Traffic:
+          avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
+            != 0) else None))
+          min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
+            != 0) else None))
+          max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
+            != 0) else None))
+          unit: pct
+        Read Latency:
+          avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
+            != 0) else None))
+          min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
+            != 0) else None))
+          max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
+            != 0) else None))
+          unit: Cycles
+        Write and Atomic Latency:
+          avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
+            != 0) else None))
+          min: MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
+            != 0) else None))
+          max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
+            != 0) else None))
+          unit: Cycles
+        Atomic Latency:
+          avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
+            != 0) else None))
+          min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
+            != 0) else None))
+          max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
+            != 0) else None))
+          unit: Cycles
+  - metric_table:
+      id: 1703
+      title: L2 Cache Accesses
+      header:
+        metric: Metric
+        avg: Avg
+        min: Min
+        max: Max
+        unit: Unit
+      metric:
+        Bandwidth:
+          avg: AVG((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp))
+          min: MIN((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp))
+          max: MAX((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp))
+          unit: Gbps
+        Req:
+          avg: AVG((TCC_REQ_sum / $denom))
+          min: MIN((TCC_REQ_sum / $denom))
+          max: MAX((TCC_REQ_sum / $denom))
+          unit: (Req + $normUnit)
+        Read Req:
+          avg: AVG((TCC_READ_sum / $denom))
+          min: MIN((TCC_READ_sum / $denom))
+          max: MAX((TCC_READ_sum / $denom))
+          unit: (Req + $normUnit)
+        Write Req:
+          avg: AVG((TCC_WRITE_sum / $denom))
+          min: MIN((TCC_WRITE_sum / $denom))
+          max: MAX((TCC_WRITE_sum / $denom))
+          unit: (Req + $normUnit)
+        Atomic Req:
+          avg: AVG((TCC_ATOMIC_sum / $denom))
+          min: MIN((TCC_ATOMIC_sum / $denom))
+          max: MAX((TCC_ATOMIC_sum / $denom))
+          unit: (Req + $normUnit)
+        Streaming Req:
+          avg: AVG((TCC_STREAMING_REQ_sum / $denom))
+          min: MIN((TCC_STREAMING_REQ_sum / $denom))
+          max: MAX((TCC_STREAMING_REQ_sum / $denom))
+          unit: (Req + $normUnit)
+        Probe Req:
+          avg: AVG((TCC_PROBE_sum / $denom))
+          min: MIN((TCC_PROBE_sum / $denom))
+          max: MAX((TCC_PROBE_sum / $denom))
+          unit: (Req + $normUnit)
+        Cache Hit:
+          avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
+            + TCC_MISS_sum) != 0) else None))
+          min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
+            + TCC_MISS_sum) != 0) else None))
+          max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
+            + TCC_MISS_sum) != 0) else None))
+          unit: pct
+        Hits:
+          avg: AVG((TCC_HIT_sum / $denom))
+          min: MIN((TCC_HIT_sum / $denom))
+          max: MAX((TCC_HIT_sum / $denom))
+          unit: (Hits + $normUnit)
+        Misses:
+          avg: AVG((TCC_MISS_sum / $denom))
+          min: MIN((TCC_MISS_sum / $denom))
+          max: MAX((TCC_MISS_sum / $denom))
+          unit: (Misses + $normUnit)
+        Writeback:
+          avg: AVG((TCC_WRITEBACK_sum / $denom))
+          min: MIN((TCC_WRITEBACK_sum / $denom))
+          max: MAX((TCC_WRITEBACK_sum / $denom))
+          unit: (Cachelines + $normUnit)
+        Writeback (Internal):
+          avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom))
+          min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom))
+          max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom))
+          unit: (Cachelines + $normUnit)
+        Writeback (vL1D Req):
+          avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
+          min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
+          max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
+          unit: (Cachelines + $normUnit)
+        Evict (Internal):
+          avg: AVG((TCC_NORMAL_EVICT_sum / $denom))
+          min: MIN((TCC_NORMAL_EVICT_sum / $denom))
+          max: MAX((TCC_NORMAL_EVICT_sum / $denom))
+          unit: (Cachelines + $normUnit)
+        Evict (vL1D Req):
+          avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
+          min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
+          max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
+          unit: (Cachelines + $normUnit)
+        NC Req:
+          avg: AVG((TCC_NC_REQ_sum / $denom))
+          min: MIN((TCC_NC_REQ_sum / $denom))
+          max: MAX((TCC_NC_REQ_sum / $denom))
+          unit: (Req + $normUnit)
+        UC Req:
+          avg: AVG((TCC_UC_REQ_sum / $denom))
+          min: MIN((TCC_UC_REQ_sum / $denom))
+          max: MAX((TCC_UC_REQ_sum / $denom))
+          unit: (Req + $normUnit)
+        CC Req:
+          avg: AVG((TCC_CC_REQ_sum / $denom))
+          min: MIN((TCC_CC_REQ_sum / $denom))
+          max: MAX((TCC_CC_REQ_sum / $denom))
+          unit: (Req + $normUnit)
+        RW Req:
+          avg: AVG((TCC_RW_REQ_sum / $denom))
+          min: MIN((TCC_RW_REQ_sum / $denom))
+          max: MAX((TCC_RW_REQ_sum / $denom))
+          unit: (Req + $normUnit)
+  - metric_table:
+      id: 1704
+      title: L2 Cache Stalls
+      header:
+        metric: Metric
+        avg: Avg
+        min: Min
+        max: Max
+        unit: Unit
+      metric: {}
+  - metric_table:
+      id: 1705
+      title: L2 - Fabric Interface stalls
+      header:
+        metric: Metric
+        type: Type
+        transaction: Transaction
+        avg: Avg
+        min: Min
+        max: Max
+        unit: Unit
+      style:
+        type: simple_multi_bar
+      metric:
+        Write - Credit Starvation:
+          type: Credit Starvation
+          transaction: Write
+          avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum
+            != 0) else None))
+          min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum
+            != 0) else None))
+          max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum
+            != 0) else None))
+          unit: pct
+  - metric_table:
+      id: 1706
+      title: L2 - Fabric interface detailed metrics
+      header:
+        metric: Metric
+        avg: Avg
+        min: Min
+        max: Max
+        unit: Unit
+      metric:
+        Read (32B):
+          avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom))
+          min: MIN((TCC_EA0_RDREQ_32B_sum / $denom))
+          max: MAX((TCC_EA0_RDREQ_32B_sum / $denom))
+          unit: (Req + $normUnit)
+        Read (64B):
+          avg: AVG(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
+          min: MIN(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
+          max: MAX(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
+          unit: (Req + $normUnit)
+        Read (Uncached):
+          avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
+          min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
+          max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
+          unit: (Req + $normUnit)
+        HBM Read:
+          avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom))
+          min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom))
+          max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom))
+          unit: (Req + $normUnit)
+        Remote Read:
+          avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
+          min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
+          max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
+          unit: (Req + $normUnit)
+        Write and Atomic (32B):
+          avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
+          min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
+          max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
+          unit: (Req + $normUnit)
+        Write and Atomic (Uncached):
+          avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
+          min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
+          max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
+          unit: (Req + $normUnit)
+        Write and Atomic (64B):
+          avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom))
+          min: MIN((TCC_EA0_WRREQ_64B_sum / $denom))
+          max: MAX((TCC_EA0_WRREQ_64B_sum / $denom))
+          unit: (Req + $normUnit)
+        HBM Write and Atomic:
+          avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom))
+          min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom))
+          max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom))
+          unit: (Req + $normUnit)
+        Remote Write and Atomic:
+          avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
+          min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
+          max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
+          unit: (Req + $normUnit)
+        Atomic:
+          avg: AVG((TCC_EA0_ATOMIC_sum / $denom))
+          min: MIN((TCC_EA0_ATOMIC_sum / $denom))
+          max: MAX((TCC_EA0_ATOMIC_sum / $denom))
+          unit: (Req + $normUnit)
   metrics_description:
     Utilization: The ratio of the number of cycles an L2 channel was active, summed
       over all L2 channels on the accelerator over the total L2 cycles.
@@ -87,12 +431,6 @@ Panel Config:
       by the cache line size. This value does not consider partial requests, so for
       example, if only a single value is requested in a cache line, the data movement
       will still be counted as a full cache line.
-    Read Bandwidth: Total number of bytes looked up in the L2 cache for read requests,
-      divided by total duration.
-    Write Bandwidth: Total number of bytes looked up in the L2 cache for write requests,
-      divided by total duration.
-    Atomic Bandwidth: Total number of bytes looked up in the L2 cache for atomic requests,
-      divided by total duration.
     Req: The total number of incoming requests to the L2 from all clients for all
       request types, per normalization unit.
     Read Req: The total number of read requests to the L2 from all clients.
@@ -149,12 +487,6 @@ Panel Config:
     Remote Read: The total number of L2 requests to Infinity Fabric to read 32B or
       64B of data from any source other than the accelerator's local HBM, per normalization
       unit.
-    Read Bandwidth - PCIe: Total number of bytes due to L2 read requests due to PCIe
-      traffic, divided by total duration.
-    "Read Bandwidth - Infinity Fabric\u2122": Total number of bytes due to L2 read
-      requests due to Infinity Fabric traffic, divided by total duration.
-    Read Bandwidth - HBM: Total number of bytes due to L2 read requests due to HBM
-      traffic, divided by total duration.
     Write and Atomic (32B): The total number of L2 requests to Infinity Fabric to
       write or atomically update 32B of data to any memory location, per normalization
       unit.
@@ -170,391 +502,9 @@ Panel Config:
     Remote Write and Atomic: The total number of L2 requests to Infinity Fabric to
       write or atomically update 32B or 64B of data in any memory location other than
       the accelerator's local HBM, per normalization unit.
-    Write Bandwidth - PCIe: Total number of bytes due to L2 write requests due to
-      PCIe traffic, divided by total duration.
-    "Write Bandwidth - Infinity Fabric\u2122": Total number of bytes due to L2 write
-      requests due to Infinity Fabric traffic, divided by total duration.
-    Write Bandwidth - HBM: Total number of bytes due to L2 write requests due to HBM
-      traffic, divided by total duration.
-    Atomic Bandwidth - PCIe: Total number of bytes due to L2 atomic requests due to
-      PCIe traffic, divided by total duration.
-    "Atomic Bandwidth - Infinity Fabric\u2122": Total number of bytes due to L2 atomic
-      requests due to Infinity Fabric traffic, divided by total duration.
-    Atomic Bandwidth - HBM: Total number of bytes due to L2 atomic requests due to
-      HBM traffic, divided by total duration.
     Atomic: The total number of L2 requests to Infinity Fabric to atomically update
       32B or 64B of data in any memory location, per normalization unit. See Request
       flow for more detail. Note that on current CDNA accelerators, such as the MI2XX,
       requests are only considered atomic by Infinity Fabric if they are targeted
       at non-write-cacheable memory, such as fine-grained memory allocations or uncached
       memory allocations on the MI2XX.
-    Read Stall: "The ratio of the total number of cycles the L2-Fabric interface was\
-      \ stalled on a read request to any destination (local HBM, remote PCIe\xAE connected\
-      \ accelerator or CPU, or remote Infinity Fabric connected accelerator or CPU)\
-      \ over the total active L2 cycles."
-    Write Stall: The ratio of the total number of cycles the L2-Fabric interface was
-      stalled on a write or atomic request to any destination (local HBM, remote accelerator
-      or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected
-      accelerator or CPU) over the total active L2 cycles.
-    Read - PCIe Stall: The number of cycles the L2-Fabric interface was stalled on
-      read requests to remote PCIe connected accelerators or CPUs as a percent of
-      the total active L2 cycles.
-    Read - Infinity Fabric Stall: The number of cycles the L2-Fabric interface was
-      stalled on read requests to remote Infinity Fabric connected accelerators or
-      CPUs as a percent of the total active L2 cycles.
-    Read - HBM Stall: The number of cycles the L2-Fabric interface was stalled on
-      read requests to the accelerator's local HBM as a percent of the total active
-      L2 cycles.
-    Write - PCIe Stall: The number of cycles the L2-Fabric interface was stalled on
-      write or atomic requests to remote PCIe connected accelerators or CPUs as a
-      percent of the total active L2 cycles.
-    Write - Infinity Fabric Stall: The number of cycles the L2-Fabric interface was
-      stalled on write or atomic requests to remote Infinity Fabric connected accelerators
-      or CPUs as a percent of the total active L2 cycles.
-    Write - HBM Stall: The number of cycles the L2-Fabric interface was stalled on
-      write or atomic requests to accelerator's local HBM as a percent of the total
-      active L2 cycles.
-  data source:
-  - metric_table:
-      id: 1701
-      title: L2 Speed-of-Light
-      header:
-        metric: Metric
-        value: Avg
-        unit: Unit
-      metric:
-        Utilization:
-          value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD)))
-          unit: pct
-        Peak Bandwidth:
-          value: ((100 * AVG(((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp))))
-            / ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan)))
-          unit: pct
-        Hit Rate:
-          value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-            + TCC_MISS_sum) != 0) else 0))
-          unit: pct
-        L2-Fabric Read BW:
-          value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
-            * 64)) / (End_Timestamp - Start_Timestamp)))
-          unit: GB/s
-        L2-Fabric Write and Atomic BW:
-          value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
-            * 32)) / (End_Timestamp - Start_Timestamp)))
-          unit: GB/s
-        HBM Bandwidth:
-          value: $hbmBandwidth
-          unit: GB/s
-  - metric_table:
-      id: 1702
-      title: L2-Fabric interface metrics
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      metric:
-        Read BW:
-          avg: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
-            * 64)) / $denom))
-          min: MIN((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
-            * 64)) / $denom))
-          max: MAX((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
-            * 64)) / $denom))
-          unit: (Bytes  + $normUnit)
-        HBM Read Traffic:
-          avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-            != 0) else None))
-          min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-            != 0) else None))
-          max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-            != 0) else None))
-          unit: pct
-        Remote Read Traffic:
-          avg: AVG((100 * ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum) / TCC_EA0_RDREQ_sum)
-            if (TCC_EA0_RDREQ_sum != 0) else None))
-          min: MIN((100 * ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum) / TCC_EA0_RDREQ_sum)
-            if (TCC_EA0_RDREQ_sum != 0) else None))
-          max: MAX((100 * ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum) / TCC_EA0_RDREQ_sum)
-            if (TCC_EA0_RDREQ_sum != 0) else None))
-          unit: pct
-        Uncached Read Traffic:
-          avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-            != 0) else None))
-          min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-            != 0) else None))
-          max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-            != 0) else None))
-          unit: pct
-        Write and Atomic BW:
-          avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
-            * 32)) / $denom))
-          min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
-            * 32)) / $denom))
-          max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
-            * 32)) / $denom))
-          unit: (Bytes  + $normUnit)
-        HBM Write and Atomic Traffic:
-          avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-            != 0) else None))
-          min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-            != 0) else None))
-          max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-            != 0) else None))
-          unit: pct
-        Remote Write and Atomic Traffic:
-          avg: AVG((100 * ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum) / TCC_EA0_WRREQ_sum)
-            if (TCC_EA0_WRREQ_sum != 0) else None))
-          min: MIN((100 * ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum) / TCC_EA0_WRREQ_sum)
-            if (TCC_EA0_WRREQ_sum != 0) else None))
-          max: MAX((100 * ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum) / TCC_EA0_WRREQ_sum)
-            if (TCC_EA0_WRREQ_sum != 0) else None))
-          unit: pct
-        Atomic Traffic:
-          avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-            != 0) else None))
-          min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-            != 0) else None))
-          max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-            != 0) else None))
-          unit: pct
-        Uncached Write and Atomic Traffic:
-          avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-            != 0) else None))
-          min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-            != 0) else None))
-          max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-            != 0) else None))
-          unit: pct
-        Read Latency:
-          avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-            != 0) else None))
-          min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-            != 0) else None))
-          max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-            != 0) else None))
-          unit: Cycles
-        Write and Atomic Latency:
-          avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-            != 0) else None))
-          min: MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-            != 0) else None))
-          max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-            != 0) else None))
-          unit: Cycles
-        Atomic Latency:
-          avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
-            != 0) else None))
-          min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
-            != 0) else None))
-          max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
-            != 0) else None))
-          unit: Cycles
-  - metric_table:
-      id: 1703
-      title: L2 Cache Accesses
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      metric:
-        Bandwidth:
-          avg: AVG((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp))
-          min: MIN((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp))
-          max: MAX((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp))
-          unit: Gbps
-        Req:
-          avg: AVG((TCC_REQ_sum / $denom))
-          min: MIN((TCC_REQ_sum / $denom))
-          max: MAX((TCC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
-        Read Req:
-          avg: AVG((TCC_READ_sum / $denom))
-          min: MIN((TCC_READ_sum / $denom))
-          max: MAX((TCC_READ_sum / $denom))
-          unit: (Req  + $normUnit)
-        Write Req:
-          avg: AVG((TCC_WRITE_sum / $denom))
-          min: MIN((TCC_WRITE_sum / $denom))
-          max: MAX((TCC_WRITE_sum / $denom))
-          unit: (Req  + $normUnit)
-        Atomic Req:
-          avg: AVG((TCC_ATOMIC_sum / $denom))
-          min: MIN((TCC_ATOMIC_sum / $denom))
-          max: MAX((TCC_ATOMIC_sum / $denom))
-          unit: (Req  + $normUnit)
-        Streaming Req:
-          avg: AVG((TCC_STREAMING_REQ_sum / $denom))
-          min: MIN((TCC_STREAMING_REQ_sum / $denom))
-          max: MAX((TCC_STREAMING_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
-        Probe Req:
-          avg: AVG((TCC_PROBE_sum / $denom))
-          min: MIN((TCC_PROBE_sum / $denom))
-          max: MAX((TCC_PROBE_sum / $denom))
-          unit: (Req  + $normUnit)
-        Cache Hit:
-          avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-            + TCC_MISS_sum) != 0) else None))
-          min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-            + TCC_MISS_sum) != 0) else None))
-          max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-            + TCC_MISS_sum) != 0) else None))
-          unit: pct
-        Hits:
-          avg: AVG((TCC_HIT_sum / $denom))
-          min: MIN((TCC_HIT_sum / $denom))
-          max: MAX((TCC_HIT_sum / $denom))
-          unit: (Hits  + $normUnit)
-        Misses:
-          avg: AVG((TCC_MISS_sum / $denom))
-          min: MIN((TCC_MISS_sum / $denom))
-          max: MAX((TCC_MISS_sum / $denom))
-          unit: (Misses  + $normUnit)
-        Writeback:
-          avg: AVG((TCC_WRITEBACK_sum / $denom))
-          min: MIN((TCC_WRITEBACK_sum / $denom))
-          max: MAX((TCC_WRITEBACK_sum / $denom))
-          unit: (Cachelines + $normUnit)
-        Writeback (Internal):
-          avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom))
-          min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom))
-          max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom))
-          unit: (Cachelines + $normUnit)
-        Writeback (vL1D Req):
-          avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
-          min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
-          max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
-          unit: (Cachelines + $normUnit)
-        Evict (Internal):
-          avg: AVG((TCC_NORMAL_EVICT_sum / $denom))
-          min: MIN((TCC_NORMAL_EVICT_sum / $denom))
-          max: MAX((TCC_NORMAL_EVICT_sum / $denom))
-          unit: (Cachelines + $normUnit)
-        Evict (vL1D Req):
-          avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
-          min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
-          max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
-          unit: (Cachelines + $normUnit)
-        NC Req:
-          avg: AVG((TCC_NC_REQ_sum / $denom))
-          min: MIN((TCC_NC_REQ_sum / $denom))
-          max: MAX((TCC_NC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
-        UC Req:
-          avg: AVG((TCC_UC_REQ_sum / $denom))
-          min: MIN((TCC_UC_REQ_sum / $denom))
-          max: MAX((TCC_UC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
-        CC Req:
-          avg: AVG((TCC_CC_REQ_sum / $denom))
-          min: MIN((TCC_CC_REQ_sum / $denom))
-          max: MAX((TCC_CC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
-        RW Req:
-          avg: AVG((TCC_RW_REQ_sum / $denom))
-          min: MIN((TCC_RW_REQ_sum / $denom))
-          max: MAX((TCC_RW_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
-  - metric_table:
-      id: 1704
-      title: L2 Cache Stalls
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      metric: {}
-  - metric_table:
-      id: 1705
-      title: L2 - Fabric Interface stalls
-      header:
-        metric: Metric
-        type: Type
-        transaction: Transaction
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      style:
-        type: simple_multi_bar
-      metric:
-        Write - Credit Starvation:
-          type: Credit Starvation
-          transaction: Write
-          avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum
-            != 0) else None))
-          min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum
-            != 0) else None))
-          max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum
-            != 0) else None))
-          unit: pct
-  - metric_table:
-      id: 1706
-      title: L2 - Fabric interface detailed metrics
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      metric:
-        Read (32B):
-          avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom))
-          min: MIN((TCC_EA0_RDREQ_32B_sum / $denom))
-          max: MAX((TCC_EA0_RDREQ_32B_sum / $denom))
-          unit: (Req  + $normUnit)
-        Read (64B):
-          avg: AVG(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
-          min: MIN(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
-          max: MAX(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
-          unit: (Req  + $normUnit)
-        Read (Uncached):
-          avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
-          min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
-          max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
-          unit: (Req  + $normUnit)
-        HBM Read:
-          avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom))
-          min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom))
-          max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom))
-          unit: (Req  + $normUnit)
-        Remote Read:
-          avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
-          min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
-          max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
-          unit: (Req  + $normUnit)
-        Write and Atomic (32B):
-          avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
-          min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
-          max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
-          unit: (Req  + $normUnit)
-        Write and Atomic (Uncached):
-          avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
-          min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
-          max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
-          unit: (Req  + $normUnit)
-        Write and Atomic (64B):
-          avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom))
-          min: MIN((TCC_EA0_WRREQ_64B_sum / $denom))
-          max: MAX((TCC_EA0_WRREQ_64B_sum / $denom))
-          unit: (Req  + $normUnit)
-        HBM Write and Atomic:
-          avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom))
-          min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom))
-          max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom))
-          unit: (Req  + $normUnit)
-        Remote Write and Atomic:
-          avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
-          min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
-          max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
-          unit: (Req  + $normUnit)
-        Atomic:
-          avg: AVG((TCC_EA0_ATOMIC_sum / $denom))
-          min: MIN((TCC_EA0_ATOMIC_sum / $denom))
-          max: MAX((TCC_EA0_ATOMIC_sum / $denom))
-          unit: (Req  + $normUnit)
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1800_l2_cache_per_channel.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1800_l2_cache_per_channel.yaml
index c509b68d04..98484413e9 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1800_l2_cache_per_channel.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1800_l2_cache_per_channel.yaml
@@ -2,10 +2,6 @@
 Panel Config:
   id: 1800
   title: L2 Cache (per Channel)
-  metrics_description:
-    L2 Cache Hit Rate: The percent of total number of requests to the L2 from all
-      clients that hit in the cache. As noted in the Speed-of-Light section, this
-      includes hit-on-miss requests.
   data source:
   - metric_table:
       id: 1801
@@ -321,3 +317,7 @@ Panel Config:
           ::_1: $total_l2_chan
       cli_style: simple_box
       tui_style: simple_box
+  metrics_description:
+    L2 Cache Hit Rate: The percent of total number of requests to the L2 from all
+      clients that hit in the cache. As noted in the Speed-of-Light section, this
+      includes hit-on-miss requests.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/2100_pc_sampling.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/2100_pc_sampling.yaml
index e94471d7dc..16e4d01e7e 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/2100_pc_sampling.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/2100_pc_sampling.yaml
@@ -2,10 +2,10 @@
 Panel Config:
   id: 2100
   title: PC Sampling
-  metrics_description: {}
   data source:
   - pc_sampling_table:
       id: 2101
       title: PC Sampling
       source: ps_file
       comparable: false
+  metrics_description: {}
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/config_delta/gfx950_diff.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/config_delta/gfx950_diff.yaml
new file mode 100644
index 0000000000..b90fd37e86
--- /dev/null
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/config_delta/gfx950_diff.yaml
@@ -0,0 +1,1128 @@
+# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated by tools/config_management/generate_config_deltas.py
+Addition:
+  - Panel Config:
+      id: 200
+      title: System Speed-of-Light
+    metric_tables:
+      - metric_table:
+          id: 201
+          title: System Speed-of-Light
+          metrics:
+            - MFMA FLOPs (F6F4):
+                value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp)))
+                unit: GFLOP/s
+                peak: ((($max_sclk * $cu_per_gpu) * 16384) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 16384) / 1000))
+            - MFMA FLOPs (F8):
+                value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
+                unit: GFLOP/s
+                peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
+  - Panel Config:
+      id: 400
+      title: Roofline
+    metric_tables:
+      - metric_table:
+          id: 401
+          title: Roofline Performance Rates
+          metrics:
+            - MFMA FLOPs (F6F4):
+                value: |
+                  AVG((((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
+                unit: GFLOP/s
+                peak: $MFMA_FLOPs_F6F4_empirical_peak
+            - MFMA FLOPs (F8):
+                value: |
+                  AVG((((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
+                unit: GFLOP/s
+                peak: $MFMAF8Flops_empirical_peak
+  - Panel Config:
+      id: 500
+      title: Command Processor (CPC/CPF)
+    metric_tables:
+      - metric_table:
+          id: 502
+          title: Command processor packet processor (CPC)
+          metrics:
+            - CPC ADC Utilization:
+                avg: AVG((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None)
+                min: MIN((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None)
+                max: MAX((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None)
+                unit: pct
+            - CPC SYNC FIFO Full Rate:
+                avg: |
+                  AVG((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY != 0) else None)
+                min: |
+                  MIN((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY != 0) else None)
+                max: |
+                  MAX((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY != 0) else None)
+                unit: pct
+            - CPC CANE Stall Rate:
+                avg: AVG((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None)
+                min: MIN((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None)
+                max: MAX((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None)
+                unit: pct
+  - Panel Config:
+      id: 600
+      title: Workgroup Manager (SPI)
+    metric_tables:
+      - metric_table:
+          id: 601
+          title: Workgroup manager utilizations
+          metrics:
+            - Schedule-Pipe Wave Occupancy:
+                avg: |
+                  AVG(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY + SPI_CSQ_P3_OCCUPANCY)
+                min: |
+                  MIN(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY + SPI_CSQ_P3_OCCUPANCY)
+                max: |
+                  MAX(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY + SPI_CSQ_P3_OCCUPANCY)
+                unit: Wave
+            - Scheduler-Pipe Wave Utilization:
+                avg: |
+                  AVG(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
+                min: |
+                  MIN(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
+                max: |
+                  MAX(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
+                unit: Pct
+      - metric_table:
+          id: 602
+          title: Workgroup Manager - Resource Allocation
+          metrics:
+            - Scheduler-Pipe FIFO Full Rate:
+                avg: |
+                  AVG((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + SPI_CS2_CRAWLER_STALL + SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
+                min: |
+                  MIN((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + SPI_CS2_CRAWLER_STALL + SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
+                max: |
+                  MAX((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + SPI_CS2_CRAWLER_STALL + SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
+                unit: Pct
+  - Panel Config:
+      id: 1000
+      title: Compute Units - Instruction Mix
+    metric_tables:
+      - metric_table:
+          id: 1001
+          title: Overall Instruction Mix
+          metrics:
+            - MFMA:
+                avg: AVG((SQ_INSTS_MFMA / $denom))
+                min: MIN((SQ_INSTS_MFMA / $denom))
+                max: MAX((SQ_INSTS_MFMA / $denom))
+                unit: (instr + $normUnit)
+            - VALU:
+                avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
+                min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
+                max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
+                unit: (instr + $normUnit)
+            - VMEM:
+                avg: AVG(((SQ_INSTS_VMEM) / $denom))
+                min: MIN(((SQ_INSTS_VMEM) / $denom))
+                max: MAX(((SQ_INSTS_VMEM) / $denom))
+                unit: (instr + $normUnit)
+      - metric_table:
+          id: 1002
+          title: VALU Arithmetic Instruction Mix
+          metrics:
+            - INT32:
+                avg: AVG((SQ_INSTS_VALU_INT32 / $denom))
+                min: MIN((SQ_INSTS_VALU_INT32 / $denom))
+                max: MAX((SQ_INSTS_VALU_INT32 / $denom))
+                unit: (instr + $normUnit)
+            - F32-Trans:
+                avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom))
+                min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom))
+                max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom))
+                unit: (instr + $normUnit)
+            - F64-FMA:
+                avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom))
+                min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom))
+                max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom))
+                unit: (instr + $normUnit)
+            - F16-FMA:
+                avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom))
+                min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom))
+                max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom))
+                unit: (instr + $normUnit)
+            - F16-MUL:
+                avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom))
+                min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom))
+                max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom))
+                unit: (instr + $normUnit)
+            - INT64:
+                avg: AVG((SQ_INSTS_VALU_INT64 / $denom))
+                min: MIN((SQ_INSTS_VALU_INT64 / $denom))
+                max: MAX((SQ_INSTS_VALU_INT64 / $denom))
+                unit: (instr + $normUnit)
+            - F32-MUL:
+                avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom))
+                min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom))
+                max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom))
+                unit: (instr + $normUnit)
+            - F64-MUL:
+                avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom))
+                min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom))
+                max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom))
+                unit: (instr + $normUnit)
+            - F32-FMA:
+                avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom))
+                min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom))
+                max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom))
+                unit: (instr + $normUnit)
+            - F64-ADD:
+                avg: AVG((SQ_INSTS_VALU_ADD_F64 / $denom))
+                min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom))
+                max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom))
+                unit: (instr + $normUnit)
+            - F16-Trans:
+                avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom))
+                min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom))
+                max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom))
+                unit: (instr + $normUnit)
+            - F64-Trans:
+                avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom))
+                min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom))
+                max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom))
+                unit: (instr + $normUnit)
+            - F16-ADD:
+                avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom))
+                min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom))
+                max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom))
+                unit: (instr + $normUnit)
+            - F32-ADD:
+                avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom))
+                min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom))
+                max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom))
+                unit: (instr + $normUnit)
+            - Conversion:
+                avg: AVG((SQ_INSTS_VALU_CVT / $denom))
+                min: MIN((SQ_INSTS_VALU_CVT / $denom))
+                max: MAX((SQ_INSTS_VALU_CVT / $denom))
+                unit: (instr + $normUnit)
+      - metric_table:
+          id: 1003
+          title: VMEM Instruction Mix
+          metrics:
+            - Spill/Stack Coalesceable Instr:
+                avg: AVG((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom))
+                min: MIN((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom))
+                max: MAX((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom))
+                unit: (instr + $normUnit)
+      - metric_table:
+          id: 1004
+          title: MFMA Arithmetic Instruction Mix
+          metrics:
+            - MFMA-F16:
+                avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom))
+                min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom))
+                max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom))
+                unit: (instr + $normUnit)
+            - MFMA-I8:
+                avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom))
+                min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom))
+                max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom))
+                unit: (instr + $normUnit)
+            - MFMA-F64:
+                avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom))
+                min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom))
+                max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom))
+                unit: (instr + $normUnit)
+            - MFMA-F32:
+                avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom))
+                min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom))
+                max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom))
+                unit: (instr + $normUnit)
+            - MFMA-F6F4:
+                avg: AVG((SQ_INSTS_VALU_MFMA_F6F4 / $denom))
+                min: MIN((SQ_INSTS_VALU_MFMA_F6F4 / $denom))
+                max: MAX((SQ_INSTS_VALU_MFMA_F6F4 / $denom))
+                unit: (instr + $normUnit)
+            - MFMA-F8:
+                avg: AVG((SQ_INSTS_VALU_MFMA_F8 / $denom))
+                min: MIN((SQ_INSTS_VALU_MFMA_F8 / $denom))
+                max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom))
+                unit: (instr + $normUnit)
+            - MFMA-BF16:
+                avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom))
+                min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom))
+                max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom))
+                unit: (instr + $normUnit)
+  - Panel Config:
+      id: 1100
+      title: Compute Units - Compute Pipeline
+    metric_tables:
+      - metric_table:
+          id: 1101
+          title: Compute Speed-of-Light
+          metrics:
+            - MFMA IOPs (INT8):
+                value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
+                unit: GIOP
+                peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
+            - MFMA FLOPs (BF16):
+                value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
+                unit: GFLOP
+                peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
+            - VALU FLOPs:
+                value: |
+                  AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
+                unit: GFLOP
+                peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
+                pop: |
+                  ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
+            - MFMA FLOPs (F64):
+                value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
+                unit: GFLOP
+                peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000))
+            - MFMA FLOPs (F16):
+                value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
+                unit: GFLOP
+                peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
+            - MFMA FLOPs (F6F4):
+                value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp)))
+                unit: GFLOP
+                peak: ((($max_sclk * $cu_per_gpu) * 16384) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 16384) / 1000))
+            - MFMA FLOPs (F8):
+                value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
+                unit: GFLOP
+                peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
+            - MFMA FLOPs (F32):
+                value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
+                unit: GFLOP
+                peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
+            - VALU IOPs:
+                value: |
+                  AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - Start_Timestamp)))
+                unit: GIOP
+                peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
+                pop: |
+                  ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
+      - metric_table:
+          id: 1102
+          title: Pipeline Statistics
+          metrics:
+            - MFMA Utilization:
+                avg: |
+                  AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
+                min: |
+                  MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
+                max: |
+                  MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
+                unit: pct
+            - VMEM Utilization:
+                avg: |
+                  AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
+                min: |
+                  MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
+                max: |
+                  MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
+                unit: pct
+            - VMEM Latency:
+                avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) else None))
+                min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) else None))
+                max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) else None))
+                unit: Cycles
+                coll_level: SQ_INST_LEVEL_VMEM
+            - MFMA Instruction Cycles:
+                avg: |
+                  AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != 0) else None))
+                min: |
+                  MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != 0) else None))
+                max: |
+                  MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA != 0) else None))
+                unit: cycles/instr
+            - SMEM Latency:
+                avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) else None))
+                min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) else None))
+                max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) else None))
+                unit: Cycles
+                coll_level: SQ_INST_LEVEL_SMEM
+            - Branch Utilization:
+                avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
+                min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
+                max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
+                unit: pct
+            - VALU Co-Issue Efficiency:
+                avg: AVG((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2))
+                min: MIN((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2))
+                max: MAX((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2))
+                unit: pct
+      - metric_table:
+          id: 1103
+          title: Arithmetic Operations
+          metrics:
+            - F8 OPs:
+                avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
+                min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
+                max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
+                unit: (OPs + $normUnit)
+            - BF16 OPs:
+                avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
+                min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
+                max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
+                unit: (OPs + $normUnit)
+            - INT8 OPs:
+                avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
+                min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
+                max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
+                unit: (OPs + $normUnit)
+            - IOPs (Total):
+                avg: |
+                  AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / $denom)
+                min: |
+                  MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / $denom)
+                max: |
+                  MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / $denom)
+                unit: (OPs + $normUnit)
+            - F32 OPs:
+                avg: |
+                  AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) / $denom))
+                min: |
+                  MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) / $denom))
+                max: |
+                  MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) / $denom))
+                unit: (OPs + $normUnit)
+            - F16 OPs:
+                avg: |
+                  AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
+                min: |
+                  MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
+                max: |
+                  MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
+                unit: (OPs + $normUnit)
+            - F6F4 OPs:
+                avg: AVG((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom)
+                min: MIN((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom)
+                max: MAX((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom)
+                unit: (OPs + $normUnit)
+            - FLOPs (Total):
+                avg: |
+                  AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom))
+                min: |
+                  MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom))
+                max: |
+                  MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom))
+                unit: (OPs + $normUnit)
+            - F64 OPs:
+                avg: |
+                  AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom))
+                min: |
+                  MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom))
+                max: |
+                  MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom))
+                unit: (OPs + $normUnit)
+  - Panel Config:
+      id: 1200
+      title: Local Data Share (LDS)
+    metric_tables:
+      - metric_table:
+          id: 1202
+          title: LDS Statistics
+          metrics:
+            - LDS STORE Bandwidth:
+                avg: AVG(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                min: MIN(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                max: MAX(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - LDS Data FIFO Full Rate:
+                avg: AVG((SQ_LDS_DATA_FIFO_FULL / $denom))
+                min: MIN((SQ_LDS_DATA_FIFO_FULL / $denom))
+                max: MAX((SQ_LDS_DATA_FIFO_FULL / $denom))
+                unit: (Cycles + $normUnit)
+            - LDS LOAD Bandwidth:
+                avg: AVG(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                min: MIN(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                max: MAX(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - LDS STORE:
+                avg: AVG((SQ_INSTS_LDS_STORE / $denom))
+                min: MIN((SQ_INSTS_LDS_STORE / $denom))
+                max: MAX((SQ_INSTS_LDS_STORE / $denom))
+                unit: (instr + $normUnit)
+            - LDS ATOMIC:
+                avg: AVG((SQ_INSTS_LDS_ATOMIC / $denom))
+                min: MIN((SQ_INSTS_LDS_ATOMIC / $denom))
+                max: MAX((SQ_INSTS_LDS_ATOMIC / $denom))
+                unit: (instr + $normUnit)
+            - LDS Command FIFO Full Rate:
+                avg: AVG((SQ_LDS_CMD_FIFO_FULL / $denom))
+                min: MIN((SQ_LDS_CMD_FIFO_FULL / $denom))
+                max: MAX((SQ_LDS_CMD_FIFO_FULL / $denom))
+                unit: (Cycles + $normUnit)
+            - LDS LOAD:
+                avg: AVG((SQ_INSTS_LDS_LOAD / $denom))
+                min: MIN((SQ_INSTS_LDS_LOAD / $denom))
+                max: MAX((SQ_INSTS_LDS_LOAD / $denom))
+                unit: (instr + $normUnit)
+            - LDS ATOMIC Bandwidth:
+                avg: AVG(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                min: MIN(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                max: MAX(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+  - Panel Config:
+      id: 1500
+      title: Address Processing Unit and Data Return Path (TA/TD)
+    metric_tables:
+      - metric_table:
+          id: 1504
+          title: Vector L1 data-return path or Texture Data (TD)
+          metrics:
+            - Workgroup manager → Data-Return Stall:
+                avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
+                min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
+                max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
+                unit: pct
+            - Write Ack Instructions:
+                avg: AVG((TD_WRITE_ACKT_WAVEFRONT_sum / $denom))
+                min: MIN((TD_WRITE_ACKT_WAVEFRONT_sum / $denom))
+                max: MAX((TD_WRITE_ACKT_WAVEFRONT_sum / $denom))
+                unit: (Instructions + $normUnit)
+      - metric_table:
+          id: 1501
+          title: Busy and stall metrics
+          metrics:
+            - Sequencer → TA Data Stall:
+                avg: AVG((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
+                min: MIN((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
+                max: MAX((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
+                unit: (Cycles + $normUnit)
+            - Sequencer → TA Command Stall:
+                avg: AVG((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
+                min: MIN((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
+                max: MAX((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
+                unit: (Cycles + $normUnit)
+            - Sequencer → TA Address Stall:
+                avg: AVG((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
+                min: MIN((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
+                max: MAX((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
+                unit: (Cycles + $normUnit)
+      - metric_table:
+          id: 1502
+          title: Instruction counts
+          metrics:
+            - Global/Generic Read Instructions for LDS:
+                avg: AVG((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom))
+                min: MIN((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom))
+                max: MAX((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom))
+                unit: (Instructions + $normUnit)
+            - Spill/Stack Read Instructions for LDS:
+                avg: AVG((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom))
+                min: MIN((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom))
+                max: MAX((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom))
+                unit: (Instructions + $normUnit)
+  - Panel Config:
+      id: 1600
+      title: Vector L1 Data Cache
+    metric_tables:
+      - metric_table:
+          id: 1602
+          title: vL1D cache stall metrics
+          metrics:
+            - Stalled on Address:
+                expr: |
+                  (((100 * TCP_TCP_TA_ADDR_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None)
+            - Stalled on Read Return:
+                expr: |
+                  (((100 * TCP_TCR_RDRET_STALL_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None)
+            - Stalled on Data:
+                expr: |
+                  (((100 * TCP_TCP_TA_DATA_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None)
+            - Stalled on Latency FIFO:
+                expr: |
+                  (((100 * TCP_LFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None)
+            - Stalled on Request FIFO:
+                expr: |
+                  (((100 * TCP_RFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None)
+      - metric_table:
+          id: 1603
+          title: vL1D cache access metrics
+          metrics:
+            - Tag RAM 2 Req:
+                avg: AVG((TCP_TAGRAM2_REQ_sum / $denom))
+                min: MIN((TCP_TAGRAM2_REQ_sum / $denom))
+                max: MAX((TCP_TAGRAM2_REQ_sum / $denom))
+                unit: (Req + $normUnit)
+            - Tag RAM 3 Req:
+                avg: AVG((TCP_TAGRAM3_REQ_sum / $denom))
+                min: MIN((TCP_TAGRAM3_REQ_sum / $denom))
+                max: MAX((TCP_TAGRAM3_REQ_sum / $denom))
+                unit: (Req + $normUnit)
+            - Tag RAM 1 Req:
+                avg: AVG((TCP_TAGRAM1_REQ_sum / $denom))
+                min: MIN((TCP_TAGRAM1_REQ_sum / $denom))
+                max: MAX((TCP_TAGRAM1_REQ_sum / $denom))
+                unit: (Req + $normUnit)
+            - Tag RAM 0 Req:
+                avg: AVG((TCP_TAGRAM0_REQ_sum / $denom))
+                min: MIN((TCP_TAGRAM0_REQ_sum / $denom))
+                max: MAX((TCP_TAGRAM0_REQ_sum / $denom))
+                unit: (Req + $normUnit)
+      - metric_table:
+          id: 1605
+          title: L1 Unified Translation Cache (UTCL1)
+          metrics:
+            - Misses under Translation Miss:
+                avg: AVG((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom))
+                min: MIN((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom))
+                max: MAX((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom))
+                units: (Req + $normUnit)
+            - Inflight Req:
+                avg: AVG((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom))
+                min: MIN((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom))
+                max: MAX((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom))
+                units: (Req + $normUnit)
+      - metric_table:
+          id: 1606
+          title: L1D Addr Translation Stalls
+          metrics:
+            - Thrashing Stall:
+                avg: AVG((TCP_UTCL1_THRASHING_STALL_sum / $denom))
+                min: MIN((TCP_UTCL1_THRASHING_STALL_sum / $denom))
+                max: MAX((TCP_UTCL1_THRASHING_STALL_sum / $denom))
+                units: (Cycles + $normUnit)
+            - Serialization Stall:
+                avg: AVG((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom))
+                min: MIN((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom))
+                max: MAX((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom))
+                units: (Cycles + $normUnit)
+            - Cache Miss Stall:
+                avg: AVG((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom))
+                min: MIN((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom))
+                max: MAX((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom))
+                units: (Cycles + $normUnit)
+            - Resident Page Full Stall:
+                avg: AVG((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom))
+                min: MIN((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom))
+                max: MAX((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom))
+                units: (Cycles + $normUnit)
+            - Cache Full Stall:
+                avg: AVG((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom))
+                min: MIN((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom))
+                max: MAX((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom))
+                units: (Cycles + $normUnit)
+            - UTCL2 Stall:
+                avg: AVG((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom))
+                min: MIN((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom))
+                max: MAX((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom))
+                units: (Cycles + $normUnit)
+            - Latency FIFO Stall:
+                avg: AVG((TCP_UTCL1_LFIFO_FULL_sum / $denom))
+                min: MIN((TCP_UTCL1_LFIFO_FULL_sum / $denom))
+                max: MAX((TCP_UTCL1_LFIFO_FULL_sum / $denom))
+                units: (Cycles + $normUnit)
+  - Panel Config:
+      id: 1700
+      title: L2 Cache
+    metric_tables:
+      - metric_table:
+          id: 1702
+          title: L2-Fabric interface metrics
+          metrics:
+            - Read Stall:
+                avg: |
+                  AVG((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+            - Write Stall:
+                avg: |
+                  AVG(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+      - metric_table:
+          id: 1703
+          title: L2 Cache Accesses
+          metrics:
+            - Read Bandwidth:
+                avg: AVG(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Atomic Bandwidth:
+                avg: AVG(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Bypass Req:
+                avg: AVG((TCC_BYPASS_REQ_sum / $denom))
+                min: MIN((TCC_BYPASS_REQ_sum / $denom))
+                max: MAX((TCC_BYPASS_REQ_sum / $denom))
+                unit: (Req + $normUnit)
+            - Input Buffer Req:
+                avg: AVG((TCC_IB_REQ_sum / $denom))
+                min: MIN((TCC_IB_REQ_sum / $denom))
+                max: MAX((TCC_IB_REQ_sum / $denom))
+                unit: (Req + $normUnit)
+            - Write Bandwidth:
+                avg: AVG(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+      - metric_table:
+          id: 1704
+          title: L2 Cache Stalls
+          metrics:
+            - Stalled on Latency FIFO:
+                avg: AVG(TCC_LATENCY_FIFO_FULL_sum / $denom)
+                min: MIN(TCC_LATENCY_FIFO_FULL_sum / $denom)
+                max: MAX(TCC_LATENCY_FIFO_FULL_sum / $denom)
+                unit: (Cycles + $normUnit)
+            - Input Buffer Stalled on L2:
+                avg: AVG(TCC_IB_STALL_sum / $denom)
+                min: MIN(TCC_IB_STALL_sum / $denom)
+                max: MAX(TCC_IB_STALL_sum / $denom)
+                unit: (Cycles + $normUnit)
+            - Stalled on Write Data FIFO:
+                avg: AVG(TCC_SRC_FIFO_FULL_sum / $denom)
+                min: MIN(TCC_SRC_FIFO_FULL_sum / $denom)
+                max: MAX(TCC_SRC_FIFO_FULL_sum / $denom)
+                unit: (Cycles + $normUnit)
+      - metric_table:
+          id: 1705
+          title: L2 - Fabric Interface stalls
+          metrics:
+            - Read - HBM Stall:
+                type: HBM Stall
+                transaction: Read
+                avg: |
+                  AVG(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+            - Write - HBM Stall:
+                type: HBM Stall
+                transaction: Write
+                avg: |
+                  AVG(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+            - Read - PCIe Stall:
+                type: PCIe Stall
+                transaction: Read
+                avg: |
+                  AVG(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+            - Write - Infinity Fabric Stall:
+                type: Infinity Fabric™ Stall
+                transaction: Write
+                avg: |
+                  AVG(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+            - Write - PCIe Stall:
+                type: PCIe Stall
+                transaction: Write
+                avg: |
+                  AVG(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+            - Read - Infinity Fabric Stall:
+                type: Infinity Fabric™ Stall
+                transaction: Read
+                avg: |
+                  AVG(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+      - metric_table:
+          id: 1706
+          title: L2 - Fabric interface detailed metrics
+          metrics:
+            - Atomic - HBM:
+                avg: AVG((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom))
+                min: MIN((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom))
+                max: MAX((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom))
+                unit: (Req + $normUnit)
+            - Atomic Bandwidth - PCIe:
+                avg: AVG(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Write Bandwidth - PCIe:
+                avg: AVG(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Write Bandwidth - HBM:
+                avg: AVG(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Read Bandwidth - HBM:
+                avg: AVG(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Read Bandwidth - Infinity Fabric™:
+                avg: AVG(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Atomic Bandwidth - Infinity Fabric™:
+                avg: AVG(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Atomic Bandwidth - HBM:
+                avg: AVG(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Read Bandwidth - PCIe:
+                avg: AVG(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Write Bandwidth - Infinity Fabric™:
+                avg: AVG(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Read (128B):
+                avg: AVG((TCC_EA0_RDREQ_128B_sum / $denom))
+                min: MIN((TCC_EA0_RDREQ_128B_sum / $denom))
+                max: MAX((TCC_EA0_RDREQ_128B_sum / $denom))
+                unit: (Req + $normUnit)
+
+Deletion:
+  []
+
+Modification:
+  - Panel Config:
+      id: 200
+      title: System Speed-of-Light
+    metric_tables:
+      - metric_table:
+          id: 201
+          title: System Speed-of-Light
+          metrics:
+            - VALU FLOPs:
+                value: |
+                  AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
+                pop: |
+                  ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
+            - MFMA FLOPs (F16):
+                value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
+                peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
+            - MFMA FLOPs (F64):
+                value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000))
+                peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000)
+            - VMEM Utilization:
+                value: |
+                  AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
+                pop: |
+                  AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
+            - VALU IOPs:
+                value: |
+                  AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - Start_Timestamp)))
+                pop: |
+                  ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
+            - Branch Utilization:
+                value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
+                pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
+            - L2 Cache BW:
+                value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))
+                pop: |
+                  ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
+                peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))
+            - MFMA Utilization:
+                value: |
+                  AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu) * 4)))
+                pop: |
+                  AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu) * 4)))
+            - L2-Fabric Read BW:
+                value: |
+                  AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp))
+                pop: |
+                  ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth)
+            - vL1D Cache BW:
+                value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
+                pop: |
+                  ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu))
+                peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu)
+            - MFMA IOPs (Int8):
+                value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
+                peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
+            - MFMA FLOPs (F32):
+                value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
+            - MFMA FLOPs (BF16):
+                value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
+                peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
+  - Panel Config:
+      id: 300
+      title: Memory Chart
+    metric_tables:
+      - metric_table:
+          id: 301
+          title: Memory Chart
+          metrics:
+            - Wavefronts:
+                value: ROUND(AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE), 0)
+            - MFMA:
+                value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0)
+            - Workgroups:
+                value: |
+                  ROUND(AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS), 0)
+  - Panel Config:
+      id: 400
+      title: Roofline
+    metric_tables:
+      - metric_table:
+          id: 402
+          title: Roofline Plot Points
+          metrics:
+            - AI HBM:
+                value: |
+                  ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) )
+            - AI L2:
+                value: |
+                  ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64 ) )
+            - Performance (GFLOPs):
+                value: |
+                  ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / (SUM(End_Timestamp - Start_Timestamp) / 1e9) ) / 1e9
+            - AI L1:
+                value: |
+                  ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64) )
+  - Panel Config:
+      id: 600
+      title: Workgroup Manager (SPI)
+    metric_tables:
+      - metric_table:
+          id: 601
+          title: Workgroup manager utilizations
+          metrics:
+            - SGPR Writes:
+                max: |
+                  MAX((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+                avg: |
+                  AVG((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+                min: |
+                  MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+            - Dispatched Wavefronts:
+                max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
+                avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
+                min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
+            - Dispatched Workgroups:
+                max: |
+                  MAX(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
+                avg: |
+                  AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
+                min: |
+                  MIN(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
+            - Scheduler-Pipe Utilization:
+                max: |
+                  MAX(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
+                avg: |
+                  AVG(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
+                min: |
+                  MIN(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
+            - VGPR Writes:
+                max: |
+                  MAX((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+                avg: |
+                  AVG((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+                min: |
+                  MIN((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+  - Panel Config:
+      id: 700
+      title: Wavefront
+    metric_tables:
+      - metric_table:
+          id: 701
+          title: Wavefront Launch Stats
+          metrics:
+            - Total Wavefronts:
+                max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
+                avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
+                min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
+  - Panel Config:
+      id: 1600
+      title: Vector L1 Data Cache
+    metric_tables:
+      - metric_table:
+          id: 1601
+          title: vL1D Speed-of-Light
+          metrics:
+            - Bandwidth Utilization:
+                value: |
+                  ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu))
+      - metric_table:
+          id: 1603
+          title: vL1D cache access metrics
+          metrics:
+            - L1-L2 Write Latency:
+                max: MAX((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
+                avg: AVG((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
+                min: MIN((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
+                unit: (Cycles + $normUnit)
+            - L1-L2 BW:
+                max: |
+                  MAX(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp)))
+                avg: |
+                  AVG(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp)))
+                min: |
+                  MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp)))
+            - L1 Access Latency:
+                max: MAX((TCP_TCP_LATENCY_sum / $denom))
+                avg: AVG((TCP_TCP_LATENCY_sum / $denom))
+                min: MIN((TCP_TCP_LATENCY_sum / $denom))
+                unit: (Cycles + $normUnit)
+            - L1-L2 Read Latency:
+                max: MAX((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
+                avg: AVG((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
+                min: MIN((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
+                unit: (Cycles + $normUnit)
+            - Cache BW:
+                max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
+                avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
+                min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
+  - Panel Config:
+      id: 1700
+      title: L2 Cache
+    metric_tables:
+      - metric_table:
+          id: 1701
+          title: L2 Speed-of-Light
+          metrics:
+            - L2-Fabric Read BW:
+                value: |
+                  AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
+            - Peak Bandwidth:
+                value: |
+                  ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
+      - metric_table:
+          id: 1702
+          title: L2-Fabric interface metrics
+          metrics:
+            - Remote Write and Atomic Traffic:
+                max: |
+                  MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
+                avg: |
+                  AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
+                min: |
+                  MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
+            - Write and Atomic BW:
+                max: |
+                  MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))
+                avg: |
+                  AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))
+                min: |
+                  MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))
+                unit: Gbps
+            - Remote Read Traffic:
+                max: |
+                  MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
+                avg: |
+                  AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
+                min: |
+                  MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
+            - Read BW:
+                max: |
+                  MAX((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
+                avg: |
+                  AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
+                min: |
+                  MIN((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
+                unit: Gbps
+      - metric_table:
+          id: 1703
+          title: L2 Cache Accesses
+          metrics:
+            - Bandwidth:
+                max: MAX((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
+                avg: AVG((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
+                min: MIN((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
+      - metric_table:
+          id: 1706
+          title: L2 - Fabric interface detailed metrics
+          metrics:
+            - HBM Write and Atomic:
+                max: MAX((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
+                avg: AVG((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
+                min: MIN((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
+            - Read (64B):
+                max: MAX((TCC_EA0_RDREQ_64B_sum / $denom))
+                avg: AVG((TCC_EA0_RDREQ_64B_sum / $denom))
+                min: MIN((TCC_EA0_RDREQ_64B_sum / $denom))
+  - Panel Config:
+      id: 1800
+      title: L2 Cache (per Channel)
+    metric_tables:
+      - metric_table:
+          id: 1801
+          title: Aggregate Stats (All channels)
+          metrics:
+            - L2 Cache Hit Rate:
+                max: |
+                  MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
+                std dev: |
+                  STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
+                avg: |
+                  AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
+                min: |
+                  MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
+      - metric_table:
+          id: 1809
+          title: L2-Fabric Read Stall (Cycles per normUnit)
+          metrics:
+            - ::_1:
+                ea read stall - pcie: AVG((TO_INT(TCC_EA0_RDREQ_IO_CREDIT_STALL[::_1]) / $denom))
+                ea read stall - hbm: AVG((TO_INT(TCC_EA0_RDREQ_DRAM_CREDIT_STALL[::_1]) / $denom))
+                ea read stall - if: AVG((TO_INT(TCC_EA0_RDREQ_GMI_CREDIT_STALL[::_1]) / $denom))
+      - metric_table:
+          id: 1810
+          title: L2-Fabric Write and Atomic Stall (Cycles per normUnit)
+          metrics:
+            - ::_1:
+                ea write stall - hbm: AVG((TO_INT(TCC_EA0_WRREQ_DRAM_CREDIT_STALL[::_1]) / $denom))
+                ea write stall - pcie: AVG((TO_INT(TCC_EA0_WRREQ_IO_CREDIT_STALL[::_1]) / $denom))
+                ea write stall - if: AVG((TO_INT(TCC_EA0_WRREQ_GMI_CREDIT_STALL[::_1]) / $denom))
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0000_top_stats.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0000_top_stats.yaml
index 55c6f6bb24..5ce5aeeb28 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0000_top_stats.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0000_top_stats.yaml
@@ -2,7 +2,6 @@
 Panel Config:
   id: 0
   title: Top Stats
-  metrics_description: {}
   data source:
   - raw_csv_table:
       id: 1
@@ -12,3 +11,4 @@ Panel Config:
       id: 2
       title: Dispatch List
       source: pmc_dispatch_info.csv
+  metrics_description: {}
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0100_system_info.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0100_system_info.yaml
index 23d024fde3..8b48c2253b 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0100_system_info.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0100_system_info.yaml
@@ -2,10 +2,10 @@
 Panel Config:
   id: 100
   title: System Info
-  metrics_description: {}
   data source:
   - raw_csv_table:
       id: 101
       title: System Info
       source: sysinfo.csv
       columnwise: true
+  metrics_description: {}
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0200_system_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0200_system_speed_of_light.yaml
index d7020cface..b8bdb7e664 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0200_system_speed_of_light.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0200_system_speed_of_light.yaml
@@ -2,124 +2,6 @@
 Panel Config:
   id: 200
   title: System Speed-of-Light
-  metrics_description:
-    VALU FLOPs: 'The total floating-point operations executed per second on the VALU.
-      This is also presented as a percent of the peak theoretical FLOPs achievable
-      on the specific accelerator. Note: this does not include any floating-point
-      operations from MFMA instructions.'
-    VALU IOPs: 'The total integer operations executed per second on the VALU. This
-      is also presented as a percent of the peak theoretical IOPs achievable on the
-      specific accelerator. Note: this does not include any integer operations from
-      MFMA instructions.'
-    MFMA FLOPs (F8): The total number of 8-bit brain floating point MFMA operations
-      executed per second. This does not include any 16-bit brain floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F8 MFMA operations achievable on the specific accelerator. It is supported on
-      AMD Instinct MI300 series and later only.
-    MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations
-      executed per second. Note: this does not include any 16-bit brain floating point
-      operations from VALU instructions. This is also presented as a percent of the
-      peak theoretical BF16 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed
-      per second. Note: this does not include any 16-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F16 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed
-      per second. Note: this does not include any 32-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F32 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed
-      per second. Note: this does not include any 64-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F64 MFMA operations achievable on the specific accelerator.'
-    MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed
-      per second. Note: this does not include any 8-bit integer operations from VALU
-      instructions. This is also presented as a percent of the peak theoretical INT8
-      MFMA operations achievable on the specific accelerator.'
-    Active CUs: Total number of active compute units (CUs) on the accelerator during
-      the kernel execution.
-    SALU Utilization: Indicates what percent of the kernel's duration the SALU was
-      busy executing instructions. Computed as the ratio of the total number of cycles
-      spent by the scheduler issuing SALU or SMEM instructions over the total CU cycles.
-    VALU Utilization: Indicates what percent of the kernel's duration the VALU was
-      busy executing instructions. Does not include VMEM operations. Computed as the
-      ratio of the total number of cycles spent by the scheduler issuing VALU instructions
-      over the total CU cycles.
-    MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit
-      was busy executing instructions. Computed as the ratio of the total number of
-      cycles the MFMA was busy over the total CU cycles.
-    VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit
-      was busy executing instructions, including both global/generic and spill/scratch
-      operations (see the VMEM instruction count metrics) for more detail). Does not
-      include VALU operations. Computed as the ratio of the total number of cycles
-      spent by the scheduler issuing VMEM instructions over the total CU cycles.
-    Branch Utilization: Indicates what percent of the kernel's duration the branch
-      unit was busy executing instructions. Computed as the ratio of the total number
-      of cycles spent by the scheduler issuing branch instructions over the total
-      CU cycles
-    VALU Active Threads: Indicates the average level of divergence within a wavefront
-      over the lifetime of the kernel. The number of work-items that were active in
-      a wavefront during execution of each VALU instruction, time-averaged over all
-      VALU instructions run on all wavefronts in the kernel.
-    IPC: The ratio of the total number of instructions executed on the CU over the
-      total active CU cycles. This is also presented as a percent of the peak theoretical
-      bandwidth achievable on the specific accelerator.
-    Wavefront Occupancy: 'The time-averaged number of wavefronts resident on the accelerator
-      over the lifetime of the kernel. Note: this metric may be inaccurate for short-running
-      kernels (less than 1ms). This is also presented as a percent of the peak theoretical
-      occupancy achievable on the specific accelerator.'
-    Theoretical LDS Bandwidth: Indicates the maximum amount of bytes that could have
-      been loaded from, stored to, or atomically updated in the LDS per unit time
-      (see LDS Bandwidth example for more detail). This is also presented as a percent
-      of the peak theoretical F64 MFMA operations achievable on the specific accelerator.
-    LDS Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS
-      scheduler due to bank conflicts (as determined by the conflict resolution hardware)
-      to the base number of cycles that would be spent in the LDS scheduler in a completely
-      uncontended case. This is also presented in normalized form (i.e., the Bank
-      Conflict Rate).
-    vL1D Cache Hit Rate: The ratio of the number of vL1D cache line requests that
-      hit in vL1D cache over the total number of cache line requests to the vL1D cache
-      RAM.
-    vL1D Cache BW: The number of bytes looked up in the vL1D cache as a result of
-      VMEM instructions per unit time. The number of bytes is calculated as the number
-      of cache lines requested multiplied by the cache line size. This value does
-      not consider partial requests, so e.g., if only a single value is requested
-      in a cache line, the data movement will still be counted as a full cache line.
-      This is also presented as a percent of the peak theoretical bandwidth achievable
-      on the specific accelerator.
-    L2 Cache Hit Rate: The ratio of the number of L2 cache line requests that hit
-      in the L2 cache over the total number of incoming cache line requests to the
-      L2 cache.
-    L2 Cache BW: The number of bytes looked up in the L2 cache per unit time. The
-      number of bytes is calculated as the number of cache lines requested multiplied
-      by the cache line size. This value does not consider partial requests, so e.g.,
-      if only a single value is requested in a cache line, the data movement will
-      still be counted as a full cache line. This is also presented as a percent of
-      the peak theoretical bandwidth achievable on the specific accelerator.
-    L2-Fabric Read BW: "The number of bytes read by the L2 over the Infinity Fabric\u2122\
-      \ interface per unit time. This is also presented as a percent of the peak theoretical\
-      \ bandwidth achievable on the specific accelerator."
-    L2-Fabric Write BW: The number of bytes sent by the L2 over the Infinity Fabric
-      interface by write and atomic operations per unit time. This is also presented
-      as a percent of the peak theoretical bandwidth achievable on the specific accelerator.
-    L2-Fabric Read Latency: The time-averaged number of cycles read requests spent
-      in Infinity Fabric before data was returned to the L2.
-    L2-Fabric Write Latency: The time-averaged number of cycles write requests spent
-      in Infinity Fabric before a completion acknowledgement was returned to the L2.
-    sL1D Cache Hit Rate: The percent of sL1D requests that hit on a previously loaded
-      line the cache. Calculated as the ratio of the number of sL1D requests that
-      hit over the number of all sL1D requests.
-    sL1D Cache BW: The number of bytes looked up in the sL1D cache per unit time.
-      This is also presented as a percent of the peak theoretical bandwidth achievable
-      on the specific accelerator.
-    L1I Hit Rate: The number of bytes looked up in the L1I cache per unit time. This
-      is also presented as a percent of the peak theoretical bandwidth achievable
-      on the specific accelerator.
-    L1I BW: The percent of L1I requests that hit on a previously loaded line the cache.
-      Calculated as the ratio of the number of L1I requests that hit over the number
-      of all L1I requests.
-    L1I Fetch Latency: The average number of cycles spent to fetch instructions to
-      a CU.
   data source:
   - metric_table:
       id: 201
@@ -335,3 +217,125 @@ Panel Config:
           peak: None
           pop: None
           coll_level: SQ_IFETCH_LEVEL
+  metrics_description:
+    VALU FLOPs: |-
+      The total floating-point operations executed per second on the VALU.
+      This is also presented as a percent of the peak theoretical FLOPs achievable
+      on the specific accelerator. Note: this does not include any floating-point
+      operations from MFMA instructions.
+    VALU IOPs: |-
+      The total integer operations executed per second on the VALU. This is
+      also presented as a percent of the peak theoretical IOPs achievable on the
+      specific accelerator. Note: this does not include any integer operations from
+      MFMA instructions.
+    MFMA FLOPs (BF16): |-
+      The total number of 16-bit brain floating point MFMA operations executed
+      per second. Note: this does not include any 16-bit brain floating point operations
+      from VALU instructions. This is also presented as a percent of the peak theoretical
+      BF16 MFMA operations achievable on the specific accelerator.
+    MFMA FLOPs (F16): |-
+      The total number of 16-bit floating point MFMA operations executed per
+      second. Note: this does not include any 16-bit floating point operations from
+      VALU instructions. This is also presented as a percent of the peak theoretical
+      F16 MFMA operations achievable on the specific accelerator.
+    MFMA FLOPs (F32): |-
+      The total number of 32-bit floating point MFMA operations executed per
+      second. Note: this does not include any 32-bit floating point operations from
+      VALU instructions. This is also presented as a percent of the peak theoretical
+      F32 MFMA operations achievable on the specific accelerator.
+    MFMA FLOPs (F64): |-
+      The total number of 64-bit floating point MFMA operations executed per
+      second. Note: this does not include any 64-bit floating point operations from
+      VALU instructions. This is also presented as a percent of the peak theoretical
+      F64 MFMA operations achievable on the specific accelerator.
+    MFMA IOPs (Int8): |-
+      The total number of 8-bit integer MFMA operations executed per second.
+      Note: this does not include any 8-bit integer operations from VALU instructions.
+      This is also presented as a percent of the peak theoretical INT8 MFMA operations
+      achievable on the specific accelerator.
+    Active CUs: Total number of active compute units (CUs) on the accelerator during
+      the kernel execution.
+    SALU Utilization: Indicates what percent of the kernel's duration the SALU was
+      busy executing instructions. Computed as the ratio of the total number of cycles
+      spent by the scheduler issuing SALU or SMEM instructions over the total CU cycles.
+    VALU Utilization: Indicates what percent of the kernel's duration the VALU was
+      busy executing instructions. Does not include VMEM operations. Computed as the
+      ratio of the total number of cycles spent by the scheduler issuing VALU instructions
+      over the total CU cycles.
+    MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit
+      was busy executing instructions. Computed as the ratio of the total number of
+      cycles the MFMA was busy over the total CU cycles.
+    VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit
+      was busy executing instructions, including both global/generic and spill/scratch
+      operations (see the VMEM instruction count metrics for more detail). Does not
+      include VALU operations. Computed as the ratio of the total number of cycles
+      spent by the scheduler issuing VMEM instructions over the total CU cycles.
+    Branch Utilization: Indicates what percent of the kernel's duration the branch
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles spent by the scheduler issuing branch instructions over the total
+      CU cycles.
+    VALU Active Threads: Indicates the average level of divergence within a wavefront
+      over the lifetime of the kernel. The number of work-items that were active in
+      a wavefront during execution of each VALU instruction, time-averaged over all
+      VALU instructions run on all wavefronts in the kernel.
+    IPC: The ratio of the total number of instructions executed on the CU over the
+      total active CU cycles. This is also presented as a percent of the peak theoretical
+      bandwidth achievable on the specific accelerator.
+    Wavefront Occupancy: |-
+      The time-averaged number of wavefronts resident on the accelerator over
+      the lifetime of the kernel. Note: this metric may be inaccurate for short-running
+      kernels (less than 1ms). This is also presented as a percent of the peak theoretical
+      occupancy achievable on the specific accelerator.
+    Theoretical LDS Bandwidth: Indicates the maximum amount of bytes that could have
+      been loaded from, stored to, or atomically updated in the LDS per unit time
+      (see LDS Bandwidth example for more detail). This is also presented as a percent
+      of the peak theoretical bandwidth achievable on the specific accelerator.
+    LDS Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS
+      scheduler due to bank conflicts (as determined by the conflict resolution hardware)
+      to the base number of cycles that would be spent in the LDS scheduler in a completely
+      uncontended case. This is also presented in normalized form (i.e., the Bank
+      Conflict Rate).
+    vL1D Cache Hit Rate: The ratio of the number of vL1D cache line requests that
+      hit in vL1D cache over the total number of cache line requests to the vL1D cache
+      RAM.
+    vL1D Cache BW: The number of bytes looked up in the vL1D cache as a result of
+      VMEM instructions per unit time. The number of bytes is calculated as the number
+      of cache lines requested multiplied by the cache line size. This value does
+      not consider partial requests, so e.g., if only a single value is requested
+      in a cache line, the data movement will still be counted as a full cache line.
+      This is also presented as a percent of the peak theoretical bandwidth achievable
+      on the specific accelerator.
+    L2 Cache Hit Rate: The ratio of the number of L2 cache line requests that hit
+      in the L2 cache over the total number of incoming cache line requests to the
+      L2 cache.
+    L2 Cache BW: The number of bytes looked up in the L2 cache per unit time. The
+      number of bytes is calculated as the number of cache lines requested multiplied
+      by the cache line size. This value does not consider partial requests, so e.g.,
+      if only a single value is requested in a cache line, the data movement will
+      still be counted as a full cache line. This is also presented as a percent of
+      the peak theoretical bandwidth achievable on the specific accelerator.
+    L2-Fabric Read BW: |-
+      The number of bytes read by the L2 over the Infinity Fabric™ interface
+      per unit time. This is also presented as a percent of the peak theoretical
+      bandwidth achievable on the specific accelerator.
+    L2-Fabric Write BW: The number of bytes sent by the L2 over the Infinity Fabric
+      interface by write and atomic operations per unit time. This is also presented
+      as a percent of the peak theoretical bandwidth achievable on the specific accelerator.
+    L2-Fabric Read Latency: The time-averaged number of cycles read requests spent
+      in Infinity Fabric before data was returned to the L2.
+    L2-Fabric Write Latency: The time-averaged number of cycles write requests spent
+      in Infinity Fabric before a completion acknowledgement was returned to the L2.
+    sL1D Cache Hit Rate: The percent of sL1D requests that hit on a previously loaded
+      line in the cache. Calculated as the ratio of the number of sL1D requests that
+      hit over the number of all sL1D requests.
+    sL1D Cache BW: The number of bytes looked up in the sL1D cache per unit time.
+      This is also presented as a percent of the peak theoretical bandwidth achievable
+      on the specific accelerator.
+    L1I Hit Rate: The percent of L1I requests that hit on a previously loaded line
+      in the cache. Calculated as the ratio of the number of L1I requests that hit
+      over the number of all L1I requests.
+    L1I BW: The number of bytes looked up in the L1I cache per unit time. This
+      is also presented as a percent of the peak theoretical bandwidth achievable
+      on the specific accelerator.
+    L1I Fetch Latency: The average number of cycles spent to fetch instructions to
+      a CU.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0300_memory_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0300_memory_chart.yaml
index 13d0527bca..1fd388cba1 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0300_memory_chart.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0300_memory_chart.yaml
@@ -2,122 +2,6 @@
 Panel Config:
   id: 300
   title: Memory Chart
-  metrics_description:
-    Wavefront Occupancy: Wavefronts per active CU.
-    Wave Life: Average number of cycles executing a wave.
-    SALU: Total Number of SALU (Scalar ALU) instructions issued per normalization
-      unit.
-    SMEM: Total number of SMEM (Scalar Memory Read) instructions issued normalization
-      unit.
-    VALU: The number of VALU (Vector ALU) instructions issued per normalization unit.
-    MFMA: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued per
-      normalization unit.
-    VMEM: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch
-      memory) per normalization unit.
-    LDS: The total number of LDS instructions (including, but not limited to, read/write/atomics
-      and HIP's __shfl instructions) executed per normalization unit.
-    GWS: Total number of GDS (global data sync) instructions issued per normalization
-      unit.
-    BR: Total number of BRANCH instructions issued per normalization unit.
-    Active CUs: Total number of active compute units (CUs) on the accelerator during
-      the kernel execution.
-    Num CUs: Total number of compute units (CUs) on the accelerator.
-    VGPR: 'The number of architected vector general-purpose registers allocated for
-      the kernel, see VALU. Note: this may not exactly match the number of VGPRs requested
-      by the compiler due to allocation granularity.'
-    SGPR: 'The number of scalar general-purpose registers allocated for the kernel,
-      see SALU. Note: this may not exactly match the number of SGPRs requested by
-      the compiler due to allocation granularity.'
-    LDS Allocation: 'The number of bytes of LDS memory (or, shared memory) allocated
-      for this kernel. Note: This may also be larger than what was requested at compile
-      time due to both allocation granularity and dynamic per-dispatch LDS allocations.'
-    Scratch Allocation: The number of bytes of scratch memory requested per work-item
-      for this kernel. Scratch memory is used for stack memory on the accelerator,
-      as well as for register spills and restores.
-    Wavefronts: The total number of wavefronts, summed over all workgroups, forming
-      this kernel launch.
-    Workgroups: The total number of workgroups forming this kernel launch.
-    LDS Req: The total number of LDS instructions (including, but not limited to,
-      read/write/atomics and HIP's __shfl instructions) executed per normalization
-      unit.
-    LDS Util: Indicates what percent of the kernel's duration the LDS was actively
-      executing instructions (including, but not limited to, load, store, atomic and
-      HIP's __shfl operations). Calculated as the ratio of the total number of cycles
-      LDS was active over the total CU cycles.
-    LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return
-      / acknowledgment) required for an LDS instruction to complete.
-    VL1 Rd: The total number of incoming read requests from the address processing
-      unit after coalescing per normalization unit
-    VL1 Wr: The total number of incoming write requests from the address processing
-      unit after coalescing per normalization unit
-    VL1 Atomic: The total number of incoming atomic requests from the address processing
-      unit after coalescing per normalization unit
-    VL1 Hit: The ratio of the number of vL1D cache line requests that hit in vL1D
-      cache over the total number of cache line requests to the vL1D Cache RAM.
-    VL1 Lat: Calculated as the average number of cycles that a vL1D cache line request
-      spent in the vL1D cache pipeline.
-    VL1 Coalesce: Indicates how well memory instructions were coalesced by the address
-      processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated
-      as the average number of thread-requests generated per instruction divided by
-      the ideal number of thread-requests per instruction.
-    VL1 Stall: The ratio of the number of cycles where the vL1D is stalled waiting
-      to issue a request for data to the L2 cache divided by the number of cycles
-      where the vL1D is active.
-    VL1_L2 Rd: The number of read requests for a vL1D cache line that were not satisfied
-      by the vL1D and must be retrieved from the to the L2 Cache per normalization
-      unit.
-    VL1_L2 Wr: The number of write requests to a vL1D cache line that were sent through
-      the vL1D to the L2 cache, per normalization unit.
-    VL1_L2 Atomic: The number of atomic requests that are sent through the vL1D to
-      the L2 cache, per normalization unit. This includes requests for atomics with,
-      and without return.
-    sL1D Rd: The total number of requests, of any size or type, made to the sL1D per
-      normalization unit.
-    sL1D Hit: The total number of sL1D requests that hit on a previously loaded cache
-      line, per normalization unit.
-    sL1D_L2 Rd: The total number of read requests from sL1D to the L2, per normalization
-      unit.
-    sL1D_L2 Wr: The total number of write requests from sL1D to the L2, per normalization
-      unit. Typically unused on current CDNA accelerators.
-    sL1D_L2 Atomic: The total number of atomic requests from sL1D to the L2, per normalization
-      unit. Typically unused on current CDNA accelerators.
-    IL1 Fetch: The total number of requests made to the L1I per normalization-unit.
-    IL1 Hit: The percent of L1I requests that hit on a previously loaded line the
-      cache. Calculated as the ratio of the number of L1I requests that hit over the
-      number of all L1I requests.
-    IL1 Lat: The average number of cycles spent to fetch instructions to a CU.
-    IL1_L2 Rd: The total number of requests across the L1I - L2 interface per normalization-unit.
-    L2 Rd: The total number of read requests to the L2 from all clients.
-    L2 Wr: The total number of write requests to the L2 from all clients.
-    L2 Atomic: The total number of atomic requests (with and without return) to the
-      L2 from all clients.
-    L2 Hit: The ratio of the number of L2 cache line requests that hit in the L2 cache
-      over the total number of incoming cache line requests to the L2 cache.
-    L2 Rd Lat: Calculated as the average number of cycles that the vL1D cache took
-      to issue and receive read requests from the L2 Cache. This number also includes
-      requests for atomics with return values.
-    L2 Wr Lat: Calculated as the average number of cycles that the vL1D cache took
-      to issue and receive acknowledgement of a write request to the L2 Cache. This
-      number also includes requests for atomics without return values.
-    Fabric_L2 Rd: Number of L2 cache - Infinity Fabric read requests (either 32-byte
-      or 64-byte) summed over TCC instances per normalization unit.
-    Fabric_L2 Wr: Number of L2 cache - Infinity Fabric write requests (either 32-byte
-      or 64-byte) summed over TCC instances per normalization unit.
-    Fabric_L2 Atomic: Number of L2 cache - Infinity Fabric write requests (either
-      32-byte or 64-byte) that are actually atomic requests summed over TCC instances
-      per normalization unit.
-    Fabric Rd Lat: The time-averaged number of cycles read requests spent in Infinity
-      Fabric before data was returned to the L2.
-    Fabric Wr Lat: The time-averaged number of cycles write requests spent in Infinity
-      Fabric before a completion acknowledgement was returned to the L2.
-    Fabric Atomic Lat: The time-averaged number of cycles atomic requests spent in
-      Infinity Fabric before a completion acknowledgement (atomic without return value)
-      or data (atomic with return value) was returned to the L2.
-    HBM Rd: The total number of L2 requests to Infinity Fabric to read 32B or 64B
-      of data from the accelerator's local HBM, per normalization unit.
-    HBM Wr: 'The total number of L2 requests to Infinity Fabric to write or atomically
-      update 32B or 64B of data in the accelerator''s local HBM, per normalization
-      unit. '
   data source:
   - metric_table:
       id: 301
@@ -252,13 +136,13 @@ Panel Config:
           value: ROUND(AVG((TCC_EA_ATOMIC_sum / $denom)), 0)
         Fabric Rd Lat:
           value: ROUND(AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
-            != 0) else  0)), 0)
+            != 0) else 0)), 0)
         Fabric Wr Lat:
           value: ROUND(AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
-            != 0) else  0)), 0)
+            != 0) else 0)), 0)
         Fabric Atomic Lat:
           value: ROUND(AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum
-            != 0) else  0)), 0)
+            != 0) else 0)), 0)
         HBM Rd:
           value: ROUND(AVG((TCC_EA_RDREQ_DRAM_sum / $denom)), 0)
         HBM Wr:
@@ -266,3 +150,123 @@ Panel Config:
       comparable: false
       cli_style: mem_chart
       tui_style: mem_chart
+  metrics_description:
+    Wavefront Occupancy: Wavefronts per active CU.
+    Wave Life: Average number of cycles executing a wave.
+    SALU: Total Number of SALU (Scalar ALU) instructions issued per normalization
+      unit.
+    SMEM: Total number of SMEM (Scalar Memory Read) instructions issued per normalization
+      unit.
+    VALU: The number of VALU (Vector ALU) instructions issued per normalization unit.
+    MFMA: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued per
+      normalization unit.
+    VMEM: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch
+      memory) per normalization unit.
+    LDS: The total number of LDS instructions (including, but not limited to, read/write/atomics
+      and HIP's __shfl instructions) executed per normalization unit.
+    GWS: Total number of GDS (global data sync) instructions issued per normalization
+      unit.
+    BR: Total number of BRANCH instructions issued per normalization unit.
+    Active CUs: Total number of active compute units (CUs) on the accelerator during
+      the kernel execution.
+    Num CUs: Total number of compute units (CUs) on the accelerator.
+    VGPR: |-
+      The number of architected vector general-purpose registers allocated
+      for the kernel, see VALU. Note: this may not exactly match the number of VGPRs
+      requested by the compiler due to allocation granularity.
+    SGPR: |-
+      The number of scalar general-purpose registers allocated for the kernel,
+      see SALU. Note: this may not exactly match the number of SGPRs requested by
+      the compiler due to allocation granularity.
+    LDS Allocation: |-
+      The number of bytes of LDS memory (or, shared memory) allocated for
+      this kernel. Note: This may also be larger than what was requested at compile
+      time due to both allocation granularity and dynamic per-dispatch LDS allocations.
+    Scratch Allocation: The number of bytes of scratch memory requested per work-item
+      for this kernel. Scratch memory is used for stack memory on the accelerator,
+      as well as for register spills and restores.
+    Wavefronts: The total number of wavefronts, summed over all workgroups, forming
+      this kernel launch.
+    Workgroups: The total number of workgroups forming this kernel launch.
+    LDS Req: The total number of LDS instructions (including, but not limited to,
+      read/write/atomics and HIP's __shfl instructions) executed per normalization
+      unit.
+    LDS Util: Indicates what percent of the kernel's duration the LDS was actively
+      executing instructions (including, but not limited to, load, store, atomic and
+      HIP's __shfl operations). Calculated as the ratio of the total number of cycles
+      LDS was active over the total CU cycles.
+    LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return
+      / acknowledgment) required for an LDS instruction to complete.
+    VL1 Rd: The total number of incoming read requests from the address processing
+      unit after coalescing per normalization unit.
+    VL1 Wr: The total number of incoming write requests from the address processing
+      unit after coalescing per normalization unit.
+    VL1 Atomic: The total number of incoming atomic requests from the address processing
+      unit after coalescing per normalization unit.
+    VL1 Hit: The ratio of the number of vL1D cache line requests that hit in vL1D
+      cache over the total number of cache line requests to the vL1D Cache RAM.
+    VL1 Lat: Calculated as the average number of cycles that a vL1D cache line request
+      spent in the vL1D cache pipeline.
+    VL1 Coalesce: Indicates how well memory instructions were coalesced by the address
+      processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated
+      as the average number of thread-requests generated per instruction divided by
+      the ideal number of thread-requests per instruction.
+    VL1 Stall: The ratio of the number of cycles where the vL1D is stalled waiting
+      to issue a request for data to the L2 cache divided by the number of cycles
+      where the vL1D is active.
+    VL1_L2 Rd: The number of read requests for a vL1D cache line that were not satisfied
+      by the vL1D and must be retrieved from the L2 Cache per normalization
+      unit.
+    VL1_L2 Wr: The number of write requests to a vL1D cache line that were sent through
+      the vL1D to the L2 cache, per normalization unit.
+    VL1_L2 Atomic: The number of atomic requests that are sent through the vL1D to
+      the L2 cache, per normalization unit. This includes requests for atomics with,
+      and without return.
+    sL1D Rd: The total number of requests, of any size or type, made to the sL1D per
+      normalization unit.
+    sL1D Hit: The total number of sL1D requests that hit on a previously loaded cache
+      line, per normalization unit.
+    sL1D_L2 Rd: The total number of read requests from sL1D to the L2, per normalization
+      unit.
+    sL1D_L2 Wr: The total number of write requests from sL1D to the L2, per normalization
+      unit. Typically unused on current CDNA accelerators.
+    sL1D_L2 Atomic: The total number of atomic requests from sL1D to the L2, per normalization
+      unit. Typically unused on current CDNA accelerators.
+    IL1 Fetch: The total number of requests made to the L1I per normalization-unit.
+    IL1 Hit: The percent of L1I requests that hit on a previously loaded line in the
+      cache. Calculated as the ratio of the number of L1I requests that hit over the
+      number of all L1I requests.
+    IL1 Lat: The average number of cycles spent to fetch instructions to a CU.
+    IL1_L2 Rd: The total number of requests across the L1I - L2 interface per normalization-unit.
+    L2 Rd: The total number of read requests to the L2 from all clients.
+    L2 Wr: The total number of write requests to the L2 from all clients.
+    L2 Atomic: The total number of atomic requests (with and without return) to the
+      L2 from all clients.
+    L2 Hit: The ratio of the number of L2 cache line requests that hit in the L2 cache
+      over the total number of incoming cache line requests to the L2 cache.
+    L2 Rd Lat: Calculated as the average number of cycles that the vL1D cache took
+      to issue and receive read requests from the L2 Cache. This number also includes
+      requests for atomics with return values.
+    L2 Wr Lat: Calculated as the average number of cycles that the vL1D cache took
+      to issue and receive acknowledgement of a write request to the L2 Cache. This
+      number also includes requests for atomics without return values.
+    Fabric_L2 Rd: Number of L2 cache - Infinity Fabric read requests (either 32-byte
+      or 64-byte) summed over TCC instances per normalization unit.
+    Fabric_L2 Wr: Number of L2 cache - Infinity Fabric write requests (either 32-byte
+      or 64-byte) summed over TCC instances per normalization unit.
+    Fabric_L2 Atomic: Number of L2 cache - Infinity Fabric write requests (either
+      32-byte or 64-byte) that are actually atomic requests summed over TCC instances
+      per normalization unit.
+    Fabric Rd Lat: The time-averaged number of cycles read requests spent in Infinity
+      Fabric before data was returned to the L2.
+    Fabric Wr Lat: The time-averaged number of cycles write requests spent in Infinity
+      Fabric before a completion acknowledgement was returned to the L2.
+    Fabric Atomic Lat: The time-averaged number of cycles atomic requests spent in
+      Infinity Fabric before a completion acknowledgement (atomic without return value)
+      or data (atomic with return value) was returned to the L2.
+    HBM Rd: The total number of L2 requests to Infinity Fabric to read 32B or 64B
+      of data from the accelerator's local HBM, per normalization unit.
+    HBM Wr: |-
+      The total number of L2 requests to Infinity Fabric to write or atomically
+      update 32B or 64B of data in the accelerator's local HBM, per normalization
+      unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline.yaml
index daf66a87e9..4a8c962f3b 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline.yaml
@@ -2,85 +2,6 @@
 Panel Config:
   id: 400
   title: Roofline
-  metrics_description:
-    VALU FLOPs (F16): 'The total 16-bit floating-point operations executed per second
-      on the VALU. This is presented with the value of the peak empirical F16 FLOPs
-      achievable on the specific accelerator. Note: this does not include any F16
-      operations from MFMA instructions.'
-    VALU FLOPs (F32): 'The total 32-bit floating-point operations executed per second
-      on the VALU. This is presented with the value of the peak empirical F32 FLOPs
-      achievable on the specific accelerator. Note: this does not include any F32
-      operations from MFMA instructions.'
-    VALU FLOPs (F64): 'The total 64-bit floating-point operations executed per second
-      on the VALU. This is presented with the value of the peak empirical F64 FLOPs
-      achievable on the specific accelerator. Note: this does not include any F64
-      operations from MFMA instructions.'
-    MFMA FLOPs (F8): The total number of 8-bit brain floating point MFMA operations
-      executed per second. This does not include any 16-bit brain floating point operations
-      from VALU instructions. The peak empirically measured F8 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison. It is supported
-      on AMD Instinct MI300 series and later only.
-    MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations
-      executed per second. Note: this does not include any 16-bit brain floating point
-      operations from VALU instructions. The peak empirically measured BF16 MFMA operations
-      achievable on the specific accelerator is displayed alongside for comparison.'
-    MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed
-      per second. Note: this does not include any 16-bit floating point operations
-      from VALU instructions. The peak empirically measured F16 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison.'
-    MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed
-      per second. Note: this does not include any 32-bit floating point operations
-      from VALU instructions. The peak empirically measured F32 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison.'
-    MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed
-      per second. Note: this does not include any 64-bit floating point operations
-      from VALU instructions. The peak empirically measured F64 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison.'
-    MFMA FLOPs (F6F4): 'The total number of 4-bit and 6-bit floating point MFMA operations
-      executed per second. Note: this does not include any floating point operations
-      from VALU instructions. The peak empirically measured F6F4 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison. It is supported
-      on AMD Instinct MI350 series (gfx950) and later only.'
-    MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed
-      per second. Note: this does not include any 8-bit integer operations from VALU
-      instructions. The peak empirically measured INT8 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison.'
-    HBM Bandwidth: The total number of bytes read from and written to High-Bandwidth
-      Memory (HBM) per second. The peak empirically measured bandwidth achievable
-      on the specific accelerator is displayed alongside for comparison.
-    L2 Cache Bandwidth: The number of bytes looked up in the L2 cache per unit time.
-      The number of bytes is calculated as the number of cache lines requested multiplied
-      by the cache line size. This value does not consider partial requests, so e.g.,
-      if only a single value is requested in a cache line, the data movement will
-      still be counted as a full cache line. The peak empirically measured bandwidth
-      achievable on the specific accelerator is displayed alongside for comparison.
-    L1 Cache Bandwidth: The number of bytes looked up in the vL1D cache as a result
-      of VMEM instructions per unit time. The number of bytes is calculated as the
-      number of cache lines requested multiplied by the cache line size. This value
-      does not consider partial requests, so e.g., if only a single value is requested
-      in a cache line, the data movement will still be counted as a full cache line.
-      The peak empirically measured bandwidth achievable on the specific accelerator
-      is displayed alongside for comparison.
-    LDS Bandwidth: Indicates the maximum amount of bytes that could have been loaded
-      from, stored to, or atomically updated in the LDS per unit time (see LDS Bandwidth
-      example for more detail). The peak empirically measured LDS bandwidth achievable
-      on the specific accelerator is displayed alongside for comparison.
-    AI L1: The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
-      of total floating-point operations (FLOPs) to total bytes transferred between
-      the L1 cache and the processing units. This value is used as the x-coordinate
-      for the L1 roofline.
-    AI L2: The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
-      of total floating-point operations (FLOPs) to total bytes transferred between
-      the L2 cache and the L1 cache. This value is used as the x-coordinate for the
-      L2 roofline.
-    AI HBM: The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
-      It is the ratio of total floating-point operations (FLOPs) to total bytes transferred
-      between HBM and the L2 cache. This value is used as the x-coordinate for the
-      HBM roofline.
-    Performance (GFLOPs): The overall achieved performance, measured in GigaFLOPs
-      per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
-      operations divided by the total execution time. This value is used as the y-coordinate
-      for the kernel's point on the Roofline plot.
   data source:
   - metric_table:
       id: 401
@@ -210,3 +131,86 @@ Panel Config:
             512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) ) / (SUM(End_Timestamp - Start_Timestamp)
             / 1e9) ) / 1e9
           unit: GFLOP/s
+  metrics_description:
+    VALU FLOPs (F16): |-
+      The total 16-bit floating-point operations executed per second on the VALU.
+      This is presented with the value of the peak empirical F16 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F16 operations
+      from MFMA instructions.
+    VALU FLOPs (F32): |-
+      The total 32-bit floating-point operations executed per second on the VALU.
+      This is presented with the value of the peak empirical F32 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F32 operations
+      from MFMA instructions.
+    VALU FLOPs (F64): |-
+      The total 64-bit floating-point operations executed per second on the VALU.
+      This is presented with the value of the peak empirical F64 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F64 operations
+      from MFMA instructions.
+    MFMA FLOPs (BF16): |-
+      The total number of 16-bit brain floating point MFMA operations executed
+      per second. Note: this does not include any 16-bit brain floating point
+      operations from VALU instructions. The peak empirically measured BF16 MFMA
+      operations achievable on the specific accelerator is displayed alongside
+      for comparison.
+    MFMA FLOPs (F16): |-
+      The total number of 16-bit floating point MFMA operations executed per
+      second. Note: this does not include any 16-bit floating point operations from
+      VALU instructions. The peak empirically measured F16 MFMA operations
+      achievable on the specific accelerator is displayed alongside for comparison.
+    MFMA FLOPs (F32): |-
+      The total number of 32-bit floating point MFMA operations executed per
+      second. Note: this does not include any 32-bit floating point operations from
+      VALU instructions. The peak empirically measured F32 MFMA operations
+      achievable on the specific accelerator is displayed alongside for comparison.
+    MFMA FLOPs (F64): |-
+      The total number of 64-bit floating point MFMA operations executed per
+      second. Note: this does not include any 64-bit floating point operations from
+      VALU instructions. The peak empirically measured F64 MFMA operations
+      achievable on the specific accelerator is displayed alongside for comparison.
+    MFMA IOPs (Int8): |-
+      The total number of 8-bit integer MFMA operations executed per second.
+      Note: this does not include any 8-bit integer operations from VALU instructions.
+      The peak empirically measured INT8 MFMA operations achievable on the specific
+      accelerator is displayed alongside for comparison.
+    HBM Bandwidth: |-
+      The total number of bytes read from and written to High-Bandwidth
+      Memory (HBM) per second. The peak empirically measured bandwidth achievable
+      on the specific accelerator is displayed alongside for comparison.
+    L2 Cache Bandwidth: The number of bytes looked up in the L2 cache per unit time.
+      The number of bytes is calculated as the number of cache lines requested multiplied
+      by the cache line size. This value does not consider partial requests, so e.g.,
+      if only a single value is requested in a cache line, the data movement will
+      still be counted as a full cache line. The peak empirically measured bandwidth
+      achievable on the specific accelerator is displayed alongside for comparison.
+    L1 Cache Bandwidth: The number of bytes looked up in the vL1D cache as a result
+      of VMEM instructions per unit time. The number of bytes is calculated as the
+      number of cache lines requested multiplied by the cache line size. This value
+      does not consider partial requests, so e.g., if only a single value is requested
+      in a cache line, the data movement will still be counted as a full cache line.
+      The peak empirically measured bandwidth achievable on the specific accelerator
+      is displayed alongside for comparison.
+    LDS Bandwidth: Indicates the maximum amount of bytes that could have been loaded
+      from, stored to, or atomically updated in the LDS per unit time (see LDS Bandwidth
+      example for more detail). The peak empirically measured LDS bandwidth achievable
+      on the specific accelerator is displayed alongside for comparison.
+    AI L1: |-
+      The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
+      of total floating-point operations (FLOPs) to total bytes transferred between
+      the L1 cache and the processing units. This value is used as the x-coordinate
+      for the L1 roofline.
+    AI L2: |-
+      The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
+      of total floating-point operations (FLOPs) to total bytes transferred between
+      the L2 cache and the L1 cache. This value is used as the x-coordinate for
+      the L2 roofline.
+    AI HBM: |-
+      The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
+      It is the ratio of total floating-point operations (FLOPs) to total bytes
+      transferred between HBM and the L2 cache. This value is used as the x-coordinate
+      for the HBM roofline.
+    Performance (GFLOPs): |-
+      The overall achieved performance, measured in GigaFLOPs
+      per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
+      operations divided by the total execution time. This value is used as the y-coordinate
+      for the kernel's point on the Roofline plot.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0500_command_processor_cpc_cpf.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0500_command_processor_cpc_cpf.yaml
index c4d2cabf52..118ce18331 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0500_command_processor_cpc_cpf.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0500_command_processor_cpc_cpf.yaml
@@ -2,30 +2,6 @@
 Panel Config:
   id: 500
   title: Command Processor (CPC/CPF)
-  metrics_description:
-    CPF Utilization: Percent of total cycles where the CPF was busy actively doing
-      any work. The ratio of CPF busy cycles over total cycles counted by the CPF.
-    CPF Stall: Percent of CPF busy cycles where the CPF was stalled for any reason.
-    CPF-L2 Utilization: Percent of total cycles counted by the CPF-L2 interface where
-      the CPF-L2 interface was active doing any work. The ratio of CPF-L2 busy cycles
-      over total cycles counted by the CPF-L2.
-    CPF-L2 Stall: Percent of CPF-L2 L2 busy cycles where the CPF-L2 interface was
-      stalled for any reason.
-    CPF-UTCL1 Stall: Percent of CPF busy cycles where the CPF was stalled by address
-      translation.
-    CPC Utilization: Percent of total cycles where the CPC was busy actively doing
-      any work. The ratio of CPC busy cycles over total cycles counted by the CPC.
-    CPC Stall Rate: Percent of CPC busy cycles where the CPC was stalled for any reason.
-    CPC Packet Decoding Utilization: Percent of CPC busy cycles spent decoding commands
-      for processing.
-    CPC-Workgroup Manager Utilization: Percent of CPC busy cycles spent dispatching
-      workgroups to the workgroup manager.
-    CPC-L2 Utilization: Percent of total cycles counted by the CPC-L2 interface where
-      the CPC-L2 interface was active doing any work.
-    CPC-UTCL1 Stall: Percent of CPC busy cycles where the CPC was stalled by address
-      translation
-    CPC-UTCL2 Utilization: 'Percent of total cycles counted by the CPC''s L2 address
-      translation interface where the CPC was busy doing address translation work.  '
   data source:
   - metric_table:
       id: 501
@@ -143,3 +119,28 @@ Panel Config:
           max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
             if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
           unit: pct
+  metrics_description:
+    CPF Utilization: Percent of total cycles where the CPF was busy actively doing
+      any work. The ratio of CPF busy cycles over total cycles counted by the CPF.
+    CPF Stall: Percent of CPF busy cycles where the CPF was stalled for any reason.
+    CPF-L2 Utilization: Percent of total cycles counted by the CPF-L2 interface where
+      the CPF-L2 interface was active doing any work. The ratio of CPF-L2 busy cycles
+      over total cycles counted by the CPF-L2.
+    CPF-L2 Stall: Percent of CPF-L2 busy cycles where the CPF-L2 interface was
+      stalled for any reason.
+    CPF-UTCL1 Stall: Percent of CPF busy cycles where the CPF was stalled by address
+      translation.
+    CPC Utilization: Percent of total cycles where the CPC was busy actively doing
+      any work. The ratio of CPC busy cycles over total cycles counted by the CPC.
+    CPC Stall Rate: Percent of CPC busy cycles where the CPC was stalled for any reason.
+    CPC Packet Decoding Utilization: Percent of CPC busy cycles spent decoding commands
+      for processing.
+    CPC-Workgroup Manager Utilization: Percent of CPC busy cycles spent dispatching
+      workgroups to the workgroup manager.
+    CPC-L2 Utilization: Percent of total cycles counted by the CPC-L2 interface where
+      the CPC-L2 interface was active doing any work.
+    CPC-UTCL1 Stall: Percent of CPC busy cycles where the CPC was stalled by address
+      translation.
+    CPC-UTCL2 Utilization: |-
+      Percent of total cycles counted by the CPC's L2 address translation
+      interface where the CPC was busy doing address translation work.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0600_workgroup_manager_spi.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0600_workgroup_manager_spi.yaml
index f6bf13d8b8..eb9845aa82 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0600_workgroup_manager_spi.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0600_workgroup_manager_spi.yaml
@@ -2,61 +2,6 @@
 Panel Config:
   id: 600
   title: Workgroup Manager (SPI)
-  metrics_description:
-    Accelerator Utilization: The percent of cycles in the kernel where the accelerator
-      was actively doing any work.
-    Scheduler-Pipe Utilization: The percent of total scheduler-pipe cycles in the
-      kernel where the scheduler-pipes were actively doing any work.
-    Workgroup Manager Utilization: The percent of cycles in the kernel where the workgroup
-      manager was actively doing any work.
-    Shader Engine Utilization: The percent of total shader engine cycles in the kernel
-      where any CU in a shader-engine was actively doing any work, normalized over
-      all shader-engines. Low values (e.g., << 100%) indicate that the accelerator
-      was not fully saturated by the kernel, or a potential load-imbalance issue.
-    SIMD Utilization: The percent of total SIMD cycles in the kernel where any SIMD
-      on a CU was actively doing any work, summed over all CUs. Low values (less than
-      100%) indicate that the accelerator was not fully saturated by the kernel, or
-      a potential load-imbalance issue.
-    Dispatched Workgroups: The total number of workgroups forming this kernel launch.
-    Dispatched Wavefronts: The total number of wavefronts, summed over all workgroups,
-      forming this kernel launch.
-    VGPR Writes: The average number of cycles spent initializing VGPRs at wave creation.
-    SGPR Writes: The average number of cycles spent initializing SGPRs at wave creation.
-    Not-scheduled Rate (Workgroup Manager): The percent of total scheduler-pipe cycles
-      in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck
-      within the workgroup manager rather than a lack of a CU or SIMD with sufficient
-      resources.
-    Not-scheduled Rate (Scheduler-Pipe): 'The percent of total scheduler-pipe cycles
-      in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck
-      within the scheduler-pipes rather than a lack of a CU or SIMD with sufficient
-      resources. '
-    Scheduler-Pipe Stall Rate: The percent of total scheduler-pipe cycles in the kernel
-      where a workgroup could not be scheduled to a CU due to occupancy limitations
-      (like a lack of a CU or SIMD with sufficient resources).
-    Scratch Stall Rate: The percent of total shader-engine cycles in the kernel where
-      a workgroup could not be scheduled to a CU due to lack of private (a.k.a., scratch)
-      memory slots. While this can reach up to 100%, note that the actual occupancy
-      limitations on a kernel using private memory are typically quite small (for
-      example, less than 1% of the total number of waves that can be scheduled to
-      an accelerator).
-    Insufficient SIMD Waveslots: The percent of total SIMD cycles in the kernel where
-      a workgroup could not be scheduled to a SIMD due to lack of available waveslots.
-    Insufficient SIMD VGPRs: The percent of total SIMD cycles in the kernel where
-      a workgroup could not be scheduled to a SIMD due to lack of available VGPRs.
-    Insufficient SIMD SGPRs: The percent of total SIMD cycles in the kernel where
-      a workgroup could not be scheduled to a SIMD due to lack of available SGPRs.
-    Insufficient CU LDS: The percent of total CU cycles in the kernel where a workgroup
-      could not be scheduled to a CU due to lack of available LDS.
-    Insufficient CU Barriers: The percent of total CU cycles in the kernel where a
-      workgroup could not be scheduled to a CU due to lack of available barriers.
-    Reached CU Workgroup Limit: The percent of total CU cycles in the kernel where
-      a workgroup could not be scheduled to a CU due to limits within the workgroup
-      manager. This is expected to be always be zero on CDNA2 or newer accelerators
-      (and small for previous accelerators).
-    Reached CU Wavefront Limit: The percent of total CU cycles in the kernel where
-      a wavefront could not be scheduled to a CU due to limits within the workgroup
-      manager. This is expected to be always be zero on CDNA2 or newer accelerators
-      (and small for previous accelerators).
   data source:
   - metric_table:
       id: 601
@@ -199,3 +144,58 @@ Panel Config:
           min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
           max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
           unit: Pct
+  metrics_description:
+    Accelerator Utilization: The percent of cycles in the kernel where the accelerator
+      was actively doing any work.
+    Scheduler-Pipe Utilization: The percent of total scheduler-pipe cycles in the
+      kernel where the scheduler-pipes were actively doing any work.
+    Workgroup Manager Utilization: The percent of cycles in the kernel where the workgroup
+      manager was actively doing any work.
+    Shader Engine Utilization: The percent of total shader engine cycles in the kernel
+      where any CU in a shader-engine was actively doing any work, normalized over
+      all shader-engines. Low values (e.g., << 100%) indicate that the accelerator
+      was not fully saturated by the kernel, or a potential load-imbalance issue.
+    SIMD Utilization: The percent of total SIMD cycles in the kernel where any SIMD
+      on a CU was actively doing any work, summed over all CUs. Low values (less than
+      100%) indicate that the accelerator was not fully saturated by the kernel, or
+      a potential load-imbalance issue.
+    Dispatched Workgroups: The total number of workgroups forming this kernel launch.
+    Dispatched Wavefronts: The total number of wavefronts, summed over all workgroups,
+      forming this kernel launch.
+    VGPR Writes: The average number of cycles spent initializing VGPRs at wave creation.
+    SGPR Writes: The average number of cycles spent initializing SGPRs at wave creation.
+    Not-scheduled Rate (Workgroup Manager): The percent of total scheduler-pipe cycles
+      in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck
+      within the workgroup manager rather than a lack of a CU or SIMD with sufficient
+      resources.
+    Not-scheduled Rate (Scheduler-Pipe): |-
+      The percent of total scheduler-pipe cycles in the kernel where a workgroup
+      could not be scheduled to a CU due to a bottleneck within the scheduler-pipes
+      rather than a lack of a CU or SIMD with sufficient resources.
+    Scheduler-Pipe Stall Rate: The percent of total scheduler-pipe cycles in the kernel
+      where a workgroup could not be scheduled to a CU due to occupancy limitations
+      (like a lack of a CU or SIMD with sufficient resources).
+    Scratch Stall Rate: The percent of total shader-engine cycles in the kernel where
+      a workgroup could not be scheduled to a CU due to lack of private (a.k.a., scratch)
+      memory slots. While this can reach up to 100%, note that the actual occupancy
+      limitations on a kernel using private memory are typically quite small (for
+      example, less than 1% of the total number of waves that can be scheduled to
+      an accelerator).
+    Insufficient SIMD Waveslots: The percent of total SIMD cycles in the kernel where
+      a workgroup could not be scheduled to a SIMD due to lack of available waveslots.
+    Insufficient SIMD VGPRs: The percent of total SIMD cycles in the kernel where
+      a workgroup could not be scheduled to a SIMD due to lack of available VGPRs.
+    Insufficient SIMD SGPRs: The percent of total SIMD cycles in the kernel where
+      a workgroup could not be scheduled to a SIMD due to lack of available SGPRs.
+    Insufficient CU LDS: The percent of total CU cycles in the kernel where a workgroup
+      could not be scheduled to a CU due to lack of available LDS.
+    Insufficient CU Barriers: The percent of total CU cycles in the kernel where a
+      workgroup could not be scheduled to a CU due to lack of available barriers.
+    Reached CU Workgroup Limit: The percent of total CU cycles in the kernel where
+      a workgroup could not be scheduled to a CU due to limits within the workgroup
+      manager. This is expected to always be zero on CDNA2 or newer accelerators
+      (and small for previous accelerators).
+    Reached CU Wavefront Limit: The percent of total CU cycles in the kernel where
+      a wavefront could not be scheduled to a CU due to limits within the workgroup
+      manager. This is expected to always be zero on CDNA2 or newer accelerators
+      (and small for previous accelerators).
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0700_wavefront.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0700_wavefront.yaml
index 5e332c0b8f..e9e9407cfc 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0700_wavefront.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0700_wavefront.yaml
@@ -2,63 +2,6 @@
 Panel Config:
   id: 700
   title: Wavefront
-  metrics_description:
-    Grid Size: The total number of work-items (or, threads) launched as a part of
-      the kernel dispatch. In HIP, this is equivalent to the total grid size multiplied
-      by the total workgroup (or, block) size.
-    Workgroup Size: The total number of work-items (or, threads) in each workgroup
-      (or, block) launched as part of the kernel dispatch. In HIP, this is equivalent
-      to the total block size.
-    Total Wavefronts: "The total number of wavefronts launched as part of the kernel\
-      \ dispatch. On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs,\
-      \ the wavefront size is always 64 work-items. Thus, the total number of wavefronts\
-      \ should be equivalent to the ceiling of grid size divided by 64."
-    Saved Wavefronts: The total number of wavefronts saved at a context-save.
-    Restored Wavefronts: The total number of wavefronts restored from a context-save.
-    VGPRs: 'The number of architected vector general-purpose registers allocated for
-      the kernel, see VALU. Note: this may not exactly match the number of VGPRs requested
-      by the compiler due to allocation granularity.'
-    AGPRs: 'The number of accumulation vector general-purpose registers allocated
-      for the kernel, see AGPRs. Note: this may not exactly match the number of AGPRs
-      requested by the compiler due to allocation granularity.'
-    SGPRs: 'The number of scalar general-purpose registers allocated for the kernel,
-      see SALU. Note: this may not exactly match the number of SGPRs requested by
-      the compiler due to allocation granularity.'
-    LDS Allocation: 'The number of bytes of LDS memory (or, shared memory) allocated
-      for this kernel. Note: This may also be larger than what was requested at compile
-      time due to both allocation granularity and dynamic per-dispatch LDS allocations.'
-    Scratch Allocation: The number of bytes of scratch memory requested per work-item
-      for this kernel. Scratch memory is used for stack memory on the accelerator,
-      as well as for register spills and restores.
-    Kernel Time: The total duration of the executed kernel.
-    Kernel Time (Cycles): The total duration of the executed kernel in cycles.
-    Instructions per wavefront: The average number of instructions (of all types)
-      executed per wavefront. This is averaged over all wavefronts in a kernel dispatch.
-    Wave Cycles: The number of cycles a wavefront in the kernel dispatch spent resident
-      on a compute unit per normalization unit. This is averaged over all wavefronts
-      in a kernel dispatch.
-    Dependency Wait Cycles: The number of cycles a wavefront in the kernel dispatch
-      spent resident on a compute unit per normalization unit. This is averaged over
-      all wavefronts in a kernel dispatch.
-    Issue Wait Cycles: The number of cycles a wavefront in the kernel dispatch was
-      unable to issue an instruction for any reason (e.g., execution pipe back-pressure,
-      arbitration loss, etc.) per normalization unit. This counter is incremented
-      at every cycle by all wavefronts on a CU unable to issue an instruction. As
-      such, it is most useful to get a sense of how waves were spending their time,
-      rather than identification of a precise limiter because another wave could be
-      actively executing while a wave is issue stalled. The sum of this metric, Dependency
-      Wait Cycles and Active Cycles should be equal to the total Wave Cycles metric.
-    Active Cycles: The average number of cycles a wavefront in the kernel dispatch
-      was actively executing instructions per normalization unit. This measurement
-      is made on a per-wavefront basis, and may include cycles that another wavefront
-      spent actively executing (on another execution unit, for example) or was stalled.
-      As such, it is most useful to get a sense of how waves were spending their time,
-      rather than identification of a precise limiter. The sum of this metric, Issue
-      Wait Cycles and Active Wait Cycles should be equal to the total Wave Cycles
-      metric.
-    Wavefront Occupancy: 'The time-averaged number of wavefronts resident on the accelerator
-      over the lifetime of the kernel. Note: this metric may be inaccurate for short-running
-      kernels (less than 1ms).'
   data source:
   - metric_table:
       id: 701
@@ -171,3 +114,66 @@ Panel Config:
           max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
           unit: Wavefronts
           coll_level: SQ_LEVEL_WAVES
+  metrics_description:
+    Grid Size: The total number of work-items (or, threads) launched as a part of
+      the kernel dispatch. In HIP, this is equivalent to the total grid size multiplied
+      by the total workgroup (or, block) size.
+    Workgroup Size: The total number of work-items (or, threads) in each workgroup
+      (or, block) launched as part of the kernel dispatch. In HIP, this is equivalent
+      to the total block size.
+    Total Wavefronts: |-
+      The total number of wavefronts launched as part of the kernel dispatch.
+      On AMD Instinct™ CDNA™ accelerators and GCN™ GPUs, the wavefront
+      size is always 64 work-items. Thus, the total number of wavefronts should
+      be equivalent to the ceiling of grid size divided by 64.
+    Saved Wavefronts: The total number of wavefronts saved at a context-save.
+    Restored Wavefronts: The total number of wavefronts restored from a context-save.
+    VGPRs: |-
+      The number of architected vector general-purpose registers allocated
+      for the kernel, see VALU. Note: this may not exactly match the number of VGPRs
+      requested by the compiler due to allocation granularity.
+    AGPRs: |-
+      The number of accumulation vector general-purpose registers allocated
+      for the kernel, see AGPRs. Note: this may not exactly match the number of
+      AGPRs requested by the compiler due to allocation granularity.
+    SGPRs: |-
+      The number of scalar general-purpose registers allocated for the kernel,
+      see SALU. Note: this may not exactly match the number of SGPRs requested by
+      the compiler due to allocation granularity.
+    LDS Allocation: |-
+      The number of bytes of LDS memory (or, shared memory) allocated for
+      this kernel. Note: This may also be larger than what was requested at compile
+      time due to both allocation granularity and dynamic per-dispatch LDS allocations.
+    Scratch Allocation: The number of bytes of scratch memory requested per work-item
+      for this kernel. Scratch memory is used for stack memory on the accelerator,
+      as well as for register spills and restores.
+    Kernel Time: The total duration of the executed kernel.
+    Kernel Time (Cycles): The total duration of the executed kernel in cycles.
+    Instructions per wavefront: The average number of instructions (of all types)
+      executed per wavefront. This is averaged over all wavefronts in a kernel dispatch.
+    Wave Cycles: The number of cycles a wavefront in the kernel dispatch spent resident
+      on a compute unit per normalization unit. This is averaged over all wavefronts
+      in a kernel dispatch.
+    Dependency Wait Cycles: The number of cycles a wavefront in the kernel dispatch
+      stalled waiting on memory of any kind (e.g., instruction fetch, vector or scalar
+      memory) per normalization unit. This is averaged over all wavefronts in a kernel dispatch.
+    Issue Wait Cycles: The number of cycles a wavefront in the kernel dispatch was
+      unable to issue an instruction for any reason (e.g., execution pipe back-pressure,
+      arbitration loss, etc.) per normalization unit. This counter is incremented
+      at every cycle by all wavefronts on a CU unable to issue an instruction. As
+      such, it is most useful to get a sense of how waves were spending their time,
+      rather than identification of a precise limiter because another wave could be
+      actively executing while a wave is issue stalled. The sum of this metric, Dependency
+      Wait Cycles and Active Cycles should be equal to the total Wave Cycles metric.
+    Active Cycles: The average number of cycles a wavefront in the kernel dispatch
+      was actively executing instructions per normalization unit. This measurement
+      is made on a per-wavefront basis, and may include cycles that another wavefront
+      spent actively executing (on another execution unit, for example) or was stalled.
+      As such, it is most useful to get a sense of how waves were spending their time,
+      rather than identification of a precise limiter. The sum of this metric, Issue
+      Wait Cycles and Dependency Wait Cycles should be equal to the total Wave Cycles
+      metric.
+    Wavefront Occupancy: |-
+      The time-averaged number of wavefronts resident on the accelerator over
+      the lifetime of the kernel. Note: this metric may be inaccurate for short-running
+      kernels (less than 1ms).
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1000_compute_units_instruction_mix.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1000_compute_units_instruction_mix.yaml
index 69748199b5..ecf7473688 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1000_compute_units_instruction_mix.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1000_compute_units_instruction_mix.yaml
@@ -2,90 +2,6 @@
 Panel Config:
   id: 1000
   title: Compute Units - Instruction Mix
-  metrics_description:
-    VALU: The total number of vector arithmetic logic unit (VALU) operations issued.
-      These are the workhorses of the compute unit, and are used to execute a wide
-      range of instruction types including floating point operations, non-uniform
-      address calculations, transcendental operations, integer operations, shifts,
-      conditional evaluation, etc.
-    VMEM: The total number of vector memory operations issued. These include most
-      loads, stores and atomic operations and all accesses to generic, global, private
-      and texture memory.
-    LDS: The total number of LDS (also known as shared memory) operations issued.
-      These include loads, stores, atomics, and HIP's __shfl operations.
-    MFMA: The total number of matrix fused multiply-add instructions issued.
-    SALU: The total number of scalar arithmetic logic unit (SALU) operations issued.
-      Typically these are used for address calculations, literal constants, and other
-      operations that are provably uniform across a wavefront. Although scalar memory
-      (SMEM) operations are issued by the SALU, they are counted separately in this
-      section.
-    SMEM: The total number of scalar memory (SMEM) operations issued. These are typically
-      used for loading kernel arguments, base-pointers and loads from HIP's __constant__
-      memory.
-    Branch: The total number of branch operations issued. These typically consist
-      of jump or branch operations and are used to implement control flow.
-    INT32: The total number of instructions operating on 32-bit integer operands issued
-      to the VALU per normalization unit.
-    INT64: The total number of instructions operating on 64-bit integer operands issued
-      to the VALU per normalization unit.
-    F16-ADD: The total number of addition instructions operating on 16-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F16-MUL: The total number of multiplication instructions operating on 16-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F16-FMA: The total number of fused multiply-add instructions operating on 16-bit
-      floating-point operands issued to the VALU per normalization unit.
-    F16-Trans: The total number of transcendental instructions (e.g., sqrt) operating
-      on 16-bit floating-point operands issued to the VALU per normalization unit.
-    F32-ADD: The total number of addition instructions operating on 32-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F32-MUL: The total number of multiplication instructions operating on 32-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F32-FMA: The total number of fused multiply-add instructions operating on 32-bit
-      floating-point operands issued to the VALU per normalization unit.
-    F32-Trans: The total number of transcendental instructions (such as sqrt) operating
-      on 32-bit floating-point operands issued to the VALU per normalization unit.
-    F64-ADD: The total number of addition instructions operating on 64-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F64-MUL: The total number of multiplication instructions operating on 64-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F64-FMA: The total number of fused multiply-add instructions operating on 64-bit
-      floating-point operands issued to the VALU per normalization unit.
-    F64-Trans: The total number of transcendental instructions (such as sqrt) operating
-      on 64-bit floating-point operands issued to the VALU per normalization unit.
-    Conversion: "The total number of type conversion instructions (such as converting\
-      \ data to or from F32\u2194F64) issued to the VALU per normalization unit."
-    Global/Generic Instr: The total number of global & generic memory instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Global/Generic Read: The total number of global & generic memory read instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Global/Generic Write: The total number of global & generic memory write instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Global/Generic Atomic: The total number of global & generic memory atomic (with
-      and without return) instructions executed on all compute units on the accelerator,
-      per normalization unit.
-    Spill/Stack Instr: The total number of spill/stack memory instructions executed
-      on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Read: The total number of spill/stack memory read instructions executed
-      on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Write: The total number of spill/stack memory write instructions executed
-      on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Atomic: The total number of spill/stack memory atomic (with and without
-      return) instructions executed on all compute units on the accelerator, per normalization
-      unit. Typically unused as these memory operations are typically used to implement
-      thread-local storage.
-    MFMA-I8: The total number of 8-bit integer MFMA instructions issued per normalization
-      unit.
-    MFMA-F8: The total number of 8-bit floating point MFMA instructions issued per
-      normalization unit. This is supported in AMD Instinct MI300 series and later
-      only.
-    MFMA-F16: The total number of 16-bit floating point MFMA instructions issued per
-      normalization unit.
-    MFMA-BF16: The total number of 16-bit brain floating point MFMA instructions issued
-      per normalization unit.
-    MFMA-F32: The total number of 32-bit floating-point MFMA instructions issued per
-      normalization unit.
-    MFMA-F64: The total number of 64-bit floating-point MFMA instructions issued per
-      normalization unit.
   data source:
   - metric_table:
       id: 1001
@@ -302,3 +218,85 @@ Panel Config:
           min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom))
           max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom))
           unit: (instr + $normUnit)
+  metrics_description:
+    VALU: The total number of vector arithmetic logic unit (VALU) operations issued.
+      These are the workhorses of the compute unit, and are used to execute a wide
+      range of instruction types including floating point operations, non-uniform
+      address calculations, transcendental operations, integer operations, shifts,
+      conditional evaluation, etc.
+    VMEM: The total number of vector memory operations issued. These include most
+      loads, stores and atomic operations and all accesses to generic, global, private
+      and texture memory.
+    LDS: The total number of LDS (also known as shared memory) operations issued.
+      These include loads, stores, atomics, and HIP's __shfl operations.
+    MFMA: The total number of matrix fused multiply-add instructions issued.
+    SALU: The total number of scalar arithmetic logic unit (SALU) operations issued.
+      Typically these are used for address calculations, literal constants, and other
+      operations that are provably uniform across a wavefront. Although scalar memory
+      (SMEM) operations are issued by the SALU, they are counted separately in this
+      section.
+    SMEM: The total number of scalar memory (SMEM) operations issued. These are typically
+      used for loading kernel arguments, base-pointers and loads from HIP's __constant__
+      memory.
+    Branch: The total number of branch operations issued. These typically consist
+      of jump or branch operations and are used to implement control flow.
+    INT32: The total number of instructions operating on 32-bit integer operands issued
+      to the VALU per normalization unit.
+    INT64: The total number of instructions operating on 64-bit integer operands issued
+      to the VALU per normalization unit.
+    F16-ADD: The total number of addition instructions operating on 16-bit floating-point
+      operands issued to the VALU per normalization unit.
+    F16-MUL: The total number of multiplication instructions operating on 16-bit floating-point
+      operands issued to the VALU per normalization unit.
+    F16-FMA: The total number of fused multiply-add instructions operating on 16-bit
+      floating-point operands issued to the VALU per normalization unit.
+    F16-Trans: The total number of transcendental instructions (e.g., sqrt) operating
+      on 16-bit floating-point operands issued to the VALU per normalization unit.
+    F32-ADD: The total number of addition instructions operating on 32-bit floating-point
+      operands issued to the VALU per normalization unit.
+    F32-MUL: The total number of multiplication instructions operating on 32-bit floating-point
+      operands issued to the VALU per normalization unit.
+    F32-FMA: The total number of fused multiply-add instructions operating on 32-bit
+      floating-point operands issued to the VALU per normalization unit.
+    F32-Trans: The total number of transcendental instructions (such as sqrt) operating
+      on 32-bit floating-point operands issued to the VALU per normalization unit.
+    F64-ADD: The total number of addition instructions operating on 64-bit floating-point
+      operands issued to the VALU per normalization unit.
+    F64-MUL: The total number of multiplication instructions operating on 64-bit floating-point
+      operands issued to the VALU per normalization unit.
+    F64-FMA: The total number of fused multiply-add instructions operating on 64-bit
+      floating-point operands issued to the VALU per normalization unit.
+    F64-Trans: The total number of transcendental instructions (such as sqrt) operating
+      on 64-bit floating-point operands issued to the VALU per normalization unit.
+    Conversion: |-
+      The total number of type conversion instructions (such as converting
+      data to or from F32↔F64) issued to the VALU per normalization unit.
+    Global/Generic Instr: The total number of global & generic memory instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Global/Generic Read: The total number of global & generic memory read instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Global/Generic Write: The total number of global & generic memory write instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Global/Generic Atomic: The total number of global & generic memory atomic (with
+      and without return) instructions executed on all compute units on the accelerator,
+      per normalization unit.
+    Spill/Stack Instr: The total number of spill/stack memory instructions executed
+      on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Read: The total number of spill/stack memory read instructions executed
+      on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Write: The total number of spill/stack memory write instructions executed
+      on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Atomic: The total number of spill/stack memory atomic (with and without
+      return) instructions executed on all compute units on the accelerator, per normalization
+      unit. Typically unused, as these memory operations are generally used to implement
+      thread-local storage.
+    MFMA-I8: The total number of 8-bit integer MFMA instructions issued per normalization
+      unit.
+    MFMA-F16: The total number of 16-bit floating point MFMA instructions issued per
+      normalization unit.
+    MFMA-BF16: The total number of 16-bit brain floating point MFMA instructions issued
+      per normalization unit.
+    MFMA-F32: The total number of 32-bit floating-point MFMA instructions issued per
+      normalization unit.
+    MFMA-F64: The total number of 64-bit floating-point MFMA instructions issued per
+      normalization unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1100_compute_units_compute_pipeline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1100_compute_units_compute_pipeline.yaml
index 81c0197225..af4ff8ef77 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1100_compute_units_compute_pipeline.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1100_compute_units_compute_pipeline.yaml
@@ -2,84 +2,6 @@
 Panel Config:
   id: 1100
   title: Compute Units - Compute Pipeline
-  metrics_description:
-    VALU FLOPs: 'The total floating-point operations executed per second on the VALU.
-      This is also presented as a percent of the peak theoretical FLOPs achievable
-      on the specific accelerator. Note: this does not include any floating-point
-      operations from MFMA instructions.'
-    VALU IOPs: 'The total integer operations executed per second on the VALU. This
-      is also presented as a percent of the peak theoretical IOPs achievable on the
-      specific accelerator. Note: this does not include any integer operations from
-      MFMA instructions.'
-    MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations
-      executed per second. Note: this does not include any 16-bit brain floating point
-      operations from VALU instructions. This is also presented as a percent of the
-      peak theoretical BF16 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed
-      per second. Note: this does not include any 16-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F16 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed
-      per second. Note: this does not include any 32-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F32 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed
-      per second. Note: this does not include any 64-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F64 MFMA operations achievable on the specific accelerator.'
-    MFMA IOPs (INT8): 'The total number of 8-bit integer MFMA operations executed
-      per second. Note: this does not include any 8-bit integer operations from VALU
-      instructions. This is also presented as a percent of the peak theoretical INT8
-      MFMA operations achievable on the specific accelerator.'
-    IPC: The ratio of the total number of instructions executed on the CU over the
-      total active CU cycles.
-    IPC (Issued): The ratio of the total number of (non-internal) instructions issued
-      over the number of cycles where the scheduler was actively working on issuing
-      instructions.
-    SALU Utilization: Indicates what percent of the kernel's duration the SALU was
-      busy executing instructions. Computed as the ratio of the total number of cycles
-      spent by the scheduler issuing SALU / SMEM instructions over the total CU cycles.
-    VALU Utilization: Indicates what percent of the kernel's duration the VALU was
-      busy executing instructions. Does not include VMEM operations. Computed as the
-      ratio of the total number of cycles spent by the scheduler issuing VALU instructions
-      over the total CU cycles.
-    VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit
-      was busy executing instructions, including both global/generic and spill/scratch
-      operations (see the VMEM instruction count metrics for more detail). Does not
-      include VALU operations. Computed as the ratio of the total number of cycles
-      spent by the scheduler issuing VMEM instructions over the total CU cycles.
-    Branch Utilization: Indicates what percent of the kernel's duration the branch
-      unit was busy executing instructions. Computed as the ratio of the total number
-      of cycles spent by the scheduler issuing branch instructions over the total
-      CU cycles.
-    VALU Active Threads: Indicates the average level of divergence within a wavefront
-      over the lifetime of the kernel. The number of work-items that were active in
-      a wavefront during execution of each VALU instruction, time-averaged over all
-      VALU instructions run on all wavefronts in the kernel
-    MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit
-      was busy executing instructions. Computed as the ratio of the total number of
-      cycles spent by the MFMA was busy over the total CU cycles.
-    MFMA Instruction Cycles: The average duration of MFMA instructions in this kernel
-      in cycles. Computed as the ratio of the total number of cycles the MFMA unit
-      was busy over the total number of MFMA instructions.
-    VMEM Latency: The average number of round-trip cycles (that is, from issue to
-      data return / acknowledgment) required for a VMEM instruction to complete.
-    SMEM Latency: The average number of round-trip cycles (that is, from issue to
-      data return / acknowledgment) required for a SMEM instruction to complete.
-    FLOPs (Total): The total number of floating-point operations executed on either
-      the VALU or MFMA units, per normalization unit.
-    IOPs (Total): The total number of integer operations executed on either the VALU
-      or MFMA units, per normalization unit.
-    F16 OPs: The total number of 16-bit floating-point operations executed on either
-      the VALU or MFMA units, per normalization unit.
-    BF16 OPs: The total number of 16-bit brain floating-point operations executed
-      on either the VALU or MFMA units, per normalization unit.
-    F32 OPs: The total number of 32-bit floating-point operations executed on either
-      the VALU or MFMA units, per normalization unit.
-    F64 OPs: The total number of 64-bit floating-point operations executed on either
-      the VALU or MFMA units, per normalization unit.
-    INT8 OPs: The total number of 8-bit integer operations executed on either the
-      VALU or MFMA units, per normalization unit.
   data source:
   - metric_table:
       id: 1101
@@ -159,13 +81,13 @@ Panel Config:
           unit: Instr/cycle
         IPC (Issued):
           avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-            + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED  + SQ_INSTS_LDS)
+            + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
             / SQ_ACTIVE_INST_ANY))
           min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
             + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
             / SQ_ACTIVE_INST_ANY))
           max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-            + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED  + SQ_INSTS_LDS)
+            + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
             / SQ_ACTIVE_INST_ANY))
           unit: Instr/cycle
         SALU Utilization:
@@ -262,7 +184,7 @@ Panel Config:
             * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
             + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
             * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom))
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
         IOPs (Total):
           avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
             * 512)) / $denom)
@@ -270,7 +192,7 @@ Panel Config:
             * 512)) / $denom)
           max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
             * 512)) / $denom)
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
         F16 OPs:
           avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
             + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512
@@ -281,12 +203,12 @@ Panel Config:
           max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
             + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512
             * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
         BF16 OPs:
           avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
           min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
           max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
         F32 OPs:
           avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
             + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
@@ -297,7 +219,7 @@ Panel Config:
           max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
             + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
             / $denom))
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
         F64 OPs:
           avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
             + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
@@ -308,9 +230,94 @@ Panel Config:
           max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
             + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
             / $denom))
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
         INT8 OPs:
           avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
           min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
           max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
+  metrics_description:
+    VALU FLOPs: |-
+      The total floating-point operations executed per second on the VALU.
+      This is also presented as a percent of the peak theoretical FLOPs achievable
+      on the specific accelerator. Note: this does not include any floating-point
+      operations from MFMA instructions.
+    VALU IOPs: |-
+      The total integer operations executed per second on the VALU. This is
+      also presented as a percent of the peak theoretical IOPs achievable on the
+      specific accelerator. Note: this does not include any integer operations from
+      MFMA instructions.
+    MFMA FLOPs (BF16): |-
+      The total number of 16-bit brain floating point MFMA operations executed
+      per second. Note: this does not include any 16-bit brain floating point operations
+      from VALU instructions. This is also presented as a percent of the peak theoretical
+      BF16 MFMA operations achievable on the specific accelerator.
+    MFMA FLOPs (F16): |-
+      The total number of 16-bit floating point MFMA operations executed per
+      second. Note: this does not include any 16-bit floating point operations from
+      VALU instructions. This is also presented as a percent of the peak theoretical
+      F16 MFMA operations achievable on the specific accelerator.
+    MFMA FLOPs (F32): |-
+      The total number of 32-bit floating point MFMA operations executed per
+      second. Note: this does not include any 32-bit floating point operations from
+      VALU instructions. This is also presented as a percent of the peak theoretical
+      F32 MFMA operations achievable on the specific accelerator.
+    MFMA FLOPs (F64): |-
+      The total number of 64-bit floating point MFMA operations executed per
+      second. Note: this does not include any 64-bit floating point operations from
+      VALU instructions. This is also presented as a percent of the peak theoretical
+      F64 MFMA operations achievable on the specific accelerator.
+    MFMA IOPs (INT8): |-
+      The total number of 8-bit integer MFMA operations executed per second.
+      Note: this does not include any 8-bit integer operations from VALU instructions.
+      This is also presented as a percent of the peak theoretical INT8 MFMA operations
+      achievable on the specific accelerator.
+    IPC: The ratio of the total number of instructions executed on the CU over the
+      total active CU cycles.
+    IPC (Issued): The ratio of the total number of (non-internal) instructions issued
+      over the number of cycles where the scheduler was actively working on issuing
+      instructions.
+    SALU Utilization: Indicates what percent of the kernel's duration the SALU was
+      busy executing instructions. Computed as the ratio of the total number of cycles
+      spent by the scheduler issuing SALU / SMEM instructions over the total CU cycles.
+    VALU Utilization: Indicates what percent of the kernel's duration the VALU was
+      busy executing instructions. Does not include VMEM operations. Computed as the
+      ratio of the total number of cycles spent by the scheduler issuing VALU instructions
+      over the total CU cycles.
+    VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit
+      was busy executing instructions, including both global/generic and spill/scratch
+      operations (see the VMEM instruction count metrics for more detail). Does not
+      include VALU operations. Computed as the ratio of the total number of cycles
+      spent by the scheduler issuing VMEM instructions over the total CU cycles.
+    Branch Utilization: Indicates what percent of the kernel's duration the branch
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles spent by the scheduler issuing branch instructions over the total
+      CU cycles.
+    VALU Active Threads: Indicates the average level of divergence within a wavefront
+      over the lifetime of the kernel. The number of work-items that were active in
+      a wavefront during execution of each VALU instruction, time-averaged over all
+      VALU instructions run on all wavefronts in the kernel.
+    MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit
+      was busy executing instructions. Computed as the ratio of the total number of
+      cycles the MFMA unit was busy over the total CU cycles.
+    MFMA Instruction Cycles: The average duration of MFMA instructions in this kernel
+      in cycles. Computed as the ratio of the total number of cycles the MFMA unit
+      was busy over the total number of MFMA instructions.
+    VMEM Latency: The average number of round-trip cycles (that is, from issue to
+      data return / acknowledgment) required for a VMEM instruction to complete.
+    SMEM Latency: The average number of round-trip cycles (that is, from issue to
+      data return / acknowledgment) required for a SMEM instruction to complete.
+    FLOPs (Total): The total number of floating-point operations executed on either
+      the VALU or MFMA units, per normalization unit.
+    IOPs (Total): The total number of integer operations executed on either the VALU
+      or MFMA units, per normalization unit.
+    F16 OPs: The total number of 16-bit floating-point operations executed on either
+      the VALU or MFMA units, per normalization unit.
+    BF16 OPs: The total number of 16-bit brain floating-point operations executed
+      on either the VALU or MFMA units, per normalization unit.
+    F32 OPs: The total number of 32-bit floating-point operations executed on either
+      the VALU or MFMA units, per normalization unit.
+    F64 OPs: The total number of 64-bit floating-point operations executed on either
+      the VALU or MFMA units, per normalization unit.
+    INT8 OPs: The total number of 8-bit integer operations executed on either the
+      VALU or MFMA units, per normalization unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1200_local_data_share_lds.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1200_local_data_share_lds.yaml
index 2718654ad4..b7767fea16 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1200_local_data_share_lds.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1200_local_data_share_lds.yaml
@@ -2,51 +2,6 @@
 Panel Config:
   id: 1200
   title: Local Data Share (LDS)
-  metrics_description:
-    Utilization: Indicates what percent of the kernel's duration the LDS was actively
-      executing instructions (including, but not limited to, load, store, atomic and
-      HIP's __shfl operations). Calculated as the ratio of the total number of cycles
-      LDS was active over the total CU cycles.
-    Access Rate: Indicates the percentage of SIMDs in the VALU actively issuing LDS
-      instructions, averaged over the lifetime of the kernel. Calculated as the ratio
-      of the total number of cycles spent by the scheduler issuing LDS instructions
-      over the total CU cycles.
-    Theoretical Bandwidth Utilization: Indicates the maximum amount of bytes that
-      could have been loaded from, stored to, or atomically updated in the LDS divided
-      as percentage of theoretical peak. Does not take into account the execution
-      mask of the wavefront when the instruction was executed.
-    Theoretical Bandwidth: Indicates the maximum amount of bytes that could have been
-      loaded from, stored to, or atomically updated in the LDS divided by total duration.
-      Does not take into account the execution mask of the wavefront when the instruction
-      was executed.
-    Bank Conflict Rate: Indicates the percentage of active LDS cycles that were spent
-      servicing bank conflicts. Calculated as the ratio of LDS cycles spent servicing
-      bank conflicts over the number of LDS cycles that would have been required to
-      move the same amount of data in an uncontended access.
-    LDS Instructions: The total number of LDS instructions (including, but not limited
-      to, read/write/atomics and HIP's __shfl instructions) executed per normalization
-      unit.
-    LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return
-      / acknowledgment) required for an LDS instruction to complete.
-    Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS scheduler
-      due to bank conflicts (as determined by the conflict resolution hardware) to
-      the base number of cycles that would be spent in the LDS scheduler in a completely
-      uncontended case. This is the unnormalized form of the Bank Conflict Rate.
-    Index Accesses: The total number of cycles spent in the LDS scheduler over all
-      operations per normalization unit.
-    Atomic Return Cycles: The total number of cycles spent on LDS atomics with return
-      per normalization unit.
-    Bank Conflict: The total number of cycles spent in the LDS scheduler due to bank
-      conflicts (as determined by the conflict resolution hardware) per normalization
-      unit.
-    Addr Conflict: The total number of cycles spent in the LDS scheduler due to address
-      conflicts (as determined by the conflict resolution hardware) per normalization
-      unit.
-    Unaligned Stall: The total number of cycles spent in the LDS scheduler due to
-      stalls from non-dword aligned addresses per normalization unit.
-    Mem Violations: "The total number of out-of-bounds accesses made to the LDS, per\
-      \ normalization unit. This is unused and expected to be zero in most configurations\
-      \ for modern CDNA\u2122 accelerators."
   data source:
   - metric_table:
       id: 1201
@@ -87,7 +42,7 @@ Panel Config:
           avg: AVG((SQ_INSTS_LDS / $denom))
           min: MIN((SQ_INSTS_LDS / $denom))
           max: MAX((SQ_INSTS_LDS / $denom))
-          unit: (Instr  + $normUnit)
+          unit: (Instr + $normUnit)
         Theoretical Bandwidth:
           avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
             / (End_Timestamp - Start_Timestamp)))
@@ -117,29 +72,75 @@ Panel Config:
           avg: AVG((SQ_LDS_IDX_ACTIVE / $denom))
           min: MIN((SQ_LDS_IDX_ACTIVE / $denom))
           max: MAX((SQ_LDS_IDX_ACTIVE / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Atomic Return Cycles:
           avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom))
           min: MIN((SQ_LDS_ATOMIC_RETURN / $denom))
           max: MAX((SQ_LDS_ATOMIC_RETURN / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Bank Conflict:
           avg: AVG((SQ_LDS_BANK_CONFLICT / $denom))
           min: MIN((SQ_LDS_BANK_CONFLICT / $denom))
           max: MAX((SQ_LDS_BANK_CONFLICT / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Addr Conflict:
           avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom))
           min: MIN((SQ_LDS_ADDR_CONFLICT / $denom))
           max: MAX((SQ_LDS_ADDR_CONFLICT / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Unaligned Stall:
           avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom))
           min: MIN((SQ_LDS_UNALIGNED_STALL / $denom))
           max: MAX((SQ_LDS_UNALIGNED_STALL / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Mem Violations:
           avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom))
           min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom))
           max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom))
           unit: (Accesses + $normUnit)
+  metrics_description:
+    Utilization: Indicates what percent of the kernel's duration the LDS was actively
+      executing instructions (including, but not limited to, load, store, atomic and
+      HIP's __shfl operations). Calculated as the ratio of the total number of cycles
+      LDS was active over the total CU cycles.
+    Access Rate: Indicates the percentage of SIMDs in the VALU actively issuing LDS
+      instructions, averaged over the lifetime of the kernel. Calculated as the ratio
+      of the total number of cycles spent by the scheduler issuing LDS instructions
+      over the total CU cycles.
+    Theoretical Bandwidth Utilization: Indicates the maximum amount of bytes that
+      could have been loaded from, stored to, or atomically updated in the LDS, expressed
+      as a percentage of theoretical peak. Does not take into account the execution
+      mask of the wavefront when the instruction was executed.
+    Theoretical Bandwidth: Indicates the maximum amount of bytes that could have been
+      loaded from, stored to, or atomically updated in the LDS divided by total duration.
+      Does not take into account the execution mask of the wavefront when the instruction
+      was executed.
+    Bank Conflict Rate: Indicates the percentage of active LDS cycles that were spent
+      servicing bank conflicts. Calculated as the ratio of LDS cycles spent servicing
+      bank conflicts over the number of LDS cycles that would have been required to
+      move the same amount of data in an uncontended access.
+    LDS Instructions: The total number of LDS instructions (including, but not limited
+      to, read/write/atomics and HIP's __shfl instructions) executed per normalization
+      unit.
+    LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return
+      / acknowledgment) required for an LDS instruction to complete.
+    Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS scheduler
+      due to bank conflicts (as determined by the conflict resolution hardware) to
+      the base number of cycles that would be spent in the LDS scheduler in a completely
+      uncontended case. This is the unnormalized form of the Bank Conflict Rate.
+    Index Accesses: The total number of cycles spent in the LDS scheduler over all
+      operations per normalization unit.
+    Atomic Return Cycles: The total number of cycles spent on LDS atomics with return
+      per normalization unit.
+    Bank Conflict: The total number of cycles spent in the LDS scheduler due to bank
+      conflicts (as determined by the conflict resolution hardware) per normalization
+      unit.
+    Addr Conflict: The total number of cycles spent in the LDS scheduler due to address
+      conflicts (as determined by the conflict resolution hardware) per normalization
+      unit.
+    Unaligned Stall: The total number of cycles spent in the LDS scheduler due to
+      stalls from non-dword aligned addresses per normalization unit.
+    Mem Violations: |-
+      The total number of out-of-bounds accesses made to the LDS, per normalization
+      unit. This is unused and expected to be zero in most configurations for
+      modern CDNA™ accelerators.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1300_instruction_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1300_instruction_cache.yaml
index aeda9bc6c7..35808d9d96 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1300_instruction_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1300_instruction_cache.yaml
@@ -2,28 +2,6 @@
 Panel Config:
   id: 1300
   title: Instruction Cache
-  metrics_description:
-    Bandwidth Utilization: The number of bytes looked up in the L1I cache, as a percent
-      of the peak theoretical bandwidth. Calculated as the ratio of L1I requests over
-      the total L1I cycles.
-    Cache Hit Rate: The percent of L1I requests that hit [#l1i-cache]_ on a previously
-      loaded line the cache. Calculated as the ratio of the number of L1I requests
-      that hit over the number of all L1I requests.
-    L1I-L2 Bandwidth Utilization: "The percent of the peak theoretical L1I \u2192\
-      \ L2 cache request bandwidth achieved. Calculated as the ratio of the total\
-      \ number of requests from the L1I to the L2 cache over the total L1I-L2 interface\
-      \ cycles."
-    L1I-L2 Bandwidth: Total number of bytes transferred across L1I - L2 interface
-      divided by total duration.
-    Req: The total number of requests made to the L1I per normalization-unit
-    Hits: The total number of L1I requests that hit on a previously loaded cache line,
-      per normalization-unit.
-    Misses - Non Duplicated: The total number of L1I requests that missed on a cache
-      line that were not already pending due to another request, per normalization-unit.
-    Misses - Duplicated: The total number of L1I requests that missed on a cache line
-      that were already pending due to another request, per normalization-unit.
-    Instruction Fetch Latency: The average number of cycles spent to fetch instructions
-      to a CU.
   data source:
   - metric_table:
       id: 1301
@@ -62,22 +40,22 @@ Panel Config:
           avg: AVG((SQC_ICACHE_REQ / $denom))
           min: MIN((SQC_ICACHE_REQ / $denom))
           max: MAX((SQC_ICACHE_REQ / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Hits:
           avg: AVG((SQC_ICACHE_HITS / $denom))
           min: MIN((SQC_ICACHE_HITS / $denom))
           max: MAX((SQC_ICACHE_HITS / $denom))
-          unit: (Hits  + $normUnit)
+          unit: (Hits + $normUnit)
         Misses - Non Duplicated:
           avg: AVG((SQC_ICACHE_MISSES / $denom))
           min: MIN((SQC_ICACHE_MISSES / $denom))
           max: MAX((SQC_ICACHE_MISSES / $denom))
-          unit: (Misses  + $normUnit)
+          unit: (Misses + $normUnit)
         Misses - Duplicated:
           avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom))
           min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom))
           max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom))
-          unit: (Misses  + $normUnit)
+          unit: (Misses + $normUnit)
         Cache Hit Rate:
           avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
             + SQC_ICACHE_MISSES_DUPLICATE)))
@@ -107,3 +85,25 @@ Panel Config:
           min: MIN(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
           max: MAX(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
           unit: Gbps
+  metrics_description:
+    Bandwidth Utilization: The number of bytes looked up in the L1I cache, as a percent
+      of the peak theoretical bandwidth. Calculated as the ratio of L1I requests over
+      the total L1I cycles.
+    Cache Hit Rate: The percent of L1I requests that hit [#l1i-cache]_ on a previously
+      loaded line in the cache. Calculated as the ratio of the number of L1I requests
+      that hit over the number of all L1I requests.
+    L1I-L2 Bandwidth Utilization: |-
+      The percent of the peak theoretical L1I → L2 cache request bandwidth
+      achieved. Calculated as the ratio of the total number of requests from the
+      L1I to the L2 cache over the total L1I-L2 interface cycles.
+    L1I-L2 Bandwidth: Total number of bytes transferred across L1I - L2 interface
+      divided by total duration.
+    Req: The total number of requests made to the L1I per normalization-unit.
+    Hits: The total number of L1I requests that hit on a previously loaded cache line,
+      per normalization-unit.
+    Misses - Non Duplicated: The total number of L1I requests that missed on a cache
+      line that were not already pending due to another request, per normalization-unit.
+    Misses - Duplicated: The total number of L1I requests that missed on a cache line
+      that were already pending due to another request, per normalization-unit.
+    Instruction Fetch Latency: The average number of cycles spent to fetch instructions
+      to a CU.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1400_scalar_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1400_scalar_l1_data_cache.yaml
index 282b97ad1f..6b73164848 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1400_scalar_l1_data_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1400_scalar_l1_data_cache.yaml
@@ -2,49 +2,6 @@
 Panel Config:
   id: 1400
   title: Scalar L1 Data Cache
-  metrics_description:
-    Bandwidth Utilization: The number of bytes looked up in the sL1D cache, as a percent
-      of the peak theoretical bandwidth. Calculated as the ratio of sL1D requests
-      over the total sL1D cycles.
-    Cache Hit Rate: Indicates the percent of sL1D requests that hit on a previously
-      loaded line the cache. The ratio of the number of sL1D requests that hit over
-      the number of all sL1D requests.
-    sL1D-L2 BW Utilization: The percentage of the peak theoretical sL1D - L2 interface
-      bandwidth acheived.\ \ Caclulated as total number of bytes read from, written
-      to, or atomically updated\ \ across the sL1D - L2 interface.
-    sL1D-L2 BW: "The total number of bytes read from, written to, or atomically updated\
-      \ across the sL1D\u2194L2 interface, divided by total duration. Note that sL1D\
-      \ writes and atomics are typically unused on current CDNA accelerators, so in\
-      \ the majority of cases this can be interpreted as an sL1D\u2192L2 read bandwidth."
-    Req: The total number of requests, of any size or type, made to the sL1D per normalization
-      unit.
-    Hits: The total number of sL1D requests that hit on a previously loaded cache
-      line, per normalization unit.
-    Misses - Non Duplicated: 'The total number of sL1D requests that missed on a cache
-      line that was not already pending due to another request, per normalization
-      unit. '
-    Misses- Duplicated: The total number of sL1D requests that missed on a cache line
-      that was already pending due to another request, per normalization unit.
-    Read Req (Total): The total number of sL1D read requests of any size, per normalization
-      unit.
-    Atomic Req: The total number of atomic requests from sL1D to the L2, per normalization
-      unit. Typically unused on current CDNA accelerators.
-    Read Req (1 DWord): The total number of sL1D read requests made for a single dword
-      of data (4B), per normalization unit.
-    Read Req (2 DWord): The total number of sL1D read requests made for a two dwords
-      of data (8B), per normalization unit.
-    Read Req (4 DWord): The total number of sL1D read requests made for a four dwords
-      of data (16B), per normalization unit.
-    Read Req (8 DWord): The total number of sL1D read requests made for a eight dwords
-      of data (32B), per normalization unit.
-    Read Req (16 DWord): The total number of sL1D read requests made for a sixteen
-      dwords of data (64B), per normalization unit.
-    Read Req: The total number of read requests from sL1D to the L2 per normalization
-      unit.
-    Write Req: The total number of write requests from sL1D to the L2, per normalization
-      unit. Typically unused on current CDNA accelerators.
-    Stall Cycles: "The total number of cycles the sL1D\u2194L2 interface was stalled,\
-      \ per normalization unit."
   data source:
   - metric_table:
       id: 1401
@@ -84,22 +41,22 @@ Panel Config:
           avg: AVG((SQC_DCACHE_REQ / $denom))
           min: MIN((SQC_DCACHE_REQ / $denom))
           max: MAX((SQC_DCACHE_REQ / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Hits:
           avg: AVG((SQC_DCACHE_HITS / $denom))
           min: MIN((SQC_DCACHE_HITS / $denom))
           max: MAX((SQC_DCACHE_HITS / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Misses - Non Duplicated:
           avg: AVG((SQC_DCACHE_MISSES / $denom))
           min: MIN((SQC_DCACHE_MISSES / $denom))
           max: MAX((SQC_DCACHE_MISSES / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Misses- Duplicated:
           avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom))
           min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom))
           max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Cache Hit Rate:
           avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
             + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
@@ -118,37 +75,37 @@ Panel Config:
             + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
           max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
             + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Atomic Req:
           avg: AVG((SQC_DCACHE_ATOMIC / $denom))
           min: MIN((SQC_DCACHE_ATOMIC / $denom))
           max: MAX((SQC_DCACHE_ATOMIC / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req (1 DWord):
           avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom))
           min: MIN((SQC_DCACHE_REQ_READ_1 / $denom))
           max: MAX((SQC_DCACHE_REQ_READ_1 / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req (2 DWord):
           avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom))
           min: MIN((SQC_DCACHE_REQ_READ_2 / $denom))
           max: MAX((SQC_DCACHE_REQ_READ_2 / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req (4 DWord):
           avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom))
           min: MIN((SQC_DCACHE_REQ_READ_4 / $denom))
           max: MAX((SQC_DCACHE_REQ_READ_4 / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req (8 DWord):
           avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom))
           min: MIN((SQC_DCACHE_REQ_READ_8 / $denom))
           max: MAX((SQC_DCACHE_REQ_READ_8 / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req (16 DWord):
           avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom))
           min: MIN((SQC_DCACHE_REQ_READ_16 / $denom))
           max: MAX((SQC_DCACHE_REQ_READ_16 / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
   - metric_table:
       id: 1403
       title: Scalar L1D Cache - L2 Interface
@@ -171,19 +128,65 @@ Panel Config:
           avg: AVG((SQC_TC_DATA_READ_REQ / $denom))
           min: MIN((SQC_TC_DATA_READ_REQ / $denom))
           max: MAX((SQC_TC_DATA_READ_REQ / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Write Req:
           avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom))
           min: MIN((SQC_TC_DATA_WRITE_REQ / $denom))
           max: MAX((SQC_TC_DATA_WRITE_REQ / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Atomic Req:
           avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom))
           min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom))
           max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Stall Cycles:
           avg: AVG((SQC_TC_STALL / $denom))
           min: MIN((SQC_TC_STALL / $denom))
           max: MAX((SQC_TC_STALL / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
+  metrics_description:
+    Bandwidth Utilization: The number of bytes looked up in the sL1D cache, as a percent
+      of the peak theoretical bandwidth. Calculated as the ratio of sL1D requests
+      over the total sL1D cycles.
+    Cache Hit Rate: Indicates the percent of sL1D requests that hit on a previously
+      loaded line in the cache. The ratio of the number of sL1D requests that hit over
+      the number of all sL1D requests.
+    sL1D-L2 BW Utilization: The percentage of the peak theoretical sL1D - L2 interface
+      bandwidth achieved. Calculated as total number of bytes read from, written to,
+      or atomically updated across the sL1D - L2 interface.
+    sL1D-L2 BW: |-
+      The total number of bytes read from, written to, or atomically updated
+      across the sL1D↔L2 interface, divided by total duration. Note that sL1D
+      writes and atomics are typically unused on current CDNA accelerators, so
+      in the majority of cases this can be interpreted as an sL1D→L2 read
+      bandwidth.
+    Req: The total number of requests, of any size or type, made to the sL1D per normalization
+      unit.
+    Hits: The total number of sL1D requests that hit on a previously loaded cache
+      line, per normalization unit.
+    Misses - Non Duplicated: |-
+      The total number of sL1D requests that missed on a cache line that was
+      not already pending due to another request, per normalization unit.
+    Misses- Duplicated: The total number of sL1D requests that missed on a cache line
+      that was already pending due to another request, per normalization unit.
+    Read Req (Total): The total number of sL1D read requests of any size, per normalization
+      unit.
+    Atomic Req: The total number of atomic requests from sL1D to the L2, per normalization
+      unit. Typically unused on current CDNA accelerators.
+    Read Req (1 DWord): The total number of sL1D read requests made for a single dword
+      of data (4B), per normalization unit.
+    Read Req (2 DWord): The total number of sL1D read requests made for two dwords
+      of data (8B), per normalization unit.
+    Read Req (4 DWord): The total number of sL1D read requests made for four dwords
+      of data (16B), per normalization unit.
+    Read Req (8 DWord): The total number of sL1D read requests made for eight dwords
+      of data (32B), per normalization unit.
+    Read Req (16 DWord): The total number of sL1D read requests made for sixteen
+      dwords of data (64B), per normalization unit.
+    Read Req: The total number of read requests from sL1D to the L2 per normalization
+      unit.
+    Write Req: The total number of write requests from sL1D to the L2, per normalization
+      unit. Typically unused on current CDNA accelerators.
+    Stall Cycles: |-
+      The total number of cycles the sL1D↔L2 interface was stalled, per
+      normalization unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1500_address_processing_unit_and_data_return_path_ta_td.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1500_address_processing_unit_and_data_return_path_ta_td.yaml
index 0d826ceb1b..e33f3fc593 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1500_address_processing_unit_and_data_return_path_ta_td.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1500_address_processing_unit_and_data_return_path_ta_td.yaml
@@ -2,70 +2,6 @@
 Panel Config:
   id: 1500
   title: Address Processing Unit and Data Return Path (TA/TD)
-  metrics_description:
-    Address Processing Unit Busy: Percent of the total CU cycles the address processor
-      was busy
-    Address Stall: Percent of the total CU cycles the address processor was stalled
-      from sending address requests further into the vL1D pipeline.
-    Data Stall: Percent of the total CU cycles the address processor was stalled from
-      sending write/atomic data further into the vL1D pipeline.
-    "Data-Processor \u2192 Address Stall": Percent of total CU cycles the address
-      processor was stalled waiting to send command data to the data processor.
-    Total Instructions: The total number of memory instructions executed by the address
-      processer over all compute units on the accelerator, per normalization unit.
-    Global/Generic Instructions: The total number of global & generic memory instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Global/Generic Read Instructions: The total number of global & generic memory
-      read instructions executed on all compute units on the accelerator, per normalization
-      unit.
-    Global/Generic Write Instructions: The total number of global & generic memory
-      write instructions executed on all compute units on the accelerator, per normalization
-      unit.
-    Global/Generic Atomic Instructions: The total number of global & generic memory
-      atomic (with and without return) instructions executed on all compute units
-      on the accelerator, per normalization unit.
-    Spill/Stack Instructions: The total number of spill/stack memory instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Read Instructions: The total number of spill/stack memory read instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Write Instructions: The total number of spill/stack memory write instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Atomic Instructions: The total number of spill/stack memory atomic
-      (with and without return) instructions executed on all compute units on the
-      accelerator, per normalization unit. Typically unused as these memory operations
-      are typically used to implement thread-local storage.
-    Spill/Stack Total Cycles: The number of cycles the address processing unit spent
-      working on spill/stack instructions, per normalization unit.
-    Spill/Stack Coalesced Read: The number of cycles the address processing unit spent
-      working on coalesced spill/stack read instructions, per normalization unit.
-    Spill/Stack Coalesced Write: The number of cycles the address processing unit
-      spent working on coalesced spill/stack write instructions, per normalization
-      unit.
-    Data-Return Busy: Percent of the total CU cycles the data-return unit was busy
-      processing or waiting on data to return to the CU.
-    "Cache RAM \u2192 Data-Return Stall": Percent of the total CU cycles the data-return
-      unit was stalled on data to be returned from the vL1D Cache RAM.
-    "Workgroup manager \u2192 Data-Return Stall": Percent of the total CU cycles the
-      data-return unit was stalled by the workgroup manager due to initialization
-      of registers as a part of launching new workgroups.
-    Coalescable Instructions: The number of instructions submitted to the data-return
-      unit by the address processor that were found to be coalescable, per normalization
-      unit.
-    Read Instructions: The number of read instructions submitted to the data-return
-      unit by the address processor summed over all compute units on the accelerator,
-      per normalization unit. This is expected to be the sum of global/generic and
-      spill/stack reads in the address processor.
-    Write Instructions: The number of store instructions submitted to the data-return
-      unit by the address processor summed over all compute units on the accelerator,
-      per normalization unit. This is expected to be the sum of global/generic and
-      spill/stack stores in the address processor.
-    Atomic Instructions: The number of atomic instructions submitted to the data-return
-      unit by the address processor summed over all compute units on the accelerator,
-      per normalization unit. This is expected to be the sum of global/generic and
-      spill/stack atomics in the address processor.
-    Write Ack Instructions: The total number of write acknowledgements submitted by
-      data-return unit to SQ, summed over all compute units on the accelerator, per
-      normalization unit.
   data source:
   - metric_table:
       id: 1501
@@ -135,47 +71,47 @@ Panel Config:
           avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom))
           min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom))
           max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Global/Generic Instructions:
           avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
           min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
           max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Global/Generic Read Instructions:
           avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
           min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
           max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Global/Generic Write Instructions:
           avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
           min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
           max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Global/Generic Atomic Instructions:
           avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
           min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
           max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Spill/Stack Instructions:
           avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
           min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
           max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Spill/Stack Read Instructions:
           avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
           min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
           max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Spill/Stack Write Instructions:
           avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
           min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
           max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Spill/Stack Atomic Instructions:
           avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
           min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-          max: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
+          max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
   - metric_table:
       id: 1503
       title: Spill and stack metrics
@@ -190,17 +126,17 @@ Panel Config:
           avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
           min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
           max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Spill/Stack Coalesced Read:
           avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
           min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
           max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Spill/Stack Coalesced Write:
           avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
           min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
           max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
   - metric_table:
       id: 1504
       title: Vector L1 data-return path or Texture Data (TD)
@@ -230,7 +166,7 @@ Panel Config:
           avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom))
           min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom))
           max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Read Instructions:
           avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
             / $denom))
@@ -238,14 +174,75 @@ Panel Config:
             / $denom))
           max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
             / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Write Instructions:
           avg: AVG((TD_STORE_WAVEFRONT_sum / $denom))
           min: MIN((TD_STORE_WAVEFRONT_sum / $denom))
           max: MAX((TD_STORE_WAVEFRONT_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Atomic Instructions:
           avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom))
           min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom))
           max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
+  metrics_description:
+    Address Processing Unit Busy: Percent of the total CU cycles the address processor
+      was busy.
+    Address Stall: Percent of the total CU cycles the address processor was stalled
+      from sending address requests further into the vL1D pipeline.
+    Data Stall: Percent of the total CU cycles the address processor was stalled from
+      sending write/atomic data further into the vL1D pipeline.
+    "Data-Processor \u2192 Address Stall": Percent of total CU cycles the address
+      processor was stalled waiting to send command data to the data processor.
+    Total Instructions: The total number of memory instructions executed by the address
+      processor over all compute units on the accelerator, per normalization unit.
+    Global/Generic Instructions: The total number of global & generic memory instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Global/Generic Read Instructions: The total number of global & generic memory
+      read instructions executed on all compute units on the accelerator, per normalization
+      unit.
+    Global/Generic Write Instructions: The total number of global & generic memory
+      write instructions executed on all compute units on the accelerator, per normalization
+      unit.
+    Global/Generic Atomic Instructions: The total number of global & generic memory
+      atomic (with and without return) instructions executed on all compute units
+      on the accelerator, per normalization unit.
+    Spill/Stack Instructions: The total number of spill/stack memory instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Read Instructions: The total number of spill/stack memory read instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Write Instructions: The total number of spill/stack memory write instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Atomic Instructions: The total number of spill/stack memory atomic
+      (with and without return) instructions executed on all compute units on the
+      accelerator, per normalization unit. Typically unused as these memory operations
+      are typically used to implement thread-local storage.
+    Spill/Stack Total Cycles: The number of cycles the address processing unit spent
+      working on spill/stack instructions, per normalization unit.
+    Spill/Stack Coalesced Read: The number of cycles the address processing unit spent
+      working on coalesced spill/stack read instructions, per normalization unit.
+    Spill/Stack Coalesced Write: The number of cycles the address processing unit
+      spent working on coalesced spill/stack write instructions, per normalization
+      unit.
+    Data-Return Busy: Percent of the total CU cycles the data-return unit was busy
+      processing or waiting on data to return to the CU.
+    "Cache RAM \u2192 Data-Return Stall": Percent of the total CU cycles the data-return
+      unit was stalled on data to be returned from the vL1D Cache RAM.
+    "Workgroup manager \u2192 Data-Return Stall": Percent of the total CU cycles the
+      data-return unit was stalled by the workgroup manager due to initialization
+      of registers as a part of launching new workgroups.
+    Coalescable Instructions: The number of instructions submitted to the data-return
+      unit by the address processor that were found to be coalescable, per normalization
+      unit.
+    Read Instructions: The number of read instructions submitted to the data-return
+      unit by the address processor summed over all compute units on the accelerator,
+      per normalization unit. This is expected to be the sum of global/generic and
+      spill/stack reads in the address processor.
+    Write Instructions: The number of store instructions submitted to the data-return
+      unit by the address processor summed over all compute units on the accelerator,
+      per normalization unit. This is expected to be the sum of global/generic and
+      spill/stack stores in the address processor.
+    Atomic Instructions: The number of atomic instructions submitted to the data-return
+      unit by the address processor summed over all compute units on the accelerator,
+      per normalization unit. This is expected to be the sum of global/generic and
+      spill/stack atomics in the address processor.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1600_vector_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1600_vector_l1_data_cache.yaml
index b374ea9466..2be99f875f 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1600_vector_l1_data_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1600_vector_l1_data_cache.yaml
@@ -2,117 +2,6 @@
 Panel Config:
   id: 1600
   title: Vector L1 Data Cache
-  metrics_description:
-    Hit rate: The ratio of the number of vL1D cache line requests that hit in vL1D
-      cache over the total number of cache line requests to the vL1D Cache RAM.
-    Bandwidth Utilization: The number of bytes looked up in the vL1D cache as a result
-      of VMEM instructions, as a percent of the peak theoretical bandwidth achievable
-      on the specific accelerator. The number of bytes is calculated as the number
-      of cache lines requested multiplied by the cache line size. This value does
-      not consider partial requests, so for instance, if only a single value is requested
-      in a cache line, the data movement will still be counted as a full cache line.
-    Utilization: Indicates how busy the vL1D Cache RAM was during the kernel execution.
-      The number of cycles where the vL1D Cache RAM is actively processing any request
-      divided by the number of cycles where the vL1D is active.
-    Coalescing: Indicates how well memory instructions were coalesced by the address
-      processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated
-      as the average number of thread-requests generated per instruction divided by
-      the ideal number of thread-requests per instruction.
-    Stalled on L2 Data: The ratio of the number of cycles where the vL1D is stalled
-      waiting for requested data to return from the L2 cache divided by the number
-      of cycles where the vL1D is active.
-    Stalled on L2 Req: The ratio of the number of cycles where the vL1D is stalled
-      waiting to issue a request for data to the L2 cache divided by the number of
-      cycles where the vL1D is active.
-    Tag RAM Stall (Read): The ratio of the number of cycles where the vL1D is stalled
-      due to Read requests with conflicting tags being looked up concurrently, divided
-      by the number of cycles where the vL1D is active.
-    Tag RAM Stall (Write): The ratio of the number of cycles where the vL1D is stalled
-      due to Write requests with conflicting tags being looked up concurrently, divided
-      by the number of cycles where the vL1D is active.
-    Tag RAM Stall (Atomic): The ratio of the number of cycles where the vL1D is stalled
-      due to Atomic requests with conflicting tags being looked up concurrently, divided
-      by the number of cycles where the vL1D is active.
-    Total Req: The total number of incoming requests from the address processing unit
-      after coalescing.
-    Read Req: The total number of incoming read requests from the address processing
-      unit after coalescing per normalization unit.
-    Write Req: The total number of incoming write requests from the address processing
-      unit after coalescing per normalization unit.
-    Atomic Req: The total number of incoming atomic requests from the address processing
-      unit after coalescing per normalization unit.
-    Cache BW: The number of bytes looked up in the vL1D cache as a result of VMEM
-      instructions divided by total duration. The number of bytes is calculated as
-      the number of cache lines requested multiplied by the cache line size.  This
-      value does not consider partial requests, so for instance, if only a single
-      value is requested in a cache line, the data movement will still be counted
-      as a full cache line.
-    Cache Hit Rate: The ratio of the number of vL1D cache line requests that hit in
-      vL1D cache over the total number of cache line requests to the vL1D Cache RAM.
-    Cache Accesses: The total number of cache line lookups in the vL1D.
-    Cache Hits: The number of cache accesses minus the number of outgoing requests
-      to the L2 cache, that is, the number of cache line requests serviced by the
-      vL1D Cache RAM per normalization unit.
-    Invalidations: The number of times the vL1D was issued a write-back invalidate
-      command during the kernel's execution per normalization unit. This may be triggered
-      by, for instance, the buffer_wbinvl1 instruction.
-    L1-L2 BW: The number of bytes transferred across the vL1D-L2 interface as a result
-      of VMEM instructions, divided by total duration. The number of bytes is calculated
-      as the number of cache lines requested multiplied by the cache line size. This
-      value does not consider partial requests, so for instance, if only a single
-      value is requested in a cache line, the data movement will still be counted
-      as a full cache line.
-    L1-L2 Read: The number of read requests for a vL1D cache line that were not satisfied
-      by the vL1D and must be retrieved from the to the L2 Cache per normalization
-      unit.
-    L1-L2 Write: The number of write requests to a vL1D cache line that were sent
-      through the vL1D to the L2 cache, per normalization unit.
-    L1-L2 Atomic: The number of atomic requests that are sent through the vL1D to
-      the L2 cache, per normalization unit. This includes requests for atomics with,
-      and without return.
-    L1 Access Latency: Calculated as the average number of cycles that a vL1D cache
-      line request spent in the vL1D cache pipeline.
-    L1-L2 Read Latency: Calculated as the average number of cycles that the vL1D cache
-      took to issue and receive read requests from the L2 Cache. This number also
-      includes requests for atomics with return values.
-    L1-L2 Write Latency: Calculated as the average number of cycles that the vL1D
-      cache took to issue and receive acknowledgement of a write request to the L2
-      Cache. This number also includes requests for atomics without return values.
-    NC - Read: Total read requests with NC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    UC - Read: Total read requests with UC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    CC - Read: Total read requests with CC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    RW - Read: Total read requests with RW mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    RW - Write: Total write requests with RW mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    NC - Write: Total write requests with NC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    UC - Write: Total write requests with UC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    CC - Write: Total write requests with CC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    NC - Atomic: Total atomic requests with NC mtype from this TCP to all TCCs Sum
-      over TCP instances per normalization unit.
-    UC - Atomic: Total atomic requests with UC mtype from this TCP to all TCCs Sum
-      over TCP instances per normalization unit.
-    CC - Atomic: Total atomic requests with CC mtype from this TCP to all TCCs Sum
-      over TCP instances per normalization unit.
-    RW - Atomic: Total atomic requests with RW mtype from this TCP to all TCCs Sum
-      over TCP instances per normalization unit.
-    Req: The number of translation requests made to the UTCL1 per normalization unit.
-    Hit Ratio: The ratio of the number of translation requests that hit in the UTCL1
-      divided by the total number of translation requests made to the UTCL1.
-    Hits: The number of translation requests that hit in the UTCL1, and could be reused,
-      per normalization unit.
-    Translation Misses: The total number of translation requests that missed in the
-      UTCL1 due to  translation not being present in the cache, per normalization
-      unit.
-    Permission Misses: "The total number of translation requests that missed in the\
-      \ UTCL1 due to a permission error, per normalization unit. This is unused and\
-      \ expected to be zero in most configurations for modern CDNA\u2122 accelerators."
   data source:
   - metric_table:
       id: 1601
@@ -181,17 +70,17 @@ Panel Config:
           avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom))
           min: MIN((TCP_TOTAL_ACCESSES_sum / $denom))
           max: MAX((TCP_TOTAL_ACCESSES_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req:
           avg: AVG((TCP_TOTAL_READ_sum / $denom))
           min: MIN((TCP_TOTAL_READ_sum / $denom))
           max: MAX((TCP_TOTAL_READ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Write Req:
           avg: AVG((TCP_TOTAL_WRITE_sum / $denom))
           min: MIN((TCP_TOTAL_WRITE_sum / $denom))
           max: MAX((TCP_TOTAL_WRITE_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Atomic Req:
           avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
             / $denom))
@@ -199,7 +88,7 @@ Panel Config:
             / $denom))
           max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
             / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Cache BW:
           avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp)))
           min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp)))
@@ -223,7 +112,7 @@ Panel Config:
           avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
           min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
           max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Cache Hits:
           avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
             + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
@@ -234,7 +123,7 @@ Panel Config:
           max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
             + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
             / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Invalidations:
           avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
           min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
@@ -252,12 +141,12 @@ Panel Config:
           avg: AVG((TCP_TCC_READ_REQ_sum / $denom))
           min: MIN((TCP_TCC_READ_REQ_sum / $denom))
           max: MAX((TCP_TCC_READ_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         L1-L2 Write:
           avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom))
           min: MIN((TCP_TCC_WRITE_REQ_sum / $denom))
           max: MAX((TCP_TCC_WRITE_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         L1-L2 Atomic:
           avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
             / $denom))
@@ -265,7 +154,7 @@ Panel Config:
             / $denom))
           max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
             / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         L1 Access Latency:
           avg: AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum
             != 0) else None))
@@ -314,84 +203,84 @@ Panel Config:
           avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom))
           min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom))
           max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         UC - Read:
           xfer: Read
           coherency: UC
           avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom))
           min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom))
           max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         CC - Read:
           xfer: Read
           coherency: CC
           avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom))
           min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom))
           max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         RW - Read:
           xfer: Read
           coherency: RW
           avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom))
           min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom))
           max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         RW - Write:
           xfer: Write
           coherency: RW
           avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom))
           min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom))
           max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         NC - Write:
           xfer: Write
           coherency: NC
           avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom))
           min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom))
           max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         UC - Write:
           xfer: Write
           coherency: UC
           avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom))
           min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom))
           max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         CC - Write:
           xfer: Write
           coherency: CC
           avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom))
           min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom))
           max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         NC - Atomic:
           xfer: Atomic
           coherency: NC
           avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
           min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
           max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         UC - Atomic:
           xfer: Atomic
           coherency: UC
           avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
           min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
           max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         CC - Atomic:
           xfer: Atomic
           coherency: CC
           avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
           min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
           max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         RW - Atomic:
           xfer: Atomic
           coherency: RW
           avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
           min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
           max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
   - metric_table:
       id: 1605
       title: L1 Unified Translation Cache (UTCL1)
@@ -440,3 +329,114 @@ Panel Config:
         max: Max
         units: Unit
       metric: {}
+  metrics_description:
+    Hit rate: The ratio of the number of vL1D cache line requests that hit in vL1D
+      cache over the total number of cache line requests to the vL1D Cache RAM.
+    Bandwidth Utilization: The number of bytes looked up in the vL1D cache as a result
+      of VMEM instructions, as a percent of the peak theoretical bandwidth achievable
+      on the specific accelerator. The number of bytes is calculated as the number
+      of cache lines requested multiplied by the cache line size. This value does
+      not consider partial requests, so for instance, if only a single value is requested
+      in a cache line, the data movement will still be counted as a full cache line.
+    Utilization: Indicates how busy the vL1D Cache RAM was during the kernel execution.
+      The number of cycles where the vL1D Cache RAM is actively processing any request
+      divided by the number of cycles where the vL1D is active.
+    Coalescing: Indicates how well memory instructions were coalesced by the address
+      processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated
+      as the average number of thread-requests generated per instruction divided by
+      the ideal number of thread-requests per instruction.
+    Stalled on L2 Data: The ratio of the number of cycles where the vL1D is stalled
+      waiting for requested data to return from the L2 cache divided by the number
+      of cycles where the vL1D is active.
+    Stalled on L2 Req: The ratio of the number of cycles where the vL1D is stalled
+      waiting to issue a request for data to the L2 cache divided by the number of
+      cycles where the vL1D is active.
+    Tag RAM Stall (Read): The ratio of the number of cycles where the vL1D is stalled
+      due to Read requests with conflicting tags being looked up concurrently, divided
+      by the number of cycles where the vL1D is active.
+    Tag RAM Stall (Write): The ratio of the number of cycles where the vL1D is stalled
+      due to Write requests with conflicting tags being looked up concurrently, divided
+      by the number of cycles where the vL1D is active.
+    Tag RAM Stall (Atomic): The ratio of the number of cycles where the vL1D is stalled
+      due to Atomic requests with conflicting tags being looked up concurrently, divided
+      by the number of cycles where the vL1D is active.
+    Total Req: The total number of incoming requests from the address processing unit
+      after coalescing.
+    Read Req: The total number of incoming read requests from the address processing
+      unit after coalescing per normalization unit.
+    Write Req: The total number of incoming write requests from the address processing
+      unit after coalescing per normalization unit.
+    Atomic Req: The total number of incoming atomic requests from the address processing
+      unit after coalescing per normalization unit.
+    Cache BW: The number of bytes looked up in the vL1D cache as a result of VMEM
+      instructions divided by total duration. The number of bytes is calculated as
+      the number of cache lines requested multiplied by the cache line size. This
+      value does not consider partial requests, so for instance, if only a single
+      value is requested in a cache line, the data movement will still be counted
+      as a full cache line.
+    Cache Hit Rate: The ratio of the number of vL1D cache line requests that hit in
+      vL1D cache over the total number of cache line requests to the vL1D Cache RAM.
+    Cache Accesses: The total number of cache line lookups in the vL1D.
+    Cache Hits: The number of cache accesses minus the number of outgoing requests
+      to the L2 cache, that is, the number of cache line requests serviced by the
+      vL1D Cache RAM per normalization unit.
+    Invalidations: The number of times the vL1D was issued a write-back invalidate
+      command during the kernel's execution per normalization unit. This may be triggered
+      by, for instance, the buffer_wbinvl1 instruction.
+    L1-L2 BW: The number of bytes transferred across the vL1D-L2 interface as a result
+      of VMEM instructions, divided by total duration. The number of bytes is calculated
+      as the number of cache lines requested multiplied by the cache line size. This
+      value does not consider partial requests, so for instance, if only a single
+      value is requested in a cache line, the data movement will still be counted
+      as a full cache line.
+    L1-L2 Read: The number of read requests for a vL1D cache line that were not satisfied
+      by the vL1D and must be retrieved from the L2 cache, per normalization
+      unit.
+    L1-L2 Write: The number of write requests to a vL1D cache line that were sent
+      through the vL1D to the L2 cache, per normalization unit.
+    L1-L2 Atomic: The number of atomic requests that are sent through the vL1D to
+      the L2 cache, per normalization unit. This includes requests for atomics with,
+      and without return.
+    L1 Access Latency: Calculated as the average number of cycles that a vL1D cache
+      line request spent in the vL1D cache pipeline.
+    L1-L2 Read Latency: Calculated as the average number of cycles that the vL1D cache
+      took to issue and receive read requests from the L2 Cache. This number also
+      includes requests for atomics with return values.
+    L1-L2 Write Latency: Calculated as the average number of cycles that the vL1D
+      cache took to issue and receive acknowledgement of a write request to the L2
+      Cache. This number also includes requests for atomics without return values.
+    NC - Read: Total read requests with NC mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    UC - Read: Total read requests with UC mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    CC - Read: Total read requests with CC mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    RW - Read: Total read requests with RW mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    RW - Write: Total write requests with RW mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    NC - Write: Total write requests with NC mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    UC - Write: Total write requests with UC mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    CC - Write: Total write requests with CC mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    NC - Atomic: Total atomic requests with NC mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    UC - Atomic: Total atomic requests with UC mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    CC - Atomic: Total atomic requests with CC mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    RW - Atomic: Total atomic requests with RW mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    Req: The number of translation requests made to the UTCL1 per normalization unit.
+    Hit Ratio: The ratio of the number of translation requests that hit in the UTCL1
+      divided by the total number of translation requests made to the UTCL1.
+    Hits: The number of translation requests that hit in the UTCL1, and could be reused,
+      per normalization unit.
+    Translation Misses: The total number of translation requests that missed in the
+      UTCL1 due to translation not being present in the cache, per normalization unit.
+    Permission Misses: "The total number of translation requests that missed in
+      the UTCL1 due to a permission error, per normalization unit. This is unused
+      and expected to be zero in most configurations for modern CDNA\u2122
+      accelerators."
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1700_l2_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1700_l2_cache.yaml
index b208bc32bd..2519779dff 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1700_l2_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1700_l2_cache.yaml
@@ -2,6 +2,350 @@
 Panel Config:
   id: 1700
   title: L2 Cache
+  data source:
+  - metric_table:
+      id: 1701
+      title: L2 Speed-of-Light
+      header:
+        metric: Metric
+        value: Avg
+        unit: Unit
+      metric:
+        Utilization:
+          value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD)))
+          unit: pct
+        Peak Bandwidth:
+          value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
+            / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
+          unit: pct
+        Hit Rate:
+          value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
+            + TCC_MISS_sum) != 0) else 0))
+          unit: pct
+        L2-Fabric Read BW:
+          value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum)
+            * 64)) / (End_Timestamp - Start_Timestamp)))
+          unit: GB/s
+        L2-Fabric Write and Atomic BW:
+          value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum)
+            * 32)) / (End_Timestamp - Start_Timestamp)))
+          unit: GB/s
+        HBM Bandwidth:
+          value: $hbmBandwidth
+          unit: GB/s
+  - metric_table:
+      id: 1702
+      title: L2-Fabric interface metrics
+      header:
+        metric: Metric
+        avg: Avg
+        min: Min
+        max: Max
+        unit: Unit
+      metric:
+        Read BW:
+          avg: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum)
+            * 64)) / (End_Timestamp - Start_Timestamp)))
+          min: MIN((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum)
+            * 64)) / (End_Timestamp - Start_Timestamp)))
+          max: MAX((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum)
+            * 64)) / (End_Timestamp - Start_Timestamp)))
+          unit: GB/s
+        HBM Read Traffic:
+          avg: AVG((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
+            != 0) else None))
+          min: MIN((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
+            != 0) else None))
+          max: MAX((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
+            != 0) else None))
+          unit: pct
+        Remote Read Traffic:
+          avg: AVG((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum)
+            if (TCC_EA_RDREQ_sum != 0) else None))
+          min: MIN((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum)
+            if (TCC_EA_RDREQ_sum != 0) else None))
+          max: MAX((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum)
+            if (TCC_EA_RDREQ_sum != 0) else None))
+          unit: pct
+        Uncached Read Traffic:
+          avg: AVG((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
+            != 0) else None))
+          min: MIN((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
+            != 0) else None))
+          max: MAX((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
+            != 0) else None))
+          unit: pct
+        Write and Atomic BW:
+          avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
+            * 32)) / $denom))
+          min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
+            * 32)) / $denom))
+          max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
+            * 32)) / $denom))
+          unit: (Bytes + $normUnit)
+        HBM Write and Atomic Traffic:
+          avg: AVG((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+            != 0) else None))
+          min: MIN((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+            != 0) else None))
+          max: MAX((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+            != 0) else None))
+          unit: pct
+        Remote Write and Atomic Traffic:
+          avg: AVG((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum)
+            if (TCC_EA_WRREQ_sum != 0) else None))
+          min: MIN((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum)
+            if (TCC_EA_WRREQ_sum != 0) else None))
+          max: MAX((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum)
+            if (TCC_EA_WRREQ_sum != 0) else None))
+          unit: pct
+        Atomic Traffic:
+          avg: AVG((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+            != 0) else None))
+          min: MIN((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+            != 0) else None))
+          max: MAX((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+            != 0) else None))
+          unit: pct
+        Uncached Write and Atomic Traffic:
+          avg: AVG((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+            != 0) else None))
+          min: MIN((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+            != 0) else None))
+          max: MAX((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+            != 0) else None))
+          unit: pct
+        Read Latency:
+          avg: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
+            != 0) else None))
+          min: MIN(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
+            != 0) else None))
+          max: MAX(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
+            != 0) else None))
+          unit: Cycles
+        Write and Atomic Latency:
+          avg: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+            != 0) else None))
+          min: MIN(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+            != 0) else None))
+          max: MAX(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
+            != 0) else None))
+          unit: Cycles
+        Atomic Latency:
+          avg: AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum
+            != 0) else None))
+          min: MIN(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum
+            != 0) else None))
+          max: MAX(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum
+            != 0) else None))
+          unit: Cycles
+  - metric_table:
+      id: 1703
+      title: L2 Cache Accesses
+      header:
+        metric: Metric
+        avg: Avg
+        min: Min
+        max: Max
+        unit: Unit
+      metric:
+        Bandwidth:
+          avg: AVG((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
+          min: MIN((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
+          max: MAX((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
+          unit: GB/s
+        Req:
+          avg: AVG((TCC_REQ_sum / $denom))
+          min: MIN((TCC_REQ_sum / $denom))
+          max: MAX((TCC_REQ_sum / $denom))
+          unit: (Req + $normUnit)
+        Read Req:
+          avg: AVG((TCC_READ_sum / $denom))
+          min: MIN((TCC_READ_sum / $denom))
+          max: MAX((TCC_READ_sum / $denom))
+          unit: (Req + $normUnit)
+        Write Req:
+          avg: AVG((TCC_WRITE_sum / $denom))
+          min: MIN((TCC_WRITE_sum / $denom))
+          max: MAX((TCC_WRITE_sum / $denom))
+          unit: (Req + $normUnit)
+        Atomic Req:
+          avg: AVG((TCC_ATOMIC_sum / $denom))
+          min: MIN((TCC_ATOMIC_sum / $denom))
+          max: MAX((TCC_ATOMIC_sum / $denom))
+          unit: (Req + $normUnit)
+        Streaming Req:
+          avg: AVG((TCC_STREAMING_REQ_sum / $denom))
+          min: MIN((TCC_STREAMING_REQ_sum / $denom))
+          max: MAX((TCC_STREAMING_REQ_sum / $denom))
+          unit: (Req + $normUnit)
+        Probe Req:
+          avg: AVG((TCC_PROBE_sum / $denom))
+          min: MIN((TCC_PROBE_sum / $denom))
+          max: MAX((TCC_PROBE_sum / $denom))
+          unit: (Req + $normUnit)
+        Cache Hit:
+          avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
+            + TCC_MISS_sum) != 0) else None))
+          min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
+            + TCC_MISS_sum) != 0) else None))
+          max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
+            + TCC_MISS_sum) != 0) else None))
+          unit: pct
+        Hits:
+          avg: AVG((TCC_HIT_sum / $denom))
+          min: MIN((TCC_HIT_sum / $denom))
+          max: MAX((TCC_HIT_sum / $denom))
+          unit: (Hits + $normUnit)
+        Misses:
+          avg: AVG((TCC_MISS_sum / $denom))
+          min: MIN((TCC_MISS_sum / $denom))
+          max: MAX((TCC_MISS_sum / $denom))
+          unit: (Misses + $normUnit)
+        Writeback:
+          avg: AVG((TCC_WRITEBACK_sum / $denom))
+          min: MIN((TCC_WRITEBACK_sum / $denom))
+          max: MAX((TCC_WRITEBACK_sum / $denom))
+          unit: (Cachelines + $normUnit)
+        Writeback (Internal):
+          avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom))
+          min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom))
+          max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom))
+          unit: (Cachelines + $normUnit)
+        Writeback (vL1D Req):
+          avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
+          min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
+          max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
+          unit: (Cachelines + $normUnit)
+        Evict (Internal):
+          avg: AVG((TCC_NORMAL_EVICT_sum / $denom))
+          min: MIN((TCC_NORMAL_EVICT_sum / $denom))
+          max: MAX((TCC_NORMAL_EVICT_sum / $denom))
+          unit: (Cachelines + $normUnit)
+        Evict (vL1D Req):
+          avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
+          min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
+          max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
+          unit: (Cachelines + $normUnit)
+        NC Req:
+          avg: AVG((TCC_NC_REQ_sum / $denom))
+          min: MIN((TCC_NC_REQ_sum / $denom))
+          max: MAX((TCC_NC_REQ_sum / $denom))
+          unit: (Req + $normUnit)
+        UC Req:
+          avg: AVG((TCC_UC_REQ_sum / $denom))
+          min: MIN((TCC_UC_REQ_sum / $denom))
+          max: MAX((TCC_UC_REQ_sum / $denom))
+          unit: (Req + $normUnit)
+        CC Req:
+          avg: AVG((TCC_CC_REQ_sum / $denom))
+          min: MIN((TCC_CC_REQ_sum / $denom))
+          max: MAX((TCC_CC_REQ_sum / $denom))
+          unit: (Req + $normUnit)
+        RW Req:
+          avg: AVG((TCC_RW_REQ_sum / $denom))
+          min: MIN((TCC_RW_REQ_sum / $denom))
+          max: MAX((TCC_RW_REQ_sum / $denom))
+          unit: (Req + $normUnit)
+  - metric_table:
+      id: 1704
+      title: L2 Cache Stalls
+      header:
+        metric: Metric
+        avg: Avg
+        min: Min
+        max: Max
+        unit: Unit
+      metric: {}
+  - metric_table:
+      id: 1705
+      title: L2 - Fabric Interface stalls
+      header:
+        metric: Metric
+        type: Type
+        transaction: Transaction
+        avg: Avg
+        min: Min
+        max: Max
+        unit: Unit
+      style:
+        type: simple_multi_bar
+      metric:
+        Write - Credit Starvation:
+          type: Credit Starvation
+          transaction: Write
+          avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum
+            != 0) else None))
+          min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum
+            != 0) else None))
+          max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum
+            != 0) else None))
+          unit: pct
+  - metric_table:
+      id: 1706
+      title: L2 - Fabric interface detailed metrics
+      header:
+        metric: Metric
+        avg: Avg
+        min: Min
+        max: Max
+        unit: Unit
+      metric:
+        Read (32B):
+          avg: AVG((TCC_EA_RDREQ_32B_sum / $denom))
+          min: MIN((TCC_EA_RDREQ_32B_sum / $denom))
+          max: MAX((TCC_EA_RDREQ_32B_sum / $denom))
+          unit: (Req + $normUnit)
+        Read (64B):
+          avg: AVG(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom))
+          min: MIN(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom))
+          max: MAX(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom))
+          unit: (Req + $normUnit)
+        Read (Uncached):
+          avg: AVG((TCC_EA_RD_UNCACHED_32B_sum / $denom))
+          min: MIN((TCC_EA_RD_UNCACHED_32B_sum / $denom))
+          max: MAX((TCC_EA_RD_UNCACHED_32B_sum / $denom))
+          unit: (Req + $normUnit)
+        HBM Read:
+          avg: AVG((TCC_EA_RDREQ_DRAM_sum / $denom))
+          min: MIN((TCC_EA_RDREQ_DRAM_sum / $denom))
+          max: MAX((TCC_EA_RDREQ_DRAM_sum / $denom))
+          unit: (Req + $normUnit)
+        Remote Read:
+          avg: AVG((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom))
+          min: MIN((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom))
+          max: MAX((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom))
+          unit: (Req + $normUnit)
+        Write and Atomic (32B):
+          avg: AVG(MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom), 0))
+          min: MIN(MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom), 0))
+          max: MAX(MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom), 0))
+          unit: (Req + $normUnit)
+        Write and Atomic (Uncached):
+          avg: AVG((TCC_EA_WR_UNCACHED_32B_sum / $denom))
+          min: MIN((TCC_EA_WR_UNCACHED_32B_sum / $denom))
+          max: MAX((TCC_EA_WR_UNCACHED_32B_sum / $denom))
+          unit: (Req + $normUnit)
+        Write and Atomic (64B):
+          avg: AVG((TCC_EA_WRREQ_64B_sum / $denom))
+          min: MIN((TCC_EA_WRREQ_64B_sum / $denom))
+          max: MAX((TCC_EA_WRREQ_64B_sum / $denom))
+          unit: (Req + $normUnit)
+        HBM Write and Atomic:
+          avg: AVG((TCC_EA_WRREQ_DRAM_sum / $denom))
+          min: MIN((TCC_EA_WRREQ_DRAM_sum / $denom))
+          max: MAX((TCC_EA_WRREQ_DRAM_sum / $denom))
+          unit: (Req + $normUnit)
+        Remote Write and Atomic:
+          avg: AVG((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom))
+          min: MIN((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom))
+          max: MAX((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom))
+          unit: (Req + $normUnit)
+        Atomic:
+          avg: AVG((TCC_EA_ATOMIC_sum / $denom))
+          min: MIN((TCC_EA_ATOMIC_sum / $denom))
+          max: MAX((TCC_EA_ATOMIC_sum / $denom))
+          unit: (Req + $normUnit)
   metrics_description:
     Utilization: The ratio of the number of cycles an L2 channel was active, summed
       over all L2 channels on the accelerator over the total L2 cycles.
@@ -87,12 +431,6 @@ Panel Config:
       by the cache line size. This value does not consider partial requests, so for
       example, if only a single value is requested in a cache line, the data movement
       will still be counted as a full cache line.
-    Read Bandwidth: Total number of bytes looked up in the L2 cache for read requests,
-      divided by total duration.
-    Write Bandwidth: Total number of bytes looked up in the L2 cache for write requests,
-      divided by total duration.
-    Atomic Bandwidth: Total number of bytes looked up in the L2 cache for atomic requests,
-      divided by total duration.
     Req: The total number of incoming requests to the L2 from all clients for all
       request types, per normalization unit.
     Read Req: The total number of read requests to the L2 from all clients.
@@ -149,12 +487,6 @@ Panel Config:
     Remote Read: The total number of L2 requests to Infinity Fabric to read 32B or
       64B of data from any source other than the accelerator's local HBM, per normalization
       unit.
-    Read Bandwidth - PCIe: Total number of bytes due to L2 read requests due to PCIe
-      traffic, divided by total duration.
-    "Read Bandwidth - Infinity Fabric\u2122": Total number of bytes due to L2 read
-      requests due to Infinity Fabric traffic, divided by total duration.
-    Read Bandwidth - HBM: Total number of bytes due to L2 read requests due to HBM
-      traffic, divided by total duration.
     Write and Atomic (32B): The total number of L2 requests to Infinity Fabric to
       write or atomically update 32B of data to any memory location, per normalization
       unit.
@@ -170,391 +502,9 @@ Panel Config:
     Remote Write and Atomic: The total number of L2 requests to Infinity Fabric to
       write or atomically update 32B or 64B of data in any memory location other than
       the accelerator's local HBM, per normalization unit.
-    Write Bandwidth - PCIe: Total number of bytes due to L2 write requests due to
-      PCIe traffic, divided by total duration.
-    "Write Bandwidth - Infinity Fabric\u2122": Total number of bytes due to L2 write
-      requests due to Infinity Fabric traffic, divided by total duration.
-    Write Bandwidth - HBM: Total number of bytes due to L2 write requests due to HBM
-      traffic, divided by total duration.
-    Atomic Bandwidth - PCIe: Total number of bytes due to L2 atomic requests due to
-      PCIe traffic, divided by total duration.
-    "Atomic Bandwidth - Infinity Fabric\u2122": Total number of bytes due to L2 atomic
-      requests due to Infinity Fabric traffic, divided by total duration.
-    Atomic Bandwidth - HBM: Total number of bytes due to L2 atomic requests due to
-      HBM traffic, divided by total duration.
     Atomic: The total number of L2 requests to Infinity Fabric to atomically update
       32B or 64B of data in any memory location, per normalization unit. See Request
       flow for more detail. Note that on current CDNA accelerators, such as the MI2XX,
       requests are only considered atomic by Infinity Fabric if they are targeted
       at non-write-cacheable memory, such as fine-grained memory allocations or uncached
       memory allocations on the MI2XX.
-    Read Stall: "The ratio of the total number of cycles the L2-Fabric interface was\
-      \ stalled on a read request to any destination (local HBM, remote PCIe\xAE connected\
-      \ accelerator or CPU, or remote Infinity Fabric connected accelerator or CPU)\
-      \ over the total active L2 cycles."
-    Write Stall: The ratio of the total number of cycles the L2-Fabric interface was
-      stalled on a write or atomic request to any destination (local HBM, remote accelerator
-      or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected
-      accelerator or CPU) over the total active L2 cycles.
-    Read - PCIe Stall: The number of cycles the L2-Fabric interface was stalled on
-      read requests to remote PCIe connected accelerators or CPUs as a percent of
-      the total active L2 cycles.
-    Read - Infinity Fabric Stall: The number of cycles the L2-Fabric interface was
-      stalled on read requests to remote Infinity Fabric connected accelerators or
-      CPUs as a percent of the total active L2 cycles.
-    Read - HBM Stall: The number of cycles the L2-Fabric interface was stalled on
-      read requests to the accelerator's local HBM as a percent of the total active
-      L2 cycles.
-    Write - PCIe Stall: The number of cycles the L2-Fabric interface was stalled on
-      write or atomic requests to remote PCIe connected accelerators or CPUs as a
-      percent of the total active L2 cycles.
-    Write - Infinity Fabric Stall: The number of cycles the L2-Fabric interface was
-      stalled on write or atomic requests to remote Infinity Fabric connected accelerators
-      or CPUs as a percent of the total active L2 cycles.
-    Write - HBM Stall: The number of cycles the L2-Fabric interface was stalled on
-      write or atomic requests to accelerator's local HBM as a percent of the total
-      active L2 cycles.
-  data source:
-  - metric_table:
-      id: 1701
-      title: L2 Speed-of-Light
-      header:
-        metric: Metric
-        value: Avg
-        unit: Unit
-      metric:
-        Utilization:
-          value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD)))
-          unit: pct
-        Peak Bandwidth:
-          value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
-            / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
-          unit: pct
-        Hit Rate:
-          value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-            + TCC_MISS_sum) != 0) else 0))
-          unit: pct
-        L2-Fabric Read BW:
-          value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum)
-            * 64)) / (End_Timestamp - Start_Timestamp)))
-          unit: GB/s
-        L2-Fabric Write and Atomic BW:
-          value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum)
-            * 32)) / (End_Timestamp - Start_Timestamp)))
-          unit: GB/s
-        HBM Bandwidth:
-          value: $hbmBandwidth
-          unit: GB/s
-  - metric_table:
-      id: 1702
-      title: L2-Fabric interface metrics
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      metric:
-        Read BW:
-          avg: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum)
-            * 64)) / (End_Timestamp - Start_Timestamp)))
-          min: MIN((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum)
-            * 64)) / (End_Timestamp - Start_Timestamp)))
-          max: MAX((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum)
-            * 64)) / (End_Timestamp - Start_Timestamp)))
-          unit: Gbps
-        HBM Read Traffic:
-          avg: AVG((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
-            != 0) else None))
-          min: MIN((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
-            != 0) else None))
-          max: MAX((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
-            != 0) else None))
-          unit: pct
-        Remote Read Traffic:
-          avg: AVG((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum)
-            if (TCC_EA_RDREQ_sum != 0) else None))
-          min: MIN((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum)
-            if (TCC_EA_RDREQ_sum != 0) else None))
-          max: MAX((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum)
-            if (TCC_EA_RDREQ_sum != 0) else None))
-          unit: pct
-        Uncached Read Traffic:
-          avg: AVG((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
-            != 0) else None))
-          min: MIN((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
-            != 0) else None))
-          max: MAX((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
-            != 0) else None))
-          unit: pct
-        Write and Atomic BW:
-          avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
-            * 32)) / $denom))
-          min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
-            * 32)) / $denom))
-          max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
-            * 32)) / $denom))
-          unit: (Bytes  + $normUnit)
-        HBM Write and Atomic Traffic:
-          avg: AVG((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
-            != 0) else None))
-          min: MIN((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
-            != 0) else None))
-          max: MAX((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
-            != 0) else None))
-          unit: pct
-        Remote Write and Atomic Traffic:
-          avg: AVG((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum)
-            if (TCC_EA_WRREQ_sum != 0) else None))
-          min: MIN((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum)
-            if (TCC_EA_WRREQ_sum != 0) else None))
-          max: MAX((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum)
-            if (TCC_EA_WRREQ_sum != 0) else None))
-          unit: pct
-        Atomic Traffic:
-          avg: AVG((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
-            != 0) else None))
-          min: MIN((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
-            != 0) else None))
-          max: MAX((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
-            != 0) else None))
-          unit: pct
-        Uncached Write and Atomic Traffic:
-          avg: AVG((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
-            != 0) else None))
-          min: MIN((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
-            != 0) else None))
-          max: MAX((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
-            != 0) else None))
-          unit: pct
-        Read Latency:
-          avg: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
-            != 0) else None))
-          min: MIN(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
-            != 0) else None))
-          max: MAX(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
-            != 0) else None))
-          unit: Cycles
-        Write and Atomic Latency:
-          avg: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
-            != 0) else None))
-          min: MIN(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
-            != 0) else None))
-          max: MAX(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
-            != 0) else None))
-          unit: Cycles
-        Atomic Latency:
-          avg: AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum
-            != 0) else None))
-          min: MIN(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum
-            != 0) else None))
-          max: MAX(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum
-            != 0) else None))
-          unit: Cycles
-  - metric_table:
-      id: 1703
-      title: L2 Cache Accesses
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      metric:
-        Bandwidth:
-          avg: AVG((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
-          min: MIN((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
-          max: MAX((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
-          unit: Gbps
-        Req:
-          avg: AVG((TCC_REQ_sum / $denom))
-          min: MIN((TCC_REQ_sum / $denom))
-          max: MAX((TCC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
-        Read Req:
-          avg: AVG((TCC_READ_sum / $denom))
-          min: MIN((TCC_READ_sum / $denom))
-          max: MAX((TCC_READ_sum / $denom))
-          unit: (Req  + $normUnit)
-        Write Req:
-          avg: AVG((TCC_WRITE_sum / $denom))
-          min: MIN((TCC_WRITE_sum / $denom))
-          max: MAX((TCC_WRITE_sum / $denom))
-          unit: (Req  + $normUnit)
-        Atomic Req:
-          avg: AVG((TCC_ATOMIC_sum / $denom))
-          min: MIN((TCC_ATOMIC_sum / $denom))
-          max: MAX((TCC_ATOMIC_sum / $denom))
-          unit: (Req  + $normUnit)
-        Streaming Req:
-          avg: AVG((TCC_STREAMING_REQ_sum / $denom))
-          min: MIN((TCC_STREAMING_REQ_sum / $denom))
-          max: MAX((TCC_STREAMING_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
-        Probe Req:
-          avg: AVG((TCC_PROBE_sum / $denom))
-          min: MIN((TCC_PROBE_sum / $denom))
-          max: MAX((TCC_PROBE_sum / $denom))
-          unit: (Req  + $normUnit)
-        Cache Hit:
-          avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-            + TCC_MISS_sum) != 0) else None))
-          min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-            + TCC_MISS_sum) != 0) else None))
-          max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-            + TCC_MISS_sum) != 0) else None))
-          unit: pct
-        Hits:
-          avg: AVG((TCC_HIT_sum / $denom))
-          min: MIN((TCC_HIT_sum / $denom))
-          max: MAX((TCC_HIT_sum / $denom))
-          unit: (Hits  + $normUnit)
-        Misses:
-          avg: AVG((TCC_MISS_sum / $denom))
-          min: MIN((TCC_MISS_sum / $denom))
-          max: MAX((TCC_MISS_sum / $denom))
-          unit: (Misses  + $normUnit)
-        Writeback:
-          avg: AVG((TCC_WRITEBACK_sum / $denom))
-          min: MIN((TCC_WRITEBACK_sum / $denom))
-          max: MAX((TCC_WRITEBACK_sum / $denom))
-          unit: (Cachelines  + $normUnit)
-        Writeback (Internal):
-          avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom))
-          min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom))
-          max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom))
-          unit: (Cachelines + $normUnit)
-        Writeback (vL1D Req):
-          avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
-          min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
-          max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
-          unit: (Cachelines + $normUnit)
-        Evict (Internal):
-          avg: AVG((TCC_NORMAL_EVICT_sum / $denom))
-          min: MIN((TCC_NORMAL_EVICT_sum / $denom))
-          max: MAX((TCC_NORMAL_EVICT_sum / $denom))
-          unit: (Cachelines + $normUnit)
-        Evict (vL1D Req):
-          avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
-          min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
-          max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
-          unit: (Cachelines + $normUnit)
-        NC Req:
-          avg: AVG((TCC_NC_REQ_sum / $denom))
-          min: MIN((TCC_NC_REQ_sum / $denom))
-          max: MAX((TCC_NC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
-        UC Req:
-          avg: AVG((TCC_UC_REQ_sum / $denom))
-          min: MIN((TCC_UC_REQ_sum / $denom))
-          max: MAX((TCC_UC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
-        CC Req:
-          avg: AVG((TCC_CC_REQ_sum / $denom))
-          min: MIN((TCC_CC_REQ_sum / $denom))
-          max: MAX((TCC_CC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
-        RW Req:
-          avg: AVG((TCC_RW_REQ_sum / $denom))
-          min: MIN((TCC_RW_REQ_sum / $denom))
-          max: MAX((TCC_RW_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
-  - metric_table:
-      id: 1704
-      title: L2 Cache Stalls
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      metric: {}
-  - metric_table:
-      id: 1705
-      title: L2 - Fabric Interface stalls
-      header:
-        metric: Metric
-        type: Type
-        transaction: Transaction
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      style:
-        type: simple_multi_bar
-      metric:
-        Write - Credit Starvation:
-          type: Credit Starvation
-          transaction: Write
-          avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum
-            != 0) else None))
-          min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum
-            != 0) else None))
-          max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum
-            != 0) else None))
-          unit: pct
-  - metric_table:
-      id: 1706
-      title: L2 - Fabric interface detailed metrics
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      metric:
-        Read (32B):
-          avg: AVG((TCC_EA_RDREQ_32B_sum / $denom))
-          min: MIN((TCC_EA_RDREQ_32B_sum / $denom))
-          max: MAX((TCC_EA_RDREQ_32B_sum / $denom))
-          unit: (Req  + $normUnit)
-        Read (64B):
-          avg: AVG(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom))
-          min: MIN(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom))
-          max: MAX(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom))
-          unit: (Req  + $normUnit)
-        Read (Uncached):
-          avg: AVG((TCC_EA_RD_UNCACHED_32B_sum / $denom))
-          min: MIN((TCC_EA_RD_UNCACHED_32B_sum / $denom))
-          max: MAX((TCC_EA_RD_UNCACHED_32B_sum / $denom))
-          unit: (Req  + $normUnit)
-        HBM Read:
-          avg: AVG((TCC_EA_RDREQ_DRAM_sum / $denom))
-          min: MIN((TCC_EA_RDREQ_DRAM_sum / $denom))
-          max: MAX((TCC_EA_RDREQ_DRAM_sum / $denom))
-          unit: (Req  + $normUnit)
-        Remote Read:
-          avg: AVG((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom))
-          min: MIN((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom))
-          max: MAX((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom))
-          unit: (Req  + $normUnit)
-        Write and Atomic (32B):
-          avg: AVG(MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom), 0))
-          min: MIN(MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom), 0))
-          max: MAX(MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom), 0))
-          unit: (Req  + $normUnit)
-        Write and Atomic (Uncached):
-          avg: AVG((TCC_EA_WR_UNCACHED_32B_sum / $denom))
-          min: MIN((TCC_EA_WR_UNCACHED_32B_sum / $denom))
-          max: MAX((TCC_EA_WR_UNCACHED_32B_sum / $denom))
-          unit: (Req  + $normUnit)
-        Write and Atomic (64B):
-          avg: AVG((TCC_EA_WRREQ_64B_sum / $denom))
-          min: MIN((TCC_EA_WRREQ_64B_sum / $denom))
-          max: MAX((TCC_EA_WRREQ_64B_sum / $denom))
-          unit: (Req  + $normUnit)
-        HBM Write and Atomic:
-          avg: AVG((TCC_EA_WRREQ_DRAM_sum / $denom))
-          min: MIN((TCC_EA_WRREQ_DRAM_sum / $denom))
-          max: MAX((TCC_EA_WRREQ_DRAM_sum / $denom))
-          unit: (Req  + $normUnit)
-        Remote Write and Atomic:
-          avg: AVG((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom))
-          min: MIN((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom))
-          max: MAX((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom))
-          unit: (Req  + $normUnit)
-        Atomic:
-          avg: AVG((TCC_EA_ATOMIC_sum / $denom))
-          min: MIN((TCC_EA_ATOMIC_sum / $denom))
-          max: MAX((TCC_EA_ATOMIC_sum / $denom))
-          unit: (Req  + $normUnit)
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1800_l2_cache_per_channel.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1800_l2_cache_per_channel.yaml
index f097a14b55..525d00ff58 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1800_l2_cache_per_channel.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1800_l2_cache_per_channel.yaml
@@ -2,10 +2,6 @@
 Panel Config:
   id: 1800
   title: L2 Cache (per Channel)
-  metrics_description:
-    L2 Cache Hit Rate: The percent of total number of requests to the L2 from all
-      clients that hit in the cache. As noted in the Speed-of-Light section, this
-      includes hit-on-miss requests.
   data source:
   - metric_table:
       id: 1801
@@ -321,3 +317,7 @@ Panel Config:
           ::_1: $total_l2_chan
       cli_style: simple_box
       tui_style: simple_box
+  metrics_description:
+    L2 Cache Hit Rate: The percent of total number of requests to the L2 from all
+      clients that hit in the cache. As noted in the Speed-of-Light section, this
+      includes hit-on-miss requests.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/2100_pc_sampling.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/2100_pc_sampling.yaml
index e94471d7dc..16e4d01e7e 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/2100_pc_sampling.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/2100_pc_sampling.yaml
@@ -2,10 +2,10 @@
 Panel Config:
   id: 2100
   title: PC Sampling
-  metrics_description: {}
   data source:
   - pc_sampling_table:
       id: 2101
       title: PC Sampling
       source: ps_file
       comparable: false
+  metrics_description: {}
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/config_delta/gfx950_diff.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/config_delta/gfx950_diff.yaml
new file mode 100644
index 0000000000..72d6adce5f
--- /dev/null
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/config_delta/gfx950_diff.yaml
@@ -0,0 +1,1022 @@
+# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated by tools/config_management/generate_config_deltas.py
+Addition:
+  - Panel Config:
+      id: 200
+      title: System Speed-of-Light
+    metric_tables:
+      - metric_table:
+          id: 201
+          title: System Speed-of-Light
+          metrics:
+            - MFMA FLOPs (F8):
+                value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
+                unit: GFLOP/s
+                peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
+            - MFMA FLOPs (F6F4):
+                value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp)))
+                unit: GFLOP/s
+                peak: ((($max_sclk * $cu_per_gpu) * 16384) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 16384) / 1000))
+  - Panel Config:
+      id: 400
+      title: Roofline
+    metric_tables:
+      - metric_table:
+          id: 401
+          title: Roofline Performance Rates
+          metrics:
+            - MFMA FLOPs (F8):
+                value: |
+                  AVG((((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
+                unit: GFLOP/s
+                peak: $MFMAF8Flops_empirical_peak
+            - MFMA FLOPs (F6F4):
+                value: |
+                  AVG((((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
+                unit: GFLOP/s
+                peak: $MFMA_FLOPs_F6F4_empirical_peak
+  - Panel Config:
+      id: 500
+      title: Command Processor (CPC/CPF)
+    metric_tables:
+      - metric_table:
+          id: 502
+          title: Command processor packet processor (CPC)
+          metrics:
+            - CPC CANE Stall Rate:
+                avg: AVG((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None)
+                min: MIN((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None)
+                max: MAX((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None)
+                unit: pct
+            - CPC SYNC FIFO Full Rate:
+                avg: |
+                  AVG((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY != 0) else None)
+                min: |
+                  MIN((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY != 0) else None)
+                max: |
+                  MAX((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY != 0) else None)
+                unit: pct
+            - CPC ADC Utilization:
+                avg: AVG((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None)
+                min: MIN((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None)
+                max: MAX((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None)
+                unit: pct
+  - Panel Config:
+      id: 600
+      title: Workgroup Manager (SPI)
+    metric_tables:
+      - metric_table:
+          id: 601
+          title: Workgroup manager utilizations
+          metrics:
+            - Scheduler-Pipe Wave Utilization:
+                avg: |
+                  AVG(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
+                min: |
+                  MIN(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
+                max: |
+                  MAX(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
+                unit: Pct
+            - Schedule-Pipe Wave Occupancy:
+                avg: |
+                  AVG(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY + SPI_CSQ_P3_OCCUPANCY)
+                min: |
+                  MIN(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY + SPI_CSQ_P3_OCCUPANCY)
+                max: |
+                  MAX(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY + SPI_CSQ_P3_OCCUPANCY)
+                unit: Wave
+      - metric_table:
+          id: 602
+          title: Workgroup Manager - Resource Allocation
+          metrics:
+            - Scheduler-Pipe FIFO Full Rate:
+                avg: |
+                  AVG((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + SPI_CS2_CRAWLER_STALL + SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
+                min: |
+                  MIN((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + SPI_CS2_CRAWLER_STALL + SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
+                max: |
+                  MAX((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + SPI_CS2_CRAWLER_STALL + SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
+                unit: Pct
+  - Panel Config:
+      id: 1000
+      title: Compute Units - Instruction Mix
+    metric_tables:
+      - metric_table:
+          id: 1003
+          title: VMEM Instruction Mix
+          metrics:
+            - Spill/Stack Coalesceable Instr:
+                avg: AVG((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom))
+                min: MIN((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom))
+                max: MAX((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom))
+                unit: (instr + $normUnit)
+      - metric_table:
+          id: 1004
+          title: MFMA Arithmetic Instruction Mix
+          metrics:
+            - MFMA-F8:
+                avg: AVG((SQ_INSTS_VALU_MFMA_F8 / $denom))
+                min: MIN((SQ_INSTS_VALU_MFMA_F8 / $denom))
+                max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom))
+                unit: (instr + $normUnit)
+            - MFMA-F6F4:
+                avg: AVG((SQ_INSTS_VALU_MFMA_F6F4 / $denom))
+                min: MIN((SQ_INSTS_VALU_MFMA_F6F4 / $denom))
+                max: MAX((SQ_INSTS_VALU_MFMA_F6F4 / $denom))
+                unit: (instr + $normUnit)
+  - Panel Config:
+      id: 1100
+      title: Compute Units - Compute Pipeline
+    metric_tables:
+      - metric_table:
+          id: 1101
+          title: Compute Speed-of-Light
+          metrics:
+            - MFMA FLOPs (F8):
+                value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
+                unit: GFLOP
+                peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
+            - MFMA FLOPs (F6F4):
+                value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp)))
+                unit: GFLOP
+                peak: ((($max_sclk * $cu_per_gpu) * 16384) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 16384) / 1000))
+      - metric_table:
+          id: 1102
+          title: Pipeline Statistics
+          metrics:
+            - VALU Co-Issue Efficiency:
+                avg: AVG((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2))
+                min: MIN((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2))
+                max: MAX((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2))
+                unit: pct
+      - metric_table:
+          id: 1103
+          title: Arithmetic Operations
+          metrics:
+            - F8 OPs:
+                avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
+                min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
+                max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
+                unit: (OPs + $normUnit)
+            - F6F4 OPs:
+                avg: AVG((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom)
+                min: MIN((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom)
+                max: MAX((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom)
+                unit: (OPs + $normUnit)
+  - Panel Config:
+      id: 1200
+      title: Local Data Share (LDS)
+    metric_tables:
+      - metric_table:
+          id: 1202
+          title: LDS Statistics
+          metrics:
+            - LDS LOAD:
+                avg: AVG((SQ_INSTS_LDS_LOAD / $denom))
+                min: MIN((SQ_INSTS_LDS_LOAD / $denom))
+                max: MAX((SQ_INSTS_LDS_LOAD / $denom))
+                unit: (instr + $normUnit)
+            - LDS LOAD Bandwidth:
+                avg: AVG(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                min: MIN(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                max: MAX(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - LDS Command FIFO Full Rate:
+                avg: AVG((SQ_LDS_CMD_FIFO_FULL / $denom))
+                min: MIN((SQ_LDS_CMD_FIFO_FULL / $denom))
+                max: MAX((SQ_LDS_CMD_FIFO_FULL / $denom))
+                unit: (Cycles + $normUnit)
+            - LDS ATOMIC Bandwidth:
+                avg: AVG(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                min: MIN(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                max: MAX(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - LDS ATOMIC:
+                avg: AVG((SQ_INSTS_LDS_ATOMIC / $denom))
+                min: MIN((SQ_INSTS_LDS_ATOMIC / $denom))
+                max: MAX((SQ_INSTS_LDS_ATOMIC / $denom))
+                unit: (instr + $normUnit)
+            - LDS STORE Bandwidth:
+                avg: AVG(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                min: MIN(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                max: MAX(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - LDS Data FIFO Full Rate:
+                avg: AVG((SQ_LDS_DATA_FIFO_FULL / $denom))
+                min: MIN((SQ_LDS_DATA_FIFO_FULL / $denom))
+                max: MAX((SQ_LDS_DATA_FIFO_FULL / $denom))
+                unit: (Cycles + $normUnit)
+            - LDS STORE:
+                avg: AVG((SQ_INSTS_LDS_STORE / $denom))
+                min: MIN((SQ_INSTS_LDS_STORE / $denom))
+                max: MAX((SQ_INSTS_LDS_STORE / $denom))
+                unit: (instr + $normUnit)
+  - Panel Config:
+      id: 1500
+      title: Address Processing Unit and Data Return Path (TA/TD)
+    metric_tables:
+      - metric_table:
+          id: 1504
+          title: Vector L1 data-return path or Texture Data (TD)
+          metrics:
+            - Write Ack Instructions:
+                avg: AVG((TD_WRITE_ACKT_WAVEFRONT_sum / $denom))
+                min: MIN((TD_WRITE_ACKT_WAVEFRONT_sum / $denom))
+                max: MAX((TD_WRITE_ACKT_WAVEFRONT_sum / $denom))
+                unit: (Instructions + $normUnit)
+      - metric_table:
+          id: 1502
+          title: Instruction counts
+          metrics:
+            - Spill/Stack Read Instructions for LDS:
+                avg: AVG((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom))
+                min: MIN((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom))
+                max: MAX((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom))
+                unit: (Instructions + $normUnit)
+            - Global/Generic Read Instructions for LDS:
+                avg: AVG((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom))
+                min: MIN((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom))
+                max: MAX((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom))
+                unit: (Instructions + $normUnit)
+  - Panel Config:
+      id: 1600
+      title: Vector L1 Data Cache
+    metric_tables:
+      - metric_table:
+          id: 1602
+          title: vL1D cache stall metrics
+          metrics:
+            - Stalled on Address:
+                expr: |
+                  (((100 * TCP_TCP_TA_ADDR_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None)
+            - Stalled on Data:
+                expr: |
+                  (((100 * TCP_TCP_TA_DATA_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None)
+            - Stalled on Read Return:
+                expr: |
+                  (((100 * TCP_TCR_RDRET_STALL_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None)
+            - Stalled on Latency FIFO:
+                expr: |
+                  (((100 * TCP_LFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None)
+            - Stalled on Request FIFO:
+                expr: |
+                  (((100 * TCP_RFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None)
+      - metric_table:
+          id: 1603
+          title: vL1D cache access metrics
+          metrics:
+            - Tag RAM 0 Req:
+                avg: AVG((TCP_TAGRAM0_REQ_sum / $denom))
+                min: MIN((TCP_TAGRAM0_REQ_sum / $denom))
+                max: MAX((TCP_TAGRAM0_REQ_sum / $denom))
+                unit: (Req + $normUnit)
+            - Tag RAM 3 Req:
+                avg: AVG((TCP_TAGRAM3_REQ_sum / $denom))
+                min: MIN((TCP_TAGRAM3_REQ_sum / $denom))
+                max: MAX((TCP_TAGRAM3_REQ_sum / $denom))
+                unit: (Req + $normUnit)
+            - Tag RAM 1 Req:
+                avg: AVG((TCP_TAGRAM1_REQ_sum / $denom))
+                min: MIN((TCP_TAGRAM1_REQ_sum / $denom))
+                max: MAX((TCP_TAGRAM1_REQ_sum / $denom))
+                unit: (Req + $normUnit)
+            - Tag RAM 2 Req:
+                avg: AVG((TCP_TAGRAM2_REQ_sum / $denom))
+                min: MIN((TCP_TAGRAM2_REQ_sum / $denom))
+                max: MAX((TCP_TAGRAM2_REQ_sum / $denom))
+                unit: (Req + $normUnit)
+      - metric_table:
+          id: 1605
+          title: L1 Unified Translation Cache (UTCL1)
+          metrics:
+            - Inflight Req:
+                avg: AVG((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom))
+                min: MIN((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom))
+                max: MAX((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom))
+                unit: (Req + $normUnit)
+            - Misses under Translation Miss:
+                avg: AVG((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom))
+                min: MIN((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom))
+                max: MAX((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom))
+                unit: (Req + $normUnit)
+      - metric_table:
+          id: 1606
+          title: L1D Addr Translation Stalls
+          metrics:
+            - Resident Page Full Stall:
+                avg: AVG((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom))
+                min: MIN((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom))
+                max: MAX((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom))
+                unit: (Cycles + $normUnit)
+            - Cache Miss Stall:
+                avg: AVG((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom))
+                min: MIN((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom))
+                max: MAX((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom))
+                unit: (Cycles + $normUnit)
+            - Thrashing Stall:
+                avg: AVG((TCP_UTCL1_THRASHING_STALL_sum / $denom))
+                min: MIN((TCP_UTCL1_THRASHING_STALL_sum / $denom))
+                max: MAX((TCP_UTCL1_THRASHING_STALL_sum / $denom))
+                unit: (Cycles + $normUnit)
+            - Serialization Stall:
+                avg: AVG((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom))
+                min: MIN((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom))
+                max: MAX((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom))
+                unit: (Cycles + $normUnit)
+            - Latency FIFO Stall:
+                avg: AVG((TCP_UTCL1_LFIFO_FULL_sum / $denom))
+                min: MIN((TCP_UTCL1_LFIFO_FULL_sum / $denom))
+                max: MAX((TCP_UTCL1_LFIFO_FULL_sum / $denom))
+                unit: (Cycles + $normUnit)
+            - Cache Full Stall:
+                avg: AVG((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom))
+                min: MIN((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom))
+                max: MAX((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom))
+                unit: (Cycles + $normUnit)
+            - UTCL2 Stall:
+                avg: AVG((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom))
+                min: MIN((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom))
+                max: MAX((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom))
+                unit: (Cycles + $normUnit)
+  - Panel Config:
+      id: 1700
+      title: L2 Cache
+    metric_tables:
+      - metric_table:
+          id: 1702
+          title: L2-Fabric interface metrics
+          metrics:
+            - Read Stall:
+                avg: |
+                  AVG((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+            - Write Stall:
+                avg: |
+                  AVG(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+      - metric_table:
+          id: 1703
+          title: L2 Cache Accesses
+          metrics:
+            - Read Bandwidth:
+                avg: AVG(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Bypass Req:
+                avg: AVG((TCC_BYPASS_REQ_sum / $denom))
+                min: MIN((TCC_BYPASS_REQ_sum / $denom))
+                max: MAX((TCC_BYPASS_REQ_sum / $denom))
+                unit: (Req + $normUnit)
+            - Write Bandwidth:
+                avg: AVG(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Atomic Bandwidth:
+                avg: AVG(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Input Buffer Req:
+                avg: AVG((TCC_IB_REQ_sum / $denom))
+                min: MIN((TCC_IB_REQ_sum / $denom))
+                max: MAX((TCC_IB_REQ_sum / $denom))
+                unit: (Req + $normUnit)
+      - metric_table:
+          id: 1704
+          title: L2 Cache Stalls
+          metrics:
+            - Stalled on Write Data FIFO:
+                avg: AVG(TCC_SRC_FIFO_FULL_sum / $denom)
+                min: MIN(TCC_SRC_FIFO_FULL_sum / $denom)
+                max: MAX(TCC_SRC_FIFO_FULL_sum / $denom)
+                unit: (Cycles + $normUnit)
+            - Input Buffer Stalled on L2:
+                avg: AVG(TCC_IB_STALL_sum / $denom)
+                min: MIN(TCC_IB_STALL_sum / $denom)
+                max: MAX(TCC_IB_STALL_sum / $denom)
+                unit: (Cycles + $normUnit)
+            - Stalled on Latency FIFO:
+                avg: AVG(TCC_LATENCY_FIFO_FULL_sum / $denom)
+                min: MIN(TCC_LATENCY_FIFO_FULL_sum / $denom)
+                max: MAX(TCC_LATENCY_FIFO_FULL_sum / $denom)
+                unit: (Cycles + $normUnit)
+      - metric_table:
+          id: 1705
+          title: L2 - Fabric Interface stalls
+          metrics:
+            - Read - HBM Stall:
+                type: HBM Stall
+                transaction: Read
+                avg: |
+                  AVG(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+            - Write - PCIe Stall:
+                type: PCIe Stall
+                transaction: Write
+                avg: |
+                  AVG(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+            - Write - Infinity Fabric Stall:
+                type: Infinity Fabric™ Stall
+                transaction: Write
+                avg: |
+                  AVG(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+            - Write - HBM Stall:
+                type: HBM Stall
+                transaction: Write
+                avg: |
+                  AVG(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+            - Read - PCIe Stall:
+                type: PCIe Stall
+                transaction: Read
+                avg: |
+                  AVG(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+            - Read - Infinity Fabric Stall:
+                type: Infinity Fabric™ Stall
+                transaction: Read
+                avg: |
+                  AVG(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+      - metric_table:
+          id: 1706
+          title: L2 - Fabric interface detailed metrics
+          metrics:
+            - Read Bandwidth - HBM:
+                avg: AVG(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Atomic Bandwidth - PCIe:
+                avg: AVG(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Atomic Bandwidth - HBM:
+                avg: AVG(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Read (128B):
+                avg: AVG((TCC_EA0_RDREQ_128B_sum / $denom))
+                min: MIN((TCC_EA0_RDREQ_128B_sum / $denom))
+                max: MAX((TCC_EA0_RDREQ_128B_sum / $denom))
+                unit: (Req + $normUnit)
+            - Write Bandwidth - Infinity Fabric™:
+                avg: AVG(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Read Bandwidth - Infinity Fabric™:
+                avg: AVG(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Read Bandwidth - PCIe:
+                avg: AVG(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Write Bandwidth - PCIe:
+                avg: AVG(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Write Bandwidth - HBM:
+                avg: AVG(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Atomic - HBM:
+                avg: AVG((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom))
+                min: MIN((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom))
+                max: MAX((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom))
+                unit: (Req + $normUnit)
+            - Atomic Bandwidth - Infinity Fabric™:
+                avg: AVG(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+
+Deletion:
+  []
+
+Modification:
+  - Panel Config:
+      id: 200
+      title: System Speed-of-Light
+    metric_tables:
+      - metric_table:
+          id: 201
+          title: System Speed-of-Light
+          metrics:
+            - L2-Fabric Write BW:
+                pop: |
+                  ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth)
+                value: |
+                  AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))
+            - VALU Active Threads:
+                pop: |
+                  (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size) if (SQ_ACTIVE_INST_VALU != 0) else None))
+                peak: $wave_size
+            - L2-Fabric Read Latency:
+                value: |
+                  AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
+            - L2 Cache BW:
+                pop: |
+                  ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
+                peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))
+            - MFMA FLOPs (F64):
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000))
+                peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000)
+            - vL1D Cache BW:
+                pop: |
+                  ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu))
+                peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu)
+                value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
+            - L2-Fabric Write Latency:
+                value: |
+                  AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
+            - MFMA IOPs (Int8):
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
+                peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
+            - MFMA FLOPs (F16):
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
+                peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
+            - MFMA FLOPs (BF16):
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
+                peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
+            - L2-Fabric Read BW:
+                pop: |
+                  ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))) / $hbmBandwidth)
+                value: |
+                  AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp))
+  - Panel Config:
+      id: 300
+      title: Memory Chart
+    metric_tables:
+      - metric_table:
+          id: 301
+          title: Memory Chart
+          metrics:
+            - Fabric_L2 Wr:
+                value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0)
+            - Wavefronts:
+                value: ROUND(AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE), 0)
+            - HBM Wr:
+                value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0)
+            - Fabric_L2 Atomic:
+                value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0)
+            - Fabric Atomic Lat:
+                value: |
+                  ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum != 0) else 0)), 0)
+            - HBM Rd:
+                value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0)
+            - Workgroups:
+                value: |
+                  ROUND(AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS), 0)
+            - Fabric_L2 Rd:
+                value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0)
+            - Fabric Rd Lat:
+                value: |
+                  ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else 0)), 0)
+            - Fabric Wr Lat:
+                value: |
+                  ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else 0)), 0)
+  - Panel Config:
+      id: 400
+      title: Roofline
+    metric_tables:
+      - metric_table:
+          id: 401
+          title: Roofline Performance Rates
+          metrics:
+            - HBM Bandwidth:
+                value: |
+                  AVG((( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
+      - metric_table:
+          id: 402
+          title: Roofline Plot Points
+          metrics:
+            - AI L2:
+                value: |
+                  ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64 ) )
+            - Performance (GFLOPs):
+                value: |
+                  ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / (SUM(End_Timestamp - Start_Timestamp) / 1e9) ) / 1e9
+            - AI L1:
+                value: |
+                  ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64) )
+            - AI HBM:
+                value: |
+                  ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) )
+  - Panel Config:
+      id: 600
+      title: Workgroup Manager (SPI)
+    metric_tables:
+      - metric_table:
+          id: 601
+          title: Workgroup manager utilizations
+          metrics:
+            - VGPR Writes:
+                min: |
+                  MIN((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+                avg: |
+                  AVG((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+                max: |
+                  MAX((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+            - Dispatched Workgroups:
+                min: |
+                  MIN(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
+                avg: |
+                  AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
+                max: |
+                  MAX(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
+            - SGPR Writes:
+                min: |
+                  MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+                avg: |
+                  AVG((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+                max: |
+                  MAX((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+            - Dispatched Wavefronts:
+                min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
+                avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
+                max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
+            - Scheduler-Pipe Utilization:
+                min: |
+                  MIN(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
+                avg: |
+                  AVG(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
+                max: |
+                  MAX(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
+  - Panel Config:
+      id: 700
+      title: Wavefront
+    metric_tables:
+      - metric_table:
+          id: 701
+          title: Wavefront Launch Stats
+          metrics:
+            - Total Wavefronts:
+                min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
+                avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
+                max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
+  - Panel Config:
+      id: 1000
+      title: Compute Units - Instruction Mix
+    metric_tables:
+      - metric_table:
+          id: 1001
+          title: Overall Instruction Mix
+          metrics:
+            - VMEM:
+                min: MIN(((SQ_INSTS_VMEM) / $denom))
+                avg: AVG(((SQ_INSTS_VMEM) / $denom))
+                max: MAX(((SQ_INSTS_VMEM) / $denom))
+  - Panel Config:
+      id: 1100
+      title: Compute Units - Compute Pipeline
+    metric_tables:
+      - metric_table:
+          id: 1101
+          title: Compute Speed-of-Light
+          metrics:
+            - MFMA IOPs (INT8):
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
+                peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
+            - MFMA FLOPs (BF16):
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
+                peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
+            - MFMA FLOPs (F64):
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000))
+                peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000)
+            - MFMA FLOPs (F16):
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
+                peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
+      - metric_table:
+          id: 1103
+          title: Arithmetic Operations
+          metrics:
+            - FLOPs (Total):
+                min: |
+                  MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom))
+                avg: |
+                  AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom))
+                max: |
+                  MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom))
+  - Panel Config:
+      id: 1500
+      title: Address Processing Unit and Data Return Path (TA/TD)
+    metric_tables:
+      - metric_table:
+          id: 1502
+          title: Instruction counts
+          metrics:
+            - Spill/Stack Atomic Instructions:
+                max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
+  - Panel Config:
+      id: 1600
+      title: Vector L1 Data Cache
+    metric_tables:
+      - metric_table:
+          id: 1601
+          title: vL1D Speed-of-Light
+          metrics:
+            - Bandwidth Utilization:
+                value: |
+                  ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu))
+      - metric_table:
+          id: 1603
+          title: vL1D cache access metrics
+          metrics:
+            - L1-L2 Write Latency:
+                min: MIN((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
+                avg: AVG((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
+                unit: (Cycles + $normUnit)
+                max: MAX((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
+            - L1 Access Latency:
+                min: MIN((TCP_TCP_LATENCY_sum / $denom))
+                avg: AVG((TCP_TCP_LATENCY_sum / $denom))
+                unit: (Cycles + $normUnit)
+                max: MAX((TCP_TCP_LATENCY_sum / $denom))
+            - L1-L2 Read Latency:
+                min: MIN((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
+                avg: AVG((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
+                unit: (Cycles + $normUnit)
+                max: MAX((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
+            - Cache BW:
+                min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
+                avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
+                max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
+            - L1-L2 BW:
+                min: |
+                  MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp)))
+                avg: |
+                  AVG(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp)))
+                max: |
+                  MAX(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp)))
+  - Panel Config:
+      id: 1700
+      title: L2 Cache
+    metric_tables:
+      - metric_table:
+          id: 1701
+          title: L2 Speed-of-Light
+          metrics:
+            - L2-Fabric Write and Atomic BW:
+                value: |
+                  AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))
+            - L2-Fabric Read BW:
+                value: |
+                  AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
+      - metric_table:
+          id: 1702
+          title: L2-Fabric interface metrics
+          metrics:
+            - Write and Atomic Latency:
+                min: |
+                  MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
+                avg: |
+                  AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
+                max: |
+                  MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
+            - Read BW:
+                min: |
+                  MIN((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
+                avg: |
+                  AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
+                max: |
+                  MAX((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
+            - Remote Read Traffic:
+                min: |
+                  MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
+                avg: |
+                  AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
+                max: |
+                  MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
+            - HBM Read Traffic:
+                min: |
+                  MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
+                avg: |
+                  AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
+                max: |
+                  MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
+            - Uncached Read Traffic:
+                min: |
+                  MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
+                avg: |
+                  AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
+                max: |
+                  MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
+            - Uncached Write and Atomic Traffic:
+                min: |
+                  MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
+                avg: |
+                  AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
+                max: |
+                  MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
+            - Atomic Traffic:
+                min: |
+                  MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
+                avg: |
+                  AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
+                max: |
+                  MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
+            - Atomic Latency:
+                min: |
+                  MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum != 0) else None))
+                avg: |
+                  AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum != 0) else None))
+                max: |
+                  MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum != 0) else None))
+            - Remote Write and Atomic Traffic:
+                min: |
+                  MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
+                avg: |
+                  AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
+                max: |
+                  MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
+            - HBM Write and Atomic Traffic:
+                min: |
+                  MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
+                avg: |
+                  AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
+                max: |
+                  MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
+            - Read Latency:
+                min: |
+                  MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
+                avg: |
+                  AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
+                max: |
+                  MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
+            - Write and Atomic BW:
+                min: |
+                  MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))
+                avg: |
+                  AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))
+                unit: Gbps
+                max: |
+                  MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))
+      - metric_table:
+          id: 1706
+          title: L2 - Fabric interface detailed metrics
+          metrics:
+            - HBM Write and Atomic:
+                min: MIN((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
+                avg: AVG((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
+                max: MAX((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
+            - Atomic:
+                min: MIN((TCC_EA0_ATOMIC_sum / $denom))
+                avg: AVG((TCC_EA0_ATOMIC_sum / $denom))
+                max: MAX((TCC_EA0_ATOMIC_sum / $denom))
+            - Write and Atomic (32B):
+                min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
+                avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
+                max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
+            - Read (32B):
+                min: MIN((TCC_EA0_RDREQ_32B_sum / $denom))
+                avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom))
+                max: MAX((TCC_EA0_RDREQ_32B_sum / $denom))
+            - Read (Uncached):
+                min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
+                avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
+                max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
+            - Remote Read:
+                min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
+                avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
+                max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
+            - Remote Write and Atomic:
+                min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
+                avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
+                max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
+            - Write and Atomic (Uncached):
+                min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
+                avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
+                max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
+            - Read (64B):
+                min: MIN((TCC_EA0_RDREQ_64B_sum / $denom))
+                avg: AVG((TCC_EA0_RDREQ_64B_sum / $denom))
+                max: MAX((TCC_EA0_RDREQ_64B_sum / $denom))
+            - HBM Read:
+                min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom))
+                avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom))
+                max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom))
+            - Write and Atomic (64B):
+                min: MIN((TCC_EA0_WRREQ_64B_sum / $denom))
+                avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom))
+                max: MAX((TCC_EA0_WRREQ_64B_sum / $denom))
+  - Panel Config:
+      id: 1800
+      title: L2 Cache (per Channel)
+    metric_tables:
+      - metric_table:
+          id: 1801
+          title: Aggregate Stats (All channels)
+          metrics:
+            - L2 Cache Hit Rate:
+                min: |
+                  MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
+                avg: |
+                  AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
+                max: |
+                  MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
+                std dev: |
+                  STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
+      - metric_table:
+          id: 1805
+          title: L2-Fabric Requests (per normUnit)
+          metrics:
+            - ::_1:
+                write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom))
+                read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom))
+                atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom))
+      - metric_table:
+          id: 1806
+          title: L2-Fabric Read Latency (Cycles)
+          metrics:
+            - ::_1:
+                expr: |
+                  ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1] != 0) else None)
+      - metric_table:
+          id: 1807
+          title: L2-Fabric Write and Atomic Latency (Cycles)
+          metrics:
+            - ::_1:
+                expr: |
+                  ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1] != 0) else None)
+      - metric_table:
+          id: 1808
+          title: L2-Fabric Atomic Latency (Cycles)
+          metrics:
+            - ::_1:
+                expr: |
+                  ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if (TCC_EA0_ATOMIC[::_1] != 0) else 0)
+      - metric_table:
+          id: 1809
+          title: L2-Fabric Read Stall (Cycles per normUnit)
+          metrics:
+            - ::_1:
+                ea read stall - hbm: AVG((TO_INT(TCC_EA0_RDREQ_DRAM_CREDIT_STALL[::_1]) / $denom))
+                ea read stall - if: AVG((TO_INT(TCC_EA0_RDREQ_GMI_CREDIT_STALL[::_1]) / $denom))
+                ea read stall - pcie: AVG((TO_INT(TCC_EA0_RDREQ_IO_CREDIT_STALL[::_1]) / $denom))
+      - metric_table:
+          id: 1810
+          title: L2-Fabric Write and Atomic Stall (Cycles per normUnit)
+          metrics:
+            - ::_1:
+                ea write stall - pcie: AVG((TO_INT(TCC_EA0_WRREQ_IO_CREDIT_STALL[::_1]) / $denom))
+                ea write stall - if: AVG((TO_INT(TCC_EA0_WRREQ_GMI_CREDIT_STALL[::_1]) / $denom))
+                ea write stall - hbm: AVG((TO_INT(TCC_EA0_WRREQ_DRAM_CREDIT_STALL[::_1]) / $denom))
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0000_top_stats.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0000_top_stats.yaml
index 55c6f6bb24..5ce5aeeb28 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0000_top_stats.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0000_top_stats.yaml
@@ -2,7 +2,6 @@
 Panel Config:
   id: 0
   title: Top Stats
-  metrics_description: {}
   data source:
   - raw_csv_table:
       id: 1
@@ -12,3 +11,4 @@ Panel Config:
       id: 2
       title: Dispatch List
       source: pmc_dispatch_info.csv
+  metrics_description: {}
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0100_system_info.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0100_system_info.yaml
index 23d024fde3..8b48c2253b 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0100_system_info.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0100_system_info.yaml
@@ -2,10 +2,10 @@
 Panel Config:
   id: 100
   title: System Info
-  metrics_description: {}
   data source:
   - raw_csv_table:
       id: 101
       title: System Info
       source: sysinfo.csv
       columnwise: true
+  metrics_description: {}
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0200_system_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0200_system_speed_of_light.yaml
index 722866f6e0..8aa72cb25d 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0200_system_speed_of_light.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0200_system_speed_of_light.yaml
@@ -2,124 +2,6 @@
 Panel Config:
   id: 200
   title: System Speed-of-Light
-  metrics_description:
-    VALU FLOPs: 'The total floating-point operations executed per second on the VALU.
-      This is also presented as a percent of the peak theoretical FLOPs achievable
-      on the specific accelerator. Note: this does not include any floating-point
-      operations from MFMA instructions.'
-    VALU IOPs: 'The total integer operations executed per second on the VALU. This
-      is also presented as a percent of the peak theoretical IOPs achievable on the
-      specific accelerator. Note: this does not include any integer operations from
-      MFMA instructions.'
-    MFMA FLOPs (F8): The total number of 8-bit brain floating point MFMA operations
-      executed per second. This does not include any 16-bit brain floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F8 MFMA operations achievable on the specific accelerator. It is supported on
-      AMD Instinct MI300 series and later only.
-    MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations
-      executed per second. Note: this does not include any 16-bit brain floating point
-      operations from VALU instructions. This is also presented as a percent of the
-      peak theoretical BF16 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed
-      per second. Note: this does not include any 16-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F16 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed
-      per second. Note: this does not include any 32-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F32 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed
-      per second. Note: this does not include any 64-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F64 MFMA operations achievable on the specific accelerator.'
-    MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed
-      per second. Note: this does not include any 8-bit integer operations from VALU
-      instructions. This is also presented as a percent of the peak theoretical INT8
-      MFMA operations achievable on the specific accelerator.'
-    Active CUs: Total number of active compute units (CUs) on the accelerator during
-      the kernel execution.
-    SALU Utilization: Indicates what percent of the kernel's duration the SALU was
-      busy executing instructions. Computed as the ratio of the total number of cycles
-      spent by the scheduler issuing SALU or SMEM instructions over the total CU cycles.
-    VALU Utilization: Indicates what percent of the kernel's duration the VALU was
-      busy executing instructions. Does not include VMEM operations. Computed as the
-      ratio of the total number of cycles spent by the scheduler issuing VALU instructions
-      over the total CU cycles.
-    MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit
-      was busy executing instructions. Computed as the ratio of the total number of
-      cycles the MFMA was busy over the total CU cycles.
-    VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit
-      was busy executing instructions, including both global/generic and spill/scratch
-      operations (see the VMEM instruction count metrics) for more detail). Does not
-      include VALU operations. Computed as the ratio of the total number of cycles
-      spent by the scheduler issuing VMEM instructions over the total CU cycles.
-    Branch Utilization: Indicates what percent of the kernel's duration the branch
-      unit was busy executing instructions. Computed as the ratio of the total number
-      of cycles spent by the scheduler issuing branch instructions over the total
-      CU cycles
-    VALU Active Threads: Indicates the average level of divergence within a wavefront
-      over the lifetime of the kernel. The number of work-items that were active in
-      a wavefront during execution of each VALU instruction, time-averaged over all
-      VALU instructions run on all wavefronts in the kernel.
-    IPC: The ratio of the total number of instructions executed on the CU over the
-      total active CU cycles. This is also presented as a percent of the peak theoretical
-      bandwidth achievable on the specific accelerator.
-    Wavefront Occupancy: 'The time-averaged number of wavefronts resident on the accelerator
-      over the lifetime of the kernel. Note: this metric may be inaccurate for short-running
-      kernels (less than 1ms). This is also presented as a percent of the peak theoretical
-      occupancy achievable on the specific accelerator.'
-    Theoretical LDS Bandwidth: Indicates the maximum amount of bytes that could have
-      been loaded from, stored to, or atomically updated in the LDS per unit time
-      (see LDS Bandwidth example for more detail). This is also presented as a percent
-      of the peak theoretical F64 MFMA operations achievable on the specific accelerator.
-    LDS Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS
-      scheduler due to bank conflicts (as determined by the conflict resolution hardware)
-      to the base number of cycles that would be spent in the LDS scheduler in a completely
-      uncontended case. This is also presented in normalized form (i.e., the Bank
-      Conflict Rate).
-    vL1D Cache Hit Rate: The ratio of the number of vL1D cache line requests that
-      hit in vL1D cache over the total number of cache line requests to the vL1D cache
-      RAM.
-    vL1D Cache BW: The number of bytes looked up in the vL1D cache as a result of
-      VMEM instructions per unit time. The number of bytes is calculated as the number
-      of cache lines requested multiplied by the cache line size. This value does
-      not consider partial requests, so e.g., if only a single value is requested
-      in a cache line, the data movement will still be counted as a full cache line.
-      This is also presented as a percent of the peak theoretical bandwidth achievable
-      on the specific accelerator.
-    L2 Cache Hit Rate: The ratio of the number of L2 cache line requests that hit
-      in the L2 cache over the total number of incoming cache line requests to the
-      L2 cache.
-    L2 Cache BW: The number of bytes looked up in the L2 cache per unit time. The
-      number of bytes is calculated as the number of cache lines requested multiplied
-      by the cache line size. This value does not consider partial requests, so e.g.,
-      if only a single value is requested in a cache line, the data movement will
-      still be counted as a full cache line. This is also presented as a percent of
-      the peak theoretical bandwidth achievable on the specific accelerator.
-    L2-Fabric Read BW: "The number of bytes read by the L2 over the Infinity Fabric\u2122\
-      \ interface per unit time. This is also presented as a percent of the peak theoretical\
-      \ bandwidth achievable on the specific accelerator."
-    L2-Fabric Write BW: The number of bytes sent by the L2 over the Infinity Fabric
-      interface by write and atomic operations per unit time. This is also presented
-      as a percent of the peak theoretical bandwidth achievable on the specific accelerator.
-    L2-Fabric Read Latency: The time-averaged number of cycles read requests spent
-      in Infinity Fabric before data was returned to the L2.
-    L2-Fabric Write Latency: The time-averaged number of cycles write requests spent
-      in Infinity Fabric before a completion acknowledgement was returned to the L2.
-    sL1D Cache Hit Rate: The percent of sL1D requests that hit on a previously loaded
-      line the cache. Calculated as the ratio of the number of sL1D requests that
-      hit over the number of all sL1D requests.
-    sL1D Cache BW: The number of bytes looked up in the sL1D cache per unit time.
-      This is also presented as a percent of the peak theoretical bandwidth achievable
-      on the specific accelerator.
-    L1I Hit Rate: The number of bytes looked up in the L1I cache per unit time. This
-      is also presented as a percent of the peak theoretical bandwidth achievable
-      on the specific accelerator.
-    L1I BW: The percent of L1I requests that hit on a previously loaded line the cache.
-      Calculated as the ratio of the number of L1I requests that hit over the number
-      of all L1I requests.
-    L1I Fetch Latency: The average number of cycles spent to fetch instructions to
-      a CU.
   data source:
   - metric_table:
       id: 201
@@ -344,3 +226,130 @@ Panel Config:
           peak: None
           pop: None
           coll_level: SQ_IFETCH_LEVEL
+  metrics_description:
+    VALU FLOPs: |-
+      The total floating-point operations executed per second on the VALU.
+      This is also presented as a percent of the peak theoretical FLOPs achievable
+      on the specific accelerator. Note: this does not include any floating-point
+      operations from MFMA instructions.
+    VALU IOPs: |-
+      The total integer operations executed per second on the VALU. This is
+      also presented as a percent of the peak theoretical IOPs achievable on the
+      specific accelerator. Note: this does not include any integer operations from
+      MFMA instructions.
+    MFMA FLOPs (F8): The total number of 8-bit brain floating point MFMA operations
+      executed per second. This does not include any 8-bit brain floating point operations
+      from VALU instructions. This is also presented as a percent of the peak theoretical
+      F8 MFMA operations achievable on the specific accelerator. It is supported on
+      AMD Instinct MI300 series and later only.
+    MFMA FLOPs (BF16): |-
+      The total number of 16-bit brain floating point MFMA operations executed
+      per second. Note: this does not include any 16-bit brain floating point operations
+      from VALU instructions. This is also presented as a percent of the peak theoretical
+      BF16 MFMA operations achievable on the specific accelerator.
+    MFMA FLOPs (F16): |-
+      The total number of 16-bit floating point MFMA operations executed per
+      second. Note: this does not include any 16-bit floating point operations from
+      VALU instructions. This is also presented as a percent of the peak theoretical
+      F16 MFMA operations achievable on the specific accelerator.
+    MFMA FLOPs (F32): |-
+      The total number of 32-bit floating point MFMA operations executed per
+      second. Note: this does not include any 32-bit floating point operations from
+      VALU instructions. This is also presented as a percent of the peak theoretical
+      F32 MFMA operations achievable on the specific accelerator.
+    MFMA FLOPs (F64): |-
+      The total number of 64-bit floating point MFMA operations executed per
+      second. Note: this does not include any 64-bit floating point operations from
+      VALU instructions. This is also presented as a percent of the peak theoretical
+      F64 MFMA operations achievable on the specific accelerator.
+    MFMA IOPs (Int8): |-
+      The total number of 8-bit integer MFMA operations executed per second.
+      Note: this does not include any 8-bit integer operations from VALU instructions.
+      This is also presented as a percent of the peak theoretical INT8 MFMA operations
+      achievable on the specific accelerator.
+    Active CUs: Total number of active compute units (CUs) on the accelerator during
+      the kernel execution.
+    SALU Utilization: Indicates what percent of the kernel's duration the SALU was
+      busy executing instructions. Computed as the ratio of the total number of cycles
+      spent by the scheduler issuing SALU or SMEM instructions over the total CU cycles.
+    VALU Utilization: Indicates what percent of the kernel's duration the VALU was
+      busy executing instructions. Does not include VMEM operations. Computed as the
+      ratio of the total number of cycles spent by the scheduler issuing VALU instructions
+      over the total CU cycles.
+    MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit
+      was busy executing instructions. Computed as the ratio of the total number of
+      cycles the MFMA was busy over the total CU cycles.
+    VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit
+      was busy executing instructions, including both global/generic and spill/scratch
+      operations (see the VMEM instruction count metrics for more detail). Does not
+      include VALU operations. Computed as the ratio of the total number of cycles
+      spent by the scheduler issuing VMEM instructions over the total CU cycles.
+    Branch Utilization: Indicates what percent of the kernel's duration the branch
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles spent by the scheduler issuing branch instructions over the total
+      CU cycles.
+    VALU Active Threads: Indicates the average level of divergence within a wavefront
+      over the lifetime of the kernel. The number of work-items that were active in
+      a wavefront during execution of each VALU instruction, time-averaged over all
+      VALU instructions run on all wavefronts in the kernel.
+    IPC: The ratio of the total number of instructions executed on the CU over the
+      total active CU cycles. This is also presented as a percent of the peak theoretical
+      IPC achievable on the specific accelerator.
+    Wavefront Occupancy: |-
+      The time-averaged number of wavefronts resident on the accelerator over
+      the lifetime of the kernel. Note: this metric may be inaccurate for short-running
+      kernels (less than 1ms). This is also presented as a percent of the peak theoretical
+      occupancy achievable on the specific accelerator.
+    Theoretical LDS Bandwidth: Indicates the maximum amount of bytes that could have
+      been loaded from, stored to, or atomically updated in the LDS per unit time
+      (see LDS Bandwidth example for more detail). This is also presented as a percent
+      of the peak theoretical LDS bandwidth achievable on the specific accelerator.
+    LDS Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS
+      scheduler due to bank conflicts (as determined by the conflict resolution hardware)
+      to the base number of cycles that would be spent in the LDS scheduler in a completely
+      uncontended case. This is also presented in normalized form (i.e., the Bank
+      Conflict Rate).
+    vL1D Cache Hit Rate: The ratio of the number of vL1D cache line requests that
+      hit in vL1D cache over the total number of cache line requests to the vL1D cache
+      RAM.
+    vL1D Cache BW: The number of bytes looked up in the vL1D cache as a result of
+      VMEM instructions per unit time. The number of bytes is calculated as the number
+      of cache lines requested multiplied by the cache line size. This value does
+      not consider partial requests, so e.g., if only a single value is requested
+      in a cache line, the data movement will still be counted as a full cache line.
+      This is also presented as a percent of the peak theoretical bandwidth achievable
+      on the specific accelerator.
+    L2 Cache Hit Rate: The ratio of the number of L2 cache line requests that hit
+      in the L2 cache over the total number of incoming cache line requests to the
+      L2 cache.
+    L2 Cache BW: The number of bytes looked up in the L2 cache per unit time. The
+      number of bytes is calculated as the number of cache lines requested multiplied
+      by the cache line size. This value does not consider partial requests, so e.g.,
+      if only a single value is requested in a cache line, the data movement will
+      still be counted as a full cache line. This is also presented as a percent of
+      the peak theoretical bandwidth achievable on the specific accelerator.
+    L2-Fabric Read BW: |-
+      The number of bytes read by the L2 over the Infinity Fabric™ interface
+      per unit time. This is also presented as a percent of the peak theoretical
+      bandwidth achievable on the specific accelerator.
+    L2-Fabric Write BW: The number of bytes sent by the L2 over the Infinity Fabric
+      interface by write and atomic operations per unit time. This is also presented
+      as a percent of the peak theoretical bandwidth achievable on the specific accelerator.
+    L2-Fabric Read Latency: The time-averaged number of cycles read requests spent
+      in Infinity Fabric before data was returned to the L2.
+    L2-Fabric Write Latency: The time-averaged number of cycles write requests spent
+      in Infinity Fabric before a completion acknowledgement was returned to the L2.
+    sL1D Cache Hit Rate: The percent of sL1D requests that hit on a previously loaded
+      line in the cache. Calculated as the ratio of the number of sL1D requests that
+      hit over the number of all sL1D requests.
+    sL1D Cache BW: The number of bytes looked up in the sL1D cache per unit time.
+      This is also presented as a percent of the peak theoretical bandwidth achievable
+      on the specific accelerator.
+    L1I Hit Rate: The percent of L1I requests that hit on a previously loaded line in
+      the cache. Calculated as the ratio of the number of L1I requests that hit over
+      the number of all L1I requests.
+    L1I BW: The number of bytes looked up in the L1I cache per unit time. This is
+      also presented as a percent of the peak theoretical bandwidth achievable on
+      the specific accelerator.
+    L1I Fetch Latency: The average number of cycles spent to fetch instructions to
+      a CU.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0300_memory_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0300_memory_chart.yaml
index 03b5606ad7..b13053c1f7 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0300_memory_chart.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0300_memory_chart.yaml
@@ -2,122 +2,6 @@
 Panel Config:
   id: 300
   title: Memory Chart
-  metrics_description:
-    Wavefront Occupancy: Wavefronts per active CU.
-    Wave Life: Average number of cycles executing a wave.
-    SALU: Total Number of SALU (Scalar ALU) instructions issued per normalization
-      unit.
-    SMEM: Total number of SMEM (Scalar Memory Read) instructions issued normalization
-      unit.
-    VALU: The number of VALU (Vector ALU) instructions issued per normalization unit.
-    MFMA: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued per
-      normalization unit.
-    VMEM: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch
-      memory) per normalization unit.
-    LDS: The total number of LDS instructions (including, but not limited to, read/write/atomics
-      and HIP's __shfl instructions) executed per normalization unit.
-    GWS: Total number of GDS (global data sync) instructions issued per normalization
-      unit.
-    BR: Total number of BRANCH instructions issued per normalization unit.
-    Active CUs: Total number of active compute units (CUs) on the accelerator during
-      the kernel execution.
-    Num CUs: Total number of compute units (CUs) on the accelerator.
-    VGPR: 'The number of architected vector general-purpose registers allocated for
-      the kernel, see VALU. Note: this may not exactly match the number of VGPRs requested
-      by the compiler due to allocation granularity.'
-    SGPR: 'The number of scalar general-purpose registers allocated for the kernel,
-      see SALU. Note: this may not exactly match the number of SGPRs requested by
-      the compiler due to allocation granularity.'
-    LDS Allocation: 'The number of bytes of LDS memory (or, shared memory) allocated
-      for this kernel. Note: This may also be larger than what was requested at compile
-      time due to both allocation granularity and dynamic per-dispatch LDS allocations.'
-    Scratch Allocation: The number of bytes of scratch memory requested per work-item
-      for this kernel. Scratch memory is used for stack memory on the accelerator,
-      as well as for register spills and restores.
-    Wavefronts: The total number of wavefronts, summed over all workgroups, forming
-      this kernel launch.
-    Workgroups: The total number of workgroups forming this kernel launch.
-    LDS Req: The total number of LDS instructions (including, but not limited to,
-      read/write/atomics and HIP's __shfl instructions) executed per normalization
-      unit.
-    LDS Util: Indicates what percent of the kernel's duration the LDS was actively
-      executing instructions (including, but not limited to, load, store, atomic and
-      HIP's __shfl operations). Calculated as the ratio of the total number of cycles
-      LDS was active over the total CU cycles.
-    LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return
-      / acknowledgment) required for an LDS instruction to complete.
-    VL1 Rd: The total number of incoming read requests from the address processing
-      unit after coalescing per normalization unit
-    VL1 Wr: The total number of incoming write requests from the address processing
-      unit after coalescing per normalization unit
-    VL1 Atomic: The total number of incoming atomic requests from the address processing
-      unit after coalescing per normalization unit
-    VL1 Hit: The ratio of the number of vL1D cache line requests that hit in vL1D
-      cache over the total number of cache line requests to the vL1D Cache RAM.
-    VL1 Lat: Calculated as the average number of cycles that a vL1D cache line request
-      spent in the vL1D cache pipeline.
-    VL1 Coalesce: Indicates how well memory instructions were coalesced by the address
-      processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated
-      as the average number of thread-requests generated per instruction divided by
-      the ideal number of thread-requests per instruction.
-    VL1 Stall: The ratio of the number of cycles where the vL1D is stalled waiting
-      to issue a request for data to the L2 cache divided by the number of cycles
-      where the vL1D is active.
-    VL1_L2 Rd: The number of read requests for a vL1D cache line that were not satisfied
-      by the vL1D and must be retrieved from the to the L2 Cache per normalization
-      unit.
-    VL1_L2 Wr: The number of write requests to a vL1D cache line that were sent through
-      the vL1D to the L2 cache, per normalization unit.
-    VL1_L2 Atomic: The number of atomic requests that are sent through the vL1D to
-      the L2 cache, per normalization unit. This includes requests for atomics with,
-      and without return.
-    sL1D Rd: The total number of requests, of any size or type, made to the sL1D per
-      normalization unit.
-    sL1D Hit: The total number of sL1D requests that hit on a previously loaded cache
-      line, per normalization unit.
-    sL1D_L2 Rd: The total number of read requests from sL1D to the L2, per normalization
-      unit.
-    sL1D_L2 Wr: The total number of write requests from sL1D to the L2, per normalization
-      unit. Typically unused on current CDNA accelerators.
-    sL1D_L2 Atomic: The total number of atomic requests from sL1D to the L2, per normalization
-      unit. Typically unused on current CDNA accelerators.
-    IL1 Fetch: The total number of requests made to the L1I per normalization-unit.
-    IL1 Hit: The percent of L1I requests that hit on a previously loaded line the
-      cache. Calculated as the ratio of the number of L1I requests that hit over the
-      number of all L1I requests.
-    IL1 Lat: The average number of cycles spent to fetch instructions to a CU.
-    IL1_L2 Rd: The total number of requests across the L1I - L2 interface per normalization-unit.
-    L2 Rd: The total number of read requests to the L2 from all clients.
-    L2 Wr: The total number of write requests to the L2 from all clients.
-    L2 Atomic: The total number of atomic requests (with and without return) to the
-      L2 from all clients.
-    L2 Hit: The ratio of the number of L2 cache line requests that hit in the L2 cache
-      over the total number of incoming cache line requests to the L2 cache.
-    L2 Rd Lat: Calculated as the average number of cycles that the vL1D cache took
-      to issue and receive read requests from the L2 Cache. This number also includes
-      requests for atomics with return values.
-    L2 Wr Lat: Calculated as the average number of cycles that the vL1D cache took
-      to issue and receive acknowledgement of a write request to the L2 Cache. This
-      number also includes requests for atomics without return values.
-    Fabric_L2 Rd: Number of L2 cache - Infinity Fabric read requests (either 32-byte
-      or 64-byte) summed over TCC instances per normalization unit.
-    Fabric_L2 Wr: Number of L2 cache - Infinity Fabric write requests (either 32-byte
-      or 64-byte) summed over TCC instances per normalization unit.
-    Fabric_L2 Atomic: Number of L2 cache - Infinity Fabric write requests (either
-      32-byte or 64-byte) that are actually atomic requests summed over TCC instances
-      per normalization unit.
-    Fabric Rd Lat: The time-averaged number of cycles read requests spent in Infinity
-      Fabric before data was returned to the L2.
-    Fabric Wr Lat: The time-averaged number of cycles write requests spent in Infinity
-      Fabric before a completion acknowledgement was returned to the L2.
-    Fabric Atomic Lat: The time-averaged number of cycles atomic requests spent in
-      Infinity Fabric before a completion acknowledgement (atomic without return value)
-      or data (atomic with return value) was returned to the L2.
-    HBM Rd: The total number of L2 requests to Infinity Fabric to read 32B or 64B
-      of data from the accelerator's local HBM, per normalization unit.
-    HBM Wr: 'The total number of L2 requests to Infinity Fabric to write or atomically
-      update 32B or 64B of data in the accelerator''s local HBM, per normalization
-      unit. '
   data source:
   - metric_table:
       id: 301
@@ -244,13 +128,13 @@ Panel Config:
           value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0)
         Fabric Rd Lat:
           value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-            != 0) else  0)), 0)
+            != 0) else 0)), 0)
         Fabric Wr Lat:
           value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-            != 0) else  0)), 0)
+            != 0) else 0)), 0)
         Fabric Atomic Lat:
           value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
-            != 0) else  0)), 0)
+            != 0) else 0)), 0)
         HBM Rd:
           value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0)
         HBM Wr:
@@ -258,3 +142,117 @@ Panel Config:
       comparable: false
       cli_style: mem_chart
       tui_style: mem_chart
+  metrics_description:
+    Wavefront Occupancy: Wavefronts per active CU.
+    Wave Life: Average number of cycles executing a wave.
+    SALU: Total number of SALU (Scalar ALU) instructions issued per normalization
+      unit.
+    SMEM: Total number of SMEM (Scalar Memory Read) instructions issued per normalization
+      unit.
+    VALU: The number of VALU (Vector ALU) instructions issued per normalization unit.
+    MFMA: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued per
+      normalization unit.
+    VMEM: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch
+      memory) per normalization unit.
+    LDS: The total number of LDS instructions (including, but not limited to, read/write/atomics
+      and HIP's __shfl instructions) executed per normalization unit.
+    GWS: Total number of GDS (global data sync) instructions issued per normalization
+      unit.
+    BR: Total number of BRANCH instructions issued per normalization unit.
+    Active CUs: Total number of active compute units (CUs) on the accelerator during
+      the kernel execution.
+    Num CUs: Total number of compute units (CUs) on the accelerator.
+    VGPR: |-
+      The number of architected vector general-purpose registers allocated
+      for the kernel, see VALU. Note: this may not exactly match the number of VGPRs
+      requested by the compiler due to allocation granularity.
+    SGPR: |-
+      The number of scalar general-purpose registers allocated for the kernel,
+      see SALU. Note: this may not exactly match the number of SGPRs requested by
+      the compiler due to allocation granularity.
+    LDS Allocation: |-
+      The number of bytes of LDS memory (or, shared memory) allocated for
+      this kernel. Note: This may also be larger than what was requested at compile
+      time due to both allocation granularity and dynamic per-dispatch LDS allocations.
+    Scratch Allocation: The number of bytes of scratch memory requested per work-item
+      for this kernel. Scratch memory is used for stack memory on the accelerator,
+      as well as for register spills and restores.
+    Wavefronts: The total number of wavefronts, summed over all workgroups, forming
+      this kernel launch.
+    Workgroups: The total number of workgroups forming this kernel launch.
+    LDS Req: The total number of LDS instructions (including, but not limited to,
+      read/write/atomics and HIP's __shfl instructions) executed per normalization
+      unit.
+    LDS Util: Indicates what percent of the kernel's duration the LDS was actively
+      executing instructions (including, but not limited to, load, store, atomic and
+      HIP's __shfl operations). Calculated as the ratio of the total number of cycles
+      LDS was active over the total CU cycles.
+    LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return
+      / acknowledgment) required for an LDS instruction to complete.
+    VL1 Rd: The total number of incoming read requests from the address processing
+      unit after coalescing per normalization unit.
+    VL1 Wr: The total number of incoming write requests from the address processing
+      unit after coalescing per normalization unit.
+    VL1 Atomic: The total number of incoming atomic requests from the address processing
+      unit after coalescing per normalization unit.
+    VL1 Hit: The ratio of the number of vL1D cache line requests that hit in vL1D
+      cache over the total number of cache line requests to the vL1D Cache RAM.
+    VL1 Lat: Calculated as the average number of cycles that a vL1D cache line request
+      spent in the vL1D cache pipeline.
+    VL1 Coalesce: Indicates how well memory instructions were coalesced by the address
+      processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated
+      as the average number of thread-requests generated per instruction divided by
+      the ideal number of thread-requests per instruction.
+    VL1 Stall: The ratio of the number of cycles where the vL1D is stalled waiting
+      to issue a request for data to the L2 cache divided by the number of cycles
+      where the vL1D is active.
+    VL1_L2 Rd: The number of read requests for a vL1D cache line that were not satisfied
+      by the vL1D and must be retrieved from the L2 Cache per normalization
+      unit.
+    VL1_L2 Wr: The number of write requests to a vL1D cache line that were sent through
+      the vL1D to the L2 cache, per normalization unit.
+    VL1_L2 Atomic: The number of atomic requests that are sent through the vL1D to
+      the L2 cache, per normalization unit. This includes requests for atomics with,
+      and without return.
+    sL1D Rd: The total number of requests, of any size or type, made to the sL1D per
+      normalization unit.
+    sL1D Hit: The total number of sL1D requests that hit on a previously loaded cache
+      line, per normalization unit.
+    sL1D_L2 Rd: The total number of read requests from sL1D to the L2, per normalization
+      unit.
+    sL1D_L2 Wr: The total number of write requests from sL1D to the L2, per normalization
+      unit. Typically unused on current CDNA accelerators.
+    sL1D_L2 Atomic: The total number of atomic requests from sL1D to the L2, per normalization
+      unit. Typically unused on current CDNA accelerators.
+    IL1 Fetch: The total number of requests made to the L1I per normalization-unit.
+    IL1 Hit: The percent of L1I requests that hit on a previously loaded line in the
+      cache. Calculated as the ratio of the number of L1I requests that hit over the
+      number of all L1I requests.
+    IL1 Lat: The average number of cycles spent to fetch instructions to a CU.
+    IL1_L2 Rd: The total number of requests across the L1I - L2 interface per normalization-unit.
+    L2 Rd: The total number of read requests to the L2 from all clients.
+    L2 Wr: The total number of write requests to the L2 from all clients.
+    L2 Atomic: The total number of atomic requests (with and without return) to the
+      L2 from all clients.
+    L2 Hit: The ratio of the number of L2 cache line requests that hit in the L2 cache
+      over the total number of incoming cache line requests to the L2 cache.
+    Fabric_L2 Rd: Number of L2 cache - Infinity Fabric read requests (either 32-byte
+      or 64-byte) summed over TCC instances per normalization unit.
+    Fabric_L2 Wr: Number of L2 cache - Infinity Fabric write requests (either 32-byte
+      or 64-byte) summed over TCC instances per normalization unit.
+    Fabric_L2 Atomic: Number of L2 cache - Infinity Fabric write requests (either
+      32-byte or 64-byte) that are actually atomic requests summed over TCC instances
+      per normalization unit.
+    Fabric Rd Lat: The time-averaged number of cycles read requests spent in Infinity
+      Fabric before data was returned to the L2.
+    Fabric Wr Lat: The time-averaged number of cycles write requests spent in Infinity
+      Fabric before a completion acknowledgement was returned to the L2.
+    Fabric Atomic Lat: The time-averaged number of cycles atomic requests spent in
+      Infinity Fabric before a completion acknowledgement (atomic without return value)
+      or data (atomic with return value) was returned to the L2.
+    HBM Rd: The total number of L2 requests to Infinity Fabric to read 32B or 64B
+      of data from the accelerator's local HBM, per normalization unit.
+    HBM Wr: |-
+      The total number of L2 requests to Infinity Fabric to write or atomically
+      update 32B or 64B of data in the accelerator's local HBM, per normalization
+      unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline.yaml
index 3cca25864f..6731ebfceb 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline.yaml
@@ -2,85 +2,6 @@
 Panel Config:
   id: 400
   title: Roofline
-  metrics_description:
-    VALU FLOPs (F16): 'The total 16-bit floating-point operations executed per second
-      on the VALU. This is presented with the value of the peak empirical F16 FLOPs
-      achievable on the specific accelerator. Note: this does not include any F16
-      operations from MFMA instructions.'
-    VALU FLOPs (F32): 'The total 32-bit floating-point operations executed per second
-      on the VALU. This is presented with the value of the peak empirical F32 FLOPs
-      achievable on the specific accelerator. Note: this does not include any F32
-      operations from MFMA instructions.'
-    VALU FLOPs (F64): 'The total 64-bit floating-point operations executed per second
-      on the VALU. This is presented with the value of the peak empirical F64 FLOPs
-      achievable on the specific accelerator. Note: this does not include any F64
-      operations from MFMA instructions.'
-    MFMA FLOPs (F8): The total number of 8-bit brain floating point MFMA operations
-      executed per second. This does not include any 16-bit brain floating point operations
-      from VALU instructions. The peak empirically measured F8 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison. It is supported
-      on AMD Instinct MI300 series and later only.
-    MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations
-      executed per second. Note: this does not include any 16-bit brain floating point
-      operations from VALU instructions. The peak empirically measured BF16 MFMA operations
-      achievable on the specific accelerator is displayed alongside for comparison.'
-    MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed
-      per second. Note: this does not include any 16-bit floating point operations
-      from VALU instructions. The peak empirically measured F16 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison.'
-    MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed
-      per second. Note: this does not include any 32-bit floating point operations
-      from VALU instructions. The peak empirically measured F32 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison.'
-    MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed
-      per second. Note: this does not include any 64-bit floating point operations
-      from VALU instructions. The peak empirically measured F64 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison.'
-    MFMA FLOPs (F6F4): 'The total number of 4-bit and 6-bit floating point MFMA operations
-      executed per second. Note: this does not include any floating point operations
-      from VALU instructions. The peak empirically measured F6F4 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison. It is supported
-      on AMD Instinct MI350 series (gfx950) and later only.'
-    MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed
-      per second. Note: this does not include any 8-bit integer operations from VALU
-      instructions. The peak empirically measured INT8 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison.'
-    HBM Bandwidth: The total number of bytes read from and written to High-Bandwidth
-      Memory (HBM) per second. The peak empirically measured bandwidth achievable
-      on the specific accelerator is displayed alongside for comparison.
-    L2 Cache Bandwidth: The number of bytes looked up in the L2 cache per unit time.
-      The number of bytes is calculated as the number of cache lines requested multiplied
-      by the cache line size. This value does not consider partial requests, so e.g.,
-      if only a single value is requested in a cache line, the data movement will
-      still be counted as a full cache line. The peak empirically measured bandwidth
-      achievable on the specific accelerator is displayed alongside for comparison.
-    L1 Cache Bandwidth: The number of bytes looked up in the vL1D cache as a result
-      of VMEM instructions per unit time. The number of bytes is calculated as the
-      number of cache lines requested multiplied by the cache line size. This value
-      does not consider partial requests, so e.g., if only a single value is requested
-      in a cache line, the data movement will still be counted as a full cache line.
-      The peak empirically measured bandwidth achievable on the specific accelerator
-      is displayed alongside for comparison.
-    LDS Bandwidth: Indicates the maximum amount of bytes that could have been loaded
-      from, stored to, or atomically updated in the LDS per unit time (see LDS Bandwidth
-      example for more detail). The peak empirically measured LDS bandwidth achievable
-      on the specific accelerator is displayed alongside for comparison.
-    AI L1: The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
-      of total floating-point operations (FLOPs) to total bytes transferred between
-      the L1 cache and the processing units. This value is used as the x-coordinate
-      for the L1 roofline.
-    AI L2: The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
-      of total floating-point operations (FLOPs) to total bytes transferred between
-      the L2 cache and the L1 cache. This value is used as the x-coordinate for the
-      L2 roofline.
-    AI HBM: The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
-      It is the ratio of total floating-point operations (FLOPs) to total bytes transferred
-      between HBM and the L2 cache. This value is used as the x-coordinate for the
-      HBM roofline.
-    Performance (GFLOPs): The overall achieved performance, measured in GigaFLOPs
-      per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
-      operations divided by the total execution time. This value is used as the y-coordinate
-      for the kernel's point on the Roofline plot.
   data source:
   - metric_table:
       id: 401
@@ -218,3 +139,91 @@ Panel Config:
             512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8
             * 512) ) / (SUM(End_Timestamp - Start_Timestamp) / 1e9) ) / 1e9
           unit: GFLOP/s
+  metrics_description:
+    VALU FLOPs (F16): |-
+      The total 16-bit floating-point operations executed per second on the VALU.
+      This is presented with the value of the peak empirical F16 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F16 operations
+      from MFMA instructions.
+    VALU FLOPs (F32): |-
+      The total 32-bit floating-point operations executed per second on the VALU.
+      This is presented with the value of the peak empirical F32 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F32 operations
+      from MFMA instructions.
+    VALU FLOPs (F64): |-
+      The total 64-bit floating-point operations executed per second on the VALU.
+      This is presented with the value of the peak empirical F64 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F64 operations
+      from MFMA instructions.
+    MFMA FLOPs (F8): The total number of 8-bit floating point MFMA operations
+      executed per second. This does not include any 8-bit floating point operations
+      from VALU instructions. The peak empirically measured F8 MFMA operations achievable
+      on the specific accelerator is displayed alongside for comparison. This metric is
+      supported on AMD Instinct MI300 series and later only.
+    MFMA FLOPs (BF16): |-
+      The total number of 16-bit brain floating point MFMA operations executed
+      per second. Note: this does not include any 16-bit brain floating point
+      operations from VALU instructions. The peak empirically measured BF16 MFMA
+      operations achievable on the specific accelerator is displayed alongside
+      for comparison.
+    MFMA FLOPs (F16): |-
+      The total number of 16-bit floating point MFMA operations executed per
+      second. Note: this does not include any 16-bit floating point operations from
+      VALU instructions. The peak empirically measured F16 MFMA operations
+      achievable on the specific accelerator is displayed alongside for comparison.
+    MFMA FLOPs (F32): |-
+      The total number of 32-bit floating point MFMA operations executed per
+      second. Note: this does not include any 32-bit floating point operations from
+      VALU instructions. The peak empirically measured F32 MFMA operations
+      achievable on the specific accelerator is displayed alongside for comparison.
+    MFMA FLOPs (F64): |-
+      The total number of 64-bit floating point MFMA operations executed per
+      second. Note: this does not include any 64-bit floating point operations from
+      VALU instructions. The peak empirically measured F64 MFMA operations
+      achievable on the specific accelerator is displayed alongside for comparison.
+    MFMA IOPs (Int8): |-
+      The total number of 8-bit integer MFMA operations executed per second.
+      Note: this does not include any 8-bit integer operations from VALU instructions.
+      The peak empirically measured INT8 MFMA operations achievable on the specific
+      accelerator is displayed alongside for comparison.
+    HBM Bandwidth: |-
+      The total number of bytes read from and written to High-Bandwidth
+      Memory (HBM) per second. The peak empirically measured bandwidth achievable
+      on the specific accelerator is displayed alongside for comparison.
+    L2 Cache Bandwidth: The number of bytes looked up in the L2 cache per unit time.
+      The number of bytes is calculated as the number of cache lines requested multiplied
+      by the cache line size. This value does not consider partial requests, so e.g.,
+      if only a single value is requested in a cache line, the data movement will
+      still be counted as a full cache line. The peak empirically measured bandwidth
+      achievable on the specific accelerator is displayed alongside for comparison.
+    L1 Cache Bandwidth: The number of bytes looked up in the vL1D cache as a result
+      of VMEM instructions per unit time. The number of bytes is calculated as the
+      number of cache lines requested multiplied by the cache line size. This value
+      does not consider partial requests, so e.g., if only a single value is requested
+      in a cache line, the data movement will still be counted as a full cache line.
+      The peak empirically measured bandwidth achievable on the specific accelerator
+      is displayed alongside for comparison.
+    LDS Bandwidth: Indicates the maximum amount of bytes that could have been loaded
+      from, stored to, or atomically updated in the LDS per unit time (see LDS Bandwidth
+      example for more detail). The peak empirically measured LDS bandwidth achievable
+      on the specific accelerator is displayed alongside for comparison.
+    AI L1: |-
+      The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
+      of total floating-point operations (FLOPs) to total bytes transferred between
+      the L1 cache and the processing units. This value is used as the x-coordinate
+      for the L1 roofline.
+    AI L2: |-
+      The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
+      of total floating-point operations (FLOPs) to total bytes transferred between
+      the L2 cache and the L1 cache. This value is used as the x-coordinate for
+      the L2 roofline.
+    AI HBM: |-
+      The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
+      It is the ratio of total floating-point operations (FLOPs) to total bytes
+      transferred between HBM and the L2 cache. This value is used as the x-coordinate
+      for the HBM roofline.
+    Performance (GFLOPs): |-
+      The overall achieved performance, measured in GigaFLOPs
+      per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
+      operations divided by the total execution time. This value is used as the y-coordinate
+      for the kernel's point on the Roofline plot.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0500_command_processor_cpc_cpf.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0500_command_processor_cpc_cpf.yaml
index c4d2cabf52..118ce18331 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0500_command_processor_cpc_cpf.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0500_command_processor_cpc_cpf.yaml
@@ -2,30 +2,6 @@
 Panel Config:
   id: 500
   title: Command Processor (CPC/CPF)
-  metrics_description:
-    CPF Utilization: Percent of total cycles where the CPF was busy actively doing
-      any work. The ratio of CPF busy cycles over total cycles counted by the CPF.
-    CPF Stall: Percent of CPF busy cycles where the CPF was stalled for any reason.
-    CPF-L2 Utilization: Percent of total cycles counted by the CPF-L2 interface where
-      the CPF-L2 interface was active doing any work. The ratio of CPF-L2 busy cycles
-      over total cycles counted by the CPF-L2.
-    CPF-L2 Stall: Percent of CPF-L2 L2 busy cycles where the CPF-L2 interface was
-      stalled for any reason.
-    CPF-UTCL1 Stall: Percent of CPF busy cycles where the CPF was stalled by address
-      translation.
-    CPC Utilization: Percent of total cycles where the CPC was busy actively doing
-      any work. The ratio of CPC busy cycles over total cycles counted by the CPC.
-    CPC Stall Rate: Percent of CPC busy cycles where the CPC was stalled for any reason.
-    CPC Packet Decoding Utilization: Percent of CPC busy cycles spent decoding commands
-      for processing.
-    CPC-Workgroup Manager Utilization: Percent of CPC busy cycles spent dispatching
-      workgroups to the workgroup manager.
-    CPC-L2 Utilization: Percent of total cycles counted by the CPC-L2 interface where
-      the CPC-L2 interface was active doing any work.
-    CPC-UTCL1 Stall: Percent of CPC busy cycles where the CPC was stalled by address
-      translation
-    CPC-UTCL2 Utilization: 'Percent of total cycles counted by the CPC''s L2 address
-      translation interface where the CPC was busy doing address translation work.  '
   data source:
   - metric_table:
       id: 501
@@ -143,3 +119,28 @@ Panel Config:
           max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
             if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
           unit: pct
+  metrics_description:
+    CPF Utilization: Percent of total cycles where the CPF was busy actively doing
+      any work. The ratio of CPF busy cycles over total cycles counted by the CPF.
+    CPF Stall: Percent of CPF busy cycles where the CPF was stalled for any reason.
+    CPF-L2 Utilization: Percent of total cycles counted by the CPF-L2 interface where
+      the CPF-L2 interface was active doing any work. The ratio of CPF-L2 busy cycles
+      over total cycles counted by the CPF-L2.
+    CPF-L2 Stall: Percent of CPF-L2 busy cycles where the CPF-L2 interface was
+      stalled for any reason.
+    CPF-UTCL1 Stall: Percent of CPF busy cycles where the CPF was stalled by address
+      translation.
+    CPC Utilization: Percent of total cycles where the CPC was busy actively doing
+      any work. The ratio of CPC busy cycles over total cycles counted by the CPC.
+    CPC Stall Rate: Percent of CPC busy cycles where the CPC was stalled for any reason.
+    CPC Packet Decoding Utilization: Percent of CPC busy cycles spent decoding commands
+      for processing.
+    CPC-Workgroup Manager Utilization: Percent of CPC busy cycles spent dispatching
+      workgroups to the workgroup manager.
+    CPC-L2 Utilization: Percent of total cycles counted by the CPC-L2 interface where
+      the CPC-L2 interface was active doing any work.
+    CPC-UTCL1 Stall: Percent of CPC busy cycles where the CPC was stalled by address
+      translation.
+    CPC-UTCL2 Utilization: |-
+      Percent of total cycles counted by the CPC's L2 address translation
+      interface where the CPC was busy doing address translation work.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0600_workgroup_manager_spi.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0600_workgroup_manager_spi.yaml
index f6bf13d8b8..eb9845aa82 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0600_workgroup_manager_spi.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0600_workgroup_manager_spi.yaml
@@ -2,61 +2,6 @@
 Panel Config:
   id: 600
   title: Workgroup Manager (SPI)
-  metrics_description:
-    Accelerator Utilization: The percent of cycles in the kernel where the accelerator
-      was actively doing any work.
-    Scheduler-Pipe Utilization: The percent of total scheduler-pipe cycles in the
-      kernel where the scheduler-pipes were actively doing any work.
-    Workgroup Manager Utilization: The percent of cycles in the kernel where the workgroup
-      manager was actively doing any work.
-    Shader Engine Utilization: The percent of total shader engine cycles in the kernel
-      where any CU in a shader-engine was actively doing any work, normalized over
-      all shader-engines. Low values (e.g., << 100%) indicate that the accelerator
-      was not fully saturated by the kernel, or a potential load-imbalance issue.
-    SIMD Utilization: The percent of total SIMD cycles in the kernel where any SIMD
-      on a CU was actively doing any work, summed over all CUs. Low values (less than
-      100%) indicate that the accelerator was not fully saturated by the kernel, or
-      a potential load-imbalance issue.
-    Dispatched Workgroups: The total number of workgroups forming this kernel launch.
-    Dispatched Wavefronts: The total number of wavefronts, summed over all workgroups,
-      forming this kernel launch.
-    VGPR Writes: The average number of cycles spent initializing VGPRs at wave creation.
-    SGPR Writes: The average number of cycles spent initializing SGPRs at wave creation.
-    Not-scheduled Rate (Workgroup Manager): The percent of total scheduler-pipe cycles
-      in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck
-      within the workgroup manager rather than a lack of a CU or SIMD with sufficient
-      resources.
-    Not-scheduled Rate (Scheduler-Pipe): 'The percent of total scheduler-pipe cycles
-      in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck
-      within the scheduler-pipes rather than a lack of a CU or SIMD with sufficient
-      resources. '
-    Scheduler-Pipe Stall Rate: The percent of total scheduler-pipe cycles in the kernel
-      where a workgroup could not be scheduled to a CU due to occupancy limitations
-      (like a lack of a CU or SIMD with sufficient resources).
-    Scratch Stall Rate: The percent of total shader-engine cycles in the kernel where
-      a workgroup could not be scheduled to a CU due to lack of private (a.k.a., scratch)
-      memory slots. While this can reach up to 100%, note that the actual occupancy
-      limitations on a kernel using private memory are typically quite small (for
-      example, less than 1% of the total number of waves that can be scheduled to
-      an accelerator).
-    Insufficient SIMD Waveslots: The percent of total SIMD cycles in the kernel where
-      a workgroup could not be scheduled to a SIMD due to lack of available waveslots.
-    Insufficient SIMD VGPRs: The percent of total SIMD cycles in the kernel where
-      a workgroup could not be scheduled to a SIMD due to lack of available VGPRs.
-    Insufficient SIMD SGPRs: The percent of total SIMD cycles in the kernel where
-      a workgroup could not be scheduled to a SIMD due to lack of available SGPRs.
-    Insufficient CU LDS: The percent of total CU cycles in the kernel where a workgroup
-      could not be scheduled to a CU due to lack of available LDS.
-    Insufficient CU Barriers: The percent of total CU cycles in the kernel where a
-      workgroup could not be scheduled to a CU due to lack of available barriers.
-    Reached CU Workgroup Limit: The percent of total CU cycles in the kernel where
-      a workgroup could not be scheduled to a CU due to limits within the workgroup
-      manager. This is expected to be always be zero on CDNA2 or newer accelerators
-      (and small for previous accelerators).
-    Reached CU Wavefront Limit: The percent of total CU cycles in the kernel where
-      a wavefront could not be scheduled to a CU due to limits within the workgroup
-      manager. This is expected to be always be zero on CDNA2 or newer accelerators
-      (and small for previous accelerators).
   data source:
   - metric_table:
       id: 601
@@ -199,3 +144,58 @@ Panel Config:
           min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
           max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
           unit: Pct
+  metrics_description:
+    Accelerator Utilization: The percent of cycles in the kernel where the accelerator
+      was actively doing any work.
+    Scheduler-Pipe Utilization: The percent of total scheduler-pipe cycles in the
+      kernel where the scheduler-pipes were actively doing any work.
+    Workgroup Manager Utilization: The percent of cycles in the kernel where the workgroup
+      manager was actively doing any work.
+    Shader Engine Utilization: The percent of total shader engine cycles in the kernel
+      where any CU in a shader-engine was actively doing any work, normalized over
+      all shader-engines. Low values (e.g., << 100%) indicate that the accelerator
+      was not fully saturated by the kernel, or a potential load-imbalance issue.
+    SIMD Utilization: The percent of total SIMD cycles in the kernel where any SIMD
+      on a CU was actively doing any work, summed over all CUs. Low values (less than
+      100%) indicate that the accelerator was not fully saturated by the kernel, or
+      a potential load-imbalance issue.
+    Dispatched Workgroups: The total number of workgroups forming this kernel launch.
+    Dispatched Wavefronts: The total number of wavefronts, summed over all workgroups,
+      forming this kernel launch.
+    VGPR Writes: The average number of cycles spent initializing VGPRs at wave creation.
+    SGPR Writes: The average number of cycles spent initializing SGPRs at wave creation.
+    Not-scheduled Rate (Workgroup Manager): The percent of total scheduler-pipe cycles
+      in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck
+      within the workgroup manager rather than a lack of a CU or SIMD with sufficient
+      resources.
+    Not-scheduled Rate (Scheduler-Pipe): |-
+      The percent of total scheduler-pipe cycles in the kernel where a workgroup
+      could not be scheduled to a CU due to a bottleneck within the scheduler-pipes
+      rather than a lack of a CU or SIMD with sufficient resources.
+    Scheduler-Pipe Stall Rate: The percent of total scheduler-pipe cycles in the kernel
+      where a workgroup could not be scheduled to a CU due to occupancy limitations
+      (like a lack of a CU or SIMD with sufficient resources).
+    Scratch Stall Rate: The percent of total shader-engine cycles in the kernel where
+      a workgroup could not be scheduled to a CU due to lack of private (a.k.a., scratch)
+      memory slots. While this can reach up to 100%, note that the actual occupancy
+      limitations on a kernel using private memory are typically quite small (for
+      example, less than 1% of the total number of waves that can be scheduled to
+      an accelerator).
+    Insufficient SIMD Waveslots: The percent of total SIMD cycles in the kernel where
+      a workgroup could not be scheduled to a SIMD due to lack of available waveslots.
+    Insufficient SIMD VGPRs: The percent of total SIMD cycles in the kernel where
+      a workgroup could not be scheduled to a SIMD due to lack of available VGPRs.
+    Insufficient SIMD SGPRs: The percent of total SIMD cycles in the kernel where
+      a workgroup could not be scheduled to a SIMD due to lack of available SGPRs.
+    Insufficient CU LDS: The percent of total CU cycles in the kernel where a workgroup
+      could not be scheduled to a CU due to lack of available LDS.
+    Insufficient CU Barriers: The percent of total CU cycles in the kernel where a
+      workgroup could not be scheduled to a CU due to lack of available barriers.
+    Reached CU Workgroup Limit: The percent of total CU cycles in the kernel where
+      a workgroup could not be scheduled to a CU due to limits within the workgroup
+      manager. This is expected to always be zero on CDNA2 or newer accelerators
+      (and small for previous accelerators).
+    Reached CU Wavefront Limit: The percent of total CU cycles in the kernel where
+      a wavefront could not be scheduled to a CU due to limits within the workgroup
+      manager. This is expected to always be zero on CDNA2 or newer accelerators
+      (and small for previous accelerators).
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0700_wavefront.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0700_wavefront.yaml
index 5e332c0b8f..e9e9407cfc 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0700_wavefront.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0700_wavefront.yaml
@@ -2,63 +2,6 @@
 Panel Config:
   id: 700
   title: Wavefront
-  metrics_description:
-    Grid Size: The total number of work-items (or, threads) launched as a part of
-      the kernel dispatch. In HIP, this is equivalent to the total grid size multiplied
-      by the total workgroup (or, block) size.
-    Workgroup Size: The total number of work-items (or, threads) in each workgroup
-      (or, block) launched as part of the kernel dispatch. In HIP, this is equivalent
-      to the total block size.
-    Total Wavefronts: "The total number of wavefronts launched as part of the kernel\
-      \ dispatch. On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs,\
-      \ the wavefront size is always 64 work-items. Thus, the total number of wavefronts\
-      \ should be equivalent to the ceiling of grid size divided by 64."
-    Saved Wavefronts: The total number of wavefronts saved at a context-save.
-    Restored Wavefronts: The total number of wavefronts restored from a context-save.
-    VGPRs: 'The number of architected vector general-purpose registers allocated for
-      the kernel, see VALU. Note: this may not exactly match the number of VGPRs requested
-      by the compiler due to allocation granularity.'
-    AGPRs: 'The number of accumulation vector general-purpose registers allocated
-      for the kernel, see AGPRs. Note: this may not exactly match the number of AGPRs
-      requested by the compiler due to allocation granularity.'
-    SGPRs: 'The number of scalar general-purpose registers allocated for the kernel,
-      see SALU. Note: this may not exactly match the number of SGPRs requested by
-      the compiler due to allocation granularity.'
-    LDS Allocation: 'The number of bytes of LDS memory (or, shared memory) allocated
-      for this kernel. Note: This may also be larger than what was requested at compile
-      time due to both allocation granularity and dynamic per-dispatch LDS allocations.'
-    Scratch Allocation: The number of bytes of scratch memory requested per work-item
-      for this kernel. Scratch memory is used for stack memory on the accelerator,
-      as well as for register spills and restores.
-    Kernel Time: The total duration of the executed kernel.
-    Kernel Time (Cycles): The total duration of the executed kernel in cycles.
-    Instructions per wavefront: The average number of instructions (of all types)
-      executed per wavefront. This is averaged over all wavefronts in a kernel dispatch.
-    Wave Cycles: The number of cycles a wavefront in the kernel dispatch spent resident
-      on a compute unit per normalization unit. This is averaged over all wavefronts
-      in a kernel dispatch.
-    Dependency Wait Cycles: The number of cycles a wavefront in the kernel dispatch
-      spent resident on a compute unit per normalization unit. This is averaged over
-      all wavefronts in a kernel dispatch.
-    Issue Wait Cycles: The number of cycles a wavefront in the kernel dispatch was
-      unable to issue an instruction for any reason (e.g., execution pipe back-pressure,
-      arbitration loss, etc.) per normalization unit. This counter is incremented
-      at every cycle by all wavefronts on a CU unable to issue an instruction. As
-      such, it is most useful to get a sense of how waves were spending their time,
-      rather than identification of a precise limiter because another wave could be
-      actively executing while a wave is issue stalled. The sum of this metric, Dependency
-      Wait Cycles and Active Cycles should be equal to the total Wave Cycles metric.
-    Active Cycles: The average number of cycles a wavefront in the kernel dispatch
-      was actively executing instructions per normalization unit. This measurement
-      is made on a per-wavefront basis, and may include cycles that another wavefront
-      spent actively executing (on another execution unit, for example) or was stalled.
-      As such, it is most useful to get a sense of how waves were spending their time,
-      rather than identification of a precise limiter. The sum of this metric, Issue
-      Wait Cycles and Active Wait Cycles should be equal to the total Wave Cycles
-      metric.
-    Wavefront Occupancy: 'The time-averaged number of wavefronts resident on the accelerator
-      over the lifetime of the kernel. Note: this metric may be inaccurate for short-running
-      kernels (less than 1ms).'
   data source:
   - metric_table:
       id: 701
@@ -171,3 +114,66 @@ Panel Config:
           max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
           unit: Wavefronts
           coll_level: SQ_LEVEL_WAVES
+  metrics_description:
+    Grid Size: The total number of work-items (or, threads) launched as a part of
+      the kernel dispatch. In HIP, this is equivalent to the total grid size multiplied
+      by the total workgroup (or, block) size.
+    Workgroup Size: The total number of work-items (or, threads) in each workgroup
+      (or, block) launched as part of the kernel dispatch. In HIP, this is equivalent
+      to the total block size.
+    Total Wavefronts: |-
+      The total number of wavefronts launched as part of the kernel dispatch.
+      On AMD Instinct™ CDNA™ accelerators and GCN™ GPUs, the wavefront
+      size is always 64 work-items. Thus, the total number of wavefronts should
+      be equivalent to the ceiling of grid size divided by 64.
+    Saved Wavefronts: The total number of wavefronts saved at a context-save.
+    Restored Wavefronts: The total number of wavefronts restored from a context-save.
+    VGPRs: |-
+      The number of architected vector general-purpose registers allocated
+      for the kernel, see VALU. Note: this may not exactly match the number of VGPRs
+      requested by the compiler due to allocation granularity.
+    AGPRs: |-
+      The number of accumulation vector general-purpose registers allocated
+      for the kernel, see AGPRs. Note: this may not exactly match the number of
+      AGPRs requested by the compiler due to allocation granularity.
+    SGPRs: |-
+      The number of scalar general-purpose registers allocated for the kernel,
+      see SALU. Note: this may not exactly match the number of SGPRs requested by
+      the compiler due to allocation granularity.
+    LDS Allocation: |-
+      The number of bytes of LDS memory (or, shared memory) allocated for
+      this kernel. Note: This may also be larger than what was requested at compile
+      time due to both allocation granularity and dynamic per-dispatch LDS allocations.
+    Scratch Allocation: The number of bytes of scratch memory requested per work-item
+      for this kernel. Scratch memory is used for stack memory on the accelerator,
+      as well as for register spills and restores.
+    Kernel Time: The total duration of the executed kernel.
+    Kernel Time (Cycles): The total duration of the executed kernel in cycles.
+    Instructions per wavefront: The average number of instructions (of all types)
+      executed per wavefront. This is averaged over all wavefronts in a kernel dispatch.
+    Wave Cycles: The number of cycles a wavefront in the kernel dispatch spent resident
+      on a compute unit per normalization unit. This is averaged over all wavefronts
+      in a kernel dispatch.
+    Dependency Wait Cycles: The number of cycles a wavefront in the kernel dispatch
+      stalled waiting on memory of any kind (e.g., instruction fetch, vector or scalar
+      memory) per normalization unit. This is averaged over all wavefronts in a kernel dispatch.
+    Issue Wait Cycles: The number of cycles a wavefront in the kernel dispatch was
+      unable to issue an instruction for any reason (e.g., execution pipe back-pressure,
+      arbitration loss, etc.) per normalization unit. This counter is incremented
+      at every cycle by all wavefronts on a CU unable to issue an instruction. As
+      such, it is most useful to get a sense of how waves were spending their time,
+      rather than identification of a precise limiter because another wave could be
+      actively executing while a wave is issue stalled. The sum of this metric, Dependency
+      Wait Cycles and Active Cycles should be equal to the total Wave Cycles metric.
+    Active Cycles: The average number of cycles a wavefront in the kernel dispatch
+      was actively executing instructions per normalization unit. This measurement
+      is made on a per-wavefront basis, and may include cycles that another wavefront
+      spent actively executing (on another execution unit, for example) or was stalled.
+      As such, it is most useful to get a sense of how waves were spending their time,
+      rather than identification of a precise limiter. The sum of this metric, Issue
+      Wait Cycles and Dependency Wait Cycles should be equal to the total Wave Cycles
+      metric.
+    Wavefront Occupancy: |-
+      The time-averaged number of wavefronts resident on the accelerator over
+      the lifetime of the kernel. Note: this metric may be inaccurate for short-running
+      kernels (less than 1ms).
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1000_compute_units_instruction_mix.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1000_compute_units_instruction_mix.yaml
index 9c923d7bb7..768fe6548b 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1000_compute_units_instruction_mix.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1000_compute_units_instruction_mix.yaml
@@ -2,90 +2,6 @@
 Panel Config:
   id: 1000
   title: Compute Units - Instruction Mix
-  metrics_description:
-    VALU: The total number of vector arithmetic logic unit (VALU) operations issued.
-      These are the workhorses of the compute unit, and are used to execute a wide
-      range of instruction types including floating point operations, non-uniform
-      address calculations, transcendental operations, integer operations, shifts,
-      conditional evaluation, etc.
-    VMEM: The total number of vector memory operations issued. These include most
-      loads, stores and atomic operations and all accesses to generic, global, private
-      and texture memory.
-    LDS: The total number of LDS (also known as shared memory) operations issued.
-      These include loads, stores, atomics, and HIP's __shfl operations.
-    MFMA: The total number of matrix fused multiply-add instructions issued.
-    SALU: The total number of scalar arithmetic logic unit (SALU) operations issued.
-      Typically these are used for address calculations, literal constants, and other
-      operations that are provably uniform across a wavefront. Although scalar memory
-      (SMEM) operations are issued by the SALU, they are counted separately in this
-      section.
-    SMEM: The total number of scalar memory (SMEM) operations issued. These are typically
-      used for loading kernel arguments, base-pointers and loads from HIP's __constant__
-      memory.
-    Branch: The total number of branch operations issued. These typically consist
-      of jump or branch operations and are used to implement control flow.
-    INT32: The total number of instructions operating on 32-bit integer operands issued
-      to the VALU per normalization unit.
-    INT64: The total number of instructions operating on 64-bit integer operands issued
-      to the VALU per normalization unit.
-    F16-ADD: The total number of addition instructions operating on 16-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F16-MUL: The total number of multiplication instructions operating on 16-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F16-FMA: The total number of fused multiply-add instructions operating on 16-bit
-      floating-point operands issued to the VALU per normalization unit.
-    F16-Trans: The total number of transcendental instructions (e.g., sqrt) operating
-      on 16-bit floating-point operands issued to the VALU per normalization unit.
-    F32-ADD: The total number of addition instructions operating on 32-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F32-MUL: The total number of multiplication instructions operating on 32-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F32-FMA: The total number of fused multiply-add instructions operating on 32-bit
-      floating-point operands issued to the VALU per normalization unit.
-    F32-Trans: The total number of transcendental instructions (such as sqrt) operating
-      on 32-bit floating-point operands issued to the VALU per normalization unit.
-    F64-ADD: The total number of addition instructions operating on 64-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F64-MUL: The total number of multiplication instructions operating on 64-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F64-FMA: The total number of fused multiply-add instructions operating on 64-bit
-      floating-point operands issued to the VALU per normalization unit.
-    F64-Trans: The total number of transcendental instructions (such as sqrt) operating
-      on 64-bit floating-point operands issued to the VALU per normalization unit.
-    Conversion: "The total number of type conversion instructions (such as converting\
-      \ data to or from F32\u2194F64) issued to the VALU per normalization unit."
-    Global/Generic Instr: The total number of global & generic memory instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Global/Generic Read: The total number of global & generic memory read instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Global/Generic Write: The total number of global & generic memory write instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Global/Generic Atomic: The total number of global & generic memory atomic (with
-      and without return) instructions executed on all compute units on the accelerator,
-      per normalization unit.
-    Spill/Stack Instr: The total number of spill/stack memory instructions executed
-      on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Read: The total number of spill/stack memory read instructions executed
-      on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Write: The total number of spill/stack memory write instructions executed
-      on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Atomic: The total number of spill/stack memory atomic (with and without
-      return) instructions executed on all compute units on the accelerator, per normalization
-      unit. Typically unused as these memory operations are typically used to implement
-      thread-local storage.
-    MFMA-I8: The total number of 8-bit integer MFMA instructions issued per normalization
-      unit.
-    MFMA-F8: The total number of 8-bit floating point MFMA instructions issued per
-      normalization unit. This is supported in AMD Instinct MI300 series and later
-      only.
-    MFMA-F16: The total number of 16-bit floating point MFMA instructions issued per
-      normalization unit.
-    MFMA-BF16: The total number of 16-bit brain floating point MFMA instructions issued
-      per normalization unit.
-    MFMA-F32: The total number of 32-bit floating-point MFMA instructions issued per
-      normalization unit.
-    MFMA-F64: The total number of 64-bit floating-point MFMA instructions issued per
-      normalization unit.
   data source:
   - metric_table:
       id: 1001
@@ -307,3 +223,88 @@ Panel Config:
           min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom))
           max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom))
           unit: (instr + $normUnit)
+  metrics_description:
+    VALU: The total number of vector arithmetic logic unit (VALU) operations issued.
+      These are the workhorses of the compute unit, and are used to execute a wide
+      range of instruction types including floating point operations, non-uniform
+      address calculations, transcendental operations, integer operations, shifts,
+      conditional evaluation, etc.
+    VMEM: The total number of vector memory operations issued. These include most
+      loads, stores and atomic operations and all accesses to generic, global, private
+      and texture memory.
+    LDS: The total number of LDS (also known as shared memory) operations issued.
+      These include loads, stores, atomics, and HIP's __shfl operations.
+    MFMA: The total number of matrix fused multiply-add instructions issued.
+    SALU: The total number of scalar arithmetic logic unit (SALU) operations issued.
+      Typically these are used for address calculations, literal constants, and other
+      operations that are provably uniform across a wavefront. Although scalar memory
+      (SMEM) operations are issued by the SALU, they are counted separately in this
+      section.
+    SMEM: The total number of scalar memory (SMEM) operations issued. These are typically
+      used for loading kernel arguments, base-pointers and loads from HIP's __constant__
+      memory.
+    Branch: The total number of branch operations issued. These typically consist
+      of jump or branch operations and are used to implement control flow.
+    INT32: The total number of instructions operating on 32-bit integer operands issued
+      to the VALU per normalization unit.
+    INT64: The total number of instructions operating on 64-bit integer operands issued
+      to the VALU per normalization unit.
+    F16-ADD: The total number of addition instructions operating on 16-bit floating-point
+      operands issued to the VALU per normalization unit.
+    F16-MUL: The total number of multiplication instructions operating on 16-bit floating-point
+      operands issued to the VALU per normalization unit.
+    F16-FMA: The total number of fused multiply-add instructions operating on 16-bit
+      floating-point operands issued to the VALU per normalization unit.
+    F16-Trans: The total number of transcendental instructions (e.g., sqrt) operating
+      on 16-bit floating-point operands issued to the VALU per normalization unit.
+    F32-ADD: The total number of addition instructions operating on 32-bit floating-point
+      operands issued to the VALU per normalization unit.
+    F32-MUL: The total number of multiplication instructions operating on 32-bit floating-point
+      operands issued to the VALU per normalization unit.
+    F32-FMA: The total number of fused multiply-add instructions operating on 32-bit
+      floating-point operands issued to the VALU per normalization unit.
+    F32-Trans: The total number of transcendental instructions (such as sqrt) operating
+      on 32-bit floating-point operands issued to the VALU per normalization unit.
+    F64-ADD: The total number of addition instructions operating on 64-bit floating-point
+      operands issued to the VALU per normalization unit.
+    F64-MUL: The total number of multiplication instructions operating on 64-bit floating-point
+      operands issued to the VALU per normalization unit.
+    F64-FMA: The total number of fused multiply-add instructions operating on 64-bit
+      floating-point operands issued to the VALU per normalization unit.
+    F64-Trans: The total number of transcendental instructions (such as sqrt) operating
+      on 64-bit floating-point operands issued to the VALU per normalization unit.
+    Conversion: |-
+      The total number of type conversion instructions (such as converting
+      data to or from F32↔F64) issued to the VALU per normalization unit.
+    Global/Generic Instr: The total number of global & generic memory instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Global/Generic Read: The total number of global & generic memory read instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Global/Generic Write: The total number of global & generic memory write instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Global/Generic Atomic: The total number of global & generic memory atomic (with
+      and without return) instructions executed on all compute units on the accelerator,
+      per normalization unit.
+    Spill/Stack Instr: The total number of spill/stack memory instructions executed
+      on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Read: The total number of spill/stack memory read instructions executed
+      on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Write: The total number of spill/stack memory write instructions executed
+      on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Atomic: The total number of spill/stack memory atomic (with and without
+      return) instructions executed on all compute units on the accelerator, per normalization
+      unit. Typically unused, as these memory operations are generally used to implement
+      thread-local storage.
+    MFMA-I8: The total number of 8-bit integer MFMA instructions issued per normalization
+      unit.
+    MFMA-F8: The total number of 8-bit floating point MFMA instructions issued per
+      normalization unit. This is supported in AMD Instinct MI300 series and later
+      only.
+    MFMA-F16: The total number of 16-bit floating point MFMA instructions issued per
+      normalization unit.
+    MFMA-BF16: The total number of 16-bit brain floating point MFMA instructions issued
+      per normalization unit.
+    MFMA-F32: The total number of 32-bit floating-point MFMA instructions issued per
+      normalization unit.
+    MFMA-F64: The total number of 64-bit floating-point MFMA instructions issued per
+      normalization unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1100_compute_units_compute_pipeline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1100_compute_units_compute_pipeline.yaml
index 5285c6b279..5e6ceb654f 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1100_compute_units_compute_pipeline.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1100_compute_units_compute_pipeline.yaml
@@ -2,84 +2,6 @@
 Panel Config:
   id: 1100
   title: Compute Units - Compute Pipeline
-  metrics_description:
-    VALU FLOPs: 'The total floating-point operations executed per second on the VALU.
-      This is also presented as a percent of the peak theoretical FLOPs achievable
-      on the specific accelerator. Note: this does not include any floating-point
-      operations from MFMA instructions.'
-    VALU IOPs: 'The total integer operations executed per second on the VALU. This
-      is also presented as a percent of the peak theoretical IOPs achievable on the
-      specific accelerator. Note: this does not include any integer operations from
-      MFMA instructions.'
-    MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations
-      executed per second. Note: this does not include any 16-bit brain floating point
-      operations from VALU instructions. This is also presented as a percent of the
-      peak theoretical BF16 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed
-      per second. Note: this does not include any 16-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F16 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed
-      per second. Note: this does not include any 32-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F32 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed
-      per second. Note: this does not include any 64-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F64 MFMA operations achievable on the specific accelerator.'
-    MFMA IOPs (INT8): 'The total number of 8-bit integer MFMA operations executed
-      per second. Note: this does not include any 8-bit integer operations from VALU
-      instructions. This is also presented as a percent of the peak theoretical INT8
-      MFMA operations achievable on the specific accelerator.'
-    IPC: The ratio of the total number of instructions executed on the CU over the
-      total active CU cycles.
-    IPC (Issued): The ratio of the total number of (non-internal) instructions issued
-      over the number of cycles where the scheduler was actively working on issuing
-      instructions.
-    SALU Utilization: Indicates what percent of the kernel's duration the SALU was
-      busy executing instructions. Computed as the ratio of the total number of cycles
-      spent by the scheduler issuing SALU / SMEM instructions over the total CU cycles.
-    VALU Utilization: Indicates what percent of the kernel's duration the VALU was
-      busy executing instructions. Does not include VMEM operations. Computed as the
-      ratio of the total number of cycles spent by the scheduler issuing VALU instructions
-      over the total CU cycles.
-    VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit
-      was busy executing instructions, including both global/generic and spill/scratch
-      operations (see the VMEM instruction count metrics for more detail). Does not
-      include VALU operations. Computed as the ratio of the total number of cycles
-      spent by the scheduler issuing VMEM instructions over the total CU cycles.
-    Branch Utilization: Indicates what percent of the kernel's duration the branch
-      unit was busy executing instructions. Computed as the ratio of the total number
-      of cycles spent by the scheduler issuing branch instructions over the total
-      CU cycles.
-    VALU Active Threads: Indicates the average level of divergence within a wavefront
-      over the lifetime of the kernel. The number of work-items that were active in
-      a wavefront during execution of each VALU instruction, time-averaged over all
-      VALU instructions run on all wavefronts in the kernel
-    MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit
-      was busy executing instructions. Computed as the ratio of the total number of
-      cycles spent by the MFMA was busy over the total CU cycles.
-    MFMA Instruction Cycles: The average duration of MFMA instructions in this kernel
-      in cycles. Computed as the ratio of the total number of cycles the MFMA unit
-      was busy over the total number of MFMA instructions.
-    VMEM Latency: The average number of round-trip cycles (that is, from issue to
-      data return / acknowledgment) required for a VMEM instruction to complete.
-    SMEM Latency: The average number of round-trip cycles (that is, from issue to
-      data return / acknowledgment) required for a SMEM instruction to complete.
-    FLOPs (Total): The total number of floating-point operations executed on either
-      the VALU or MFMA units, per normalization unit.
-    IOPs (Total): The total number of integer operations executed on either the VALU
-      or MFMA units, per normalization unit.
-    F16 OPs: The total number of 16-bit floating-point operations executed on either
-      the VALU or MFMA units, per normalization unit.
-    BF16 OPs: The total number of 16-bit brain floating-point operations executed
-      on either the VALU or MFMA units, per normalization unit.
-    F32 OPs: The total number of 32-bit floating-point operations executed on either
-      the VALU or MFMA units, per normalization unit.
-    F64 OPs: The total number of 64-bit floating-point operations executed on either
-      the VALU or MFMA units, per normalization unit.
-    INT8 OPs: The total number of 8-bit integer operations executed on either the
-      VALU or MFMA units, per normalization unit.
   data source:
   - metric_table:
       id: 1101
@@ -165,13 +87,13 @@ Panel Config:
           unit: Instr/cycle
         IPC (Issued):
           avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-            + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED  + SQ_INSTS_LDS)
+            + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
             / SQ_ACTIVE_INST_ANY))
           min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
             + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
             / SQ_ACTIVE_INST_ANY))
           max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-            + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED  + SQ_INSTS_LDS)
+            + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
             / SQ_ACTIVE_INST_ANY))
           unit: Instr/cycle
         SALU Utilization:
@@ -271,7 +193,7 @@ Panel Config:
             + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
             + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
             / $denom))
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
         IOPs (Total):
           avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
             * 512)) / $denom)
@@ -279,12 +201,12 @@ Panel Config:
             * 512)) / $denom)
           max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
             * 512)) / $denom)
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
         F8 OPs:
           avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
           min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
           max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
         F16 OPs:
           avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
             + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512
@@ -295,12 +217,12 @@ Panel Config:
           max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
             + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512
             * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
         BF16 OPs:
           avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
           min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
           max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
         F32 OPs:
           avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
             + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
@@ -311,7 +233,7 @@ Panel Config:
           max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
             + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
             / $denom))
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
         F64 OPs:
           avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
             + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
@@ -322,9 +244,94 @@ Panel Config:
           max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
             + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
             / $denom))
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
         INT8 OPs:
           avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
           min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
           max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
+  metrics_description:
+    VALU FLOPs: |-
+      The total floating-point operations executed per second on the VALU.
+      This is also presented as a percent of the peak theoretical FLOPs achievable
+      on the specific accelerator. Note: this does not include any floating-point
+      operations from MFMA instructions.
+    VALU IOPs: |-
+      The total integer operations executed per second on the VALU. This is
+      also presented as a percent of the peak theoretical IOPs achievable on the
+      specific accelerator. Note: this does not include any integer operations from
+      MFMA instructions.
+    MFMA FLOPs (BF16): |-
+      The total number of 16-bit brain floating point MFMA operations executed
+      per second. Note: this does not include any 16-bit brain floating point operations
+      from VALU instructions. This is also presented as a percent of the peak theoretical
+      BF16 MFMA operations achievable on the specific accelerator.
+    MFMA FLOPs (F16): |-
+      The total number of 16-bit floating point MFMA operations executed per
+      second. Note: this does not include any 16-bit floating point operations from
+      VALU instructions. This is also presented as a percent of the peak theoretical
+      F16 MFMA operations achievable on the specific accelerator.
+    MFMA FLOPs (F32): |-
+      The total number of 32-bit floating point MFMA operations executed per
+      second. Note: this does not include any 32-bit floating point operations from
+      VALU instructions. This is also presented as a percent of the peak theoretical
+      F32 MFMA operations achievable on the specific accelerator.
+    MFMA FLOPs (F64): |-
+      The total number of 64-bit floating point MFMA operations executed per
+      second. Note: this does not include any 64-bit floating point operations from
+      VALU instructions. This is also presented as a percent of the peak theoretical
+      F64 MFMA operations achievable on the specific accelerator.
+    MFMA IOPs (INT8): |-
+      The total number of 8-bit integer MFMA operations executed per second.
+      Note: this does not include any 8-bit integer operations from VALU instructions.
+      This is also presented as a percent of the peak theoretical INT8 MFMA operations
+      achievable on the specific accelerator.
+    IPC: The ratio of the total number of instructions executed on the CU over the
+      total active CU cycles.
+    IPC (Issued): The ratio of the total number of (non-internal) instructions issued
+      over the number of cycles where the scheduler was actively working on issuing
+      instructions.
+    SALU Utilization: Indicates what percent of the kernel's duration the SALU was
+      busy executing instructions. Computed as the ratio of the total number of cycles
+      spent by the scheduler issuing SALU / SMEM instructions over the total CU cycles.
+    VALU Utilization: Indicates what percent of the kernel's duration the VALU was
+      busy executing instructions. Does not include VMEM operations. Computed as the
+      ratio of the total number of cycles spent by the scheduler issuing VALU instructions
+      over the total CU cycles.
+    VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit
+      was busy executing instructions, including both global/generic and spill/scratch
+      operations (see the VMEM instruction count metrics for more detail). Does not
+      include VALU operations. Computed as the ratio of the total number of cycles
+      spent by the scheduler issuing VMEM instructions over the total CU cycles.
+    Branch Utilization: Indicates what percent of the kernel's duration the branch
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles spent by the scheduler issuing branch instructions over the total
+      CU cycles.
+    VALU Active Threads: Indicates the average level of divergence within a wavefront
+      over the lifetime of the kernel. The number of work-items that were active in
+      a wavefront during execution of each VALU instruction, time-averaged over all
+      VALU instructions run on all wavefronts in the kernel.
+    MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit
+      was busy executing instructions. Computed as the ratio of the total number of
+      cycles the MFMA unit was busy over the total CU cycles.
+    MFMA Instruction Cycles: The average duration of MFMA instructions in this kernel
+      in cycles. Computed as the ratio of the total number of cycles the MFMA unit
+      was busy over the total number of MFMA instructions.
+    VMEM Latency: The average number of round-trip cycles (that is, from issue to
+      data return / acknowledgment) required for a VMEM instruction to complete.
+    SMEM Latency: The average number of round-trip cycles (that is, from issue to
+      data return / acknowledgment) required for a SMEM instruction to complete.
+    FLOPs (Total): The total number of floating-point operations executed on either
+      the VALU or MFMA units, per normalization unit.
+    IOPs (Total): The total number of integer operations executed on either the VALU
+      or MFMA units, per normalization unit.
+    F16 OPs: The total number of 16-bit floating-point operations executed on either
+      the VALU or MFMA units, per normalization unit.
+    BF16 OPs: The total number of 16-bit brain floating-point operations executed
+      on either the VALU or MFMA units, per normalization unit.
+    F32 OPs: The total number of 32-bit floating-point operations executed on either
+      the VALU or MFMA units, per normalization unit.
+    F64 OPs: The total number of 64-bit floating-point operations executed on either
+      the VALU or MFMA units, per normalization unit.
+    INT8 OPs: The total number of 8-bit integer operations executed on either the
+      VALU or MFMA units, per normalization unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1200_local_data_share_lds.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1200_local_data_share_lds.yaml
index 2718654ad4..b7767fea16 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1200_local_data_share_lds.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1200_local_data_share_lds.yaml
@@ -2,51 +2,6 @@
 Panel Config:
   id: 1200
   title: Local Data Share (LDS)
-  metrics_description:
-    Utilization: Indicates what percent of the kernel's duration the LDS was actively
-      executing instructions (including, but not limited to, load, store, atomic and
-      HIP's __shfl operations). Calculated as the ratio of the total number of cycles
-      LDS was active over the total CU cycles.
-    Access Rate: Indicates the percentage of SIMDs in the VALU actively issuing LDS
-      instructions, averaged over the lifetime of the kernel. Calculated as the ratio
-      of the total number of cycles spent by the scheduler issuing LDS instructions
-      over the total CU cycles.
-    Theoretical Bandwidth Utilization: Indicates the maximum amount of bytes that
-      could have been loaded from, stored to, or atomically updated in the LDS divided
-      as percentage of theoretical peak. Does not take into account the execution
-      mask of the wavefront when the instruction was executed.
-    Theoretical Bandwidth: Indicates the maximum amount of bytes that could have been
-      loaded from, stored to, or atomically updated in the LDS divided by total duration.
-      Does not take into account the execution mask of the wavefront when the instruction
-      was executed.
-    Bank Conflict Rate: Indicates the percentage of active LDS cycles that were spent
-      servicing bank conflicts. Calculated as the ratio of LDS cycles spent servicing
-      bank conflicts over the number of LDS cycles that would have been required to
-      move the same amount of data in an uncontended access.
-    LDS Instructions: The total number of LDS instructions (including, but not limited
-      to, read/write/atomics and HIP's __shfl instructions) executed per normalization
-      unit.
-    LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return
-      / acknowledgment) required for an LDS instruction to complete.
-    Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS scheduler
-      due to bank conflicts (as determined by the conflict resolution hardware) to
-      the base number of cycles that would be spent in the LDS scheduler in a completely
-      uncontended case. This is the unnormalized form of the Bank Conflict Rate.
-    Index Accesses: The total number of cycles spent in the LDS scheduler over all
-      operations per normalization unit.
-    Atomic Return Cycles: The total number of cycles spent on LDS atomics with return
-      per normalization unit.
-    Bank Conflict: The total number of cycles spent in the LDS scheduler due to bank
-      conflicts (as determined by the conflict resolution hardware) per normalization
-      unit.
-    Addr Conflict: The total number of cycles spent in the LDS scheduler due to address
-      conflicts (as determined by the conflict resolution hardware) per normalization
-      unit.
-    Unaligned Stall: The total number of cycles spent in the LDS scheduler due to
-      stalls from non-dword aligned addresses per normalization unit.
-    Mem Violations: "The total number of out-of-bounds accesses made to the LDS, per\
-      \ normalization unit. This is unused and expected to be zero in most configurations\
-      \ for modern CDNA\u2122 accelerators."
   data source:
   - metric_table:
       id: 1201
@@ -87,7 +42,7 @@ Panel Config:
           avg: AVG((SQ_INSTS_LDS / $denom))
           min: MIN((SQ_INSTS_LDS / $denom))
           max: MAX((SQ_INSTS_LDS / $denom))
-          unit: (Instr  + $normUnit)
+          unit: (Instr + $normUnit)
         Theoretical Bandwidth:
           avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
             / (End_Timestamp - Start_Timestamp)))
@@ -117,29 +72,75 @@ Panel Config:
           avg: AVG((SQ_LDS_IDX_ACTIVE / $denom))
           min: MIN((SQ_LDS_IDX_ACTIVE / $denom))
           max: MAX((SQ_LDS_IDX_ACTIVE / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Atomic Return Cycles:
           avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom))
           min: MIN((SQ_LDS_ATOMIC_RETURN / $denom))
           max: MAX((SQ_LDS_ATOMIC_RETURN / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Bank Conflict:
           avg: AVG((SQ_LDS_BANK_CONFLICT / $denom))
           min: MIN((SQ_LDS_BANK_CONFLICT / $denom))
           max: MAX((SQ_LDS_BANK_CONFLICT / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Addr Conflict:
           avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom))
           min: MIN((SQ_LDS_ADDR_CONFLICT / $denom))
           max: MAX((SQ_LDS_ADDR_CONFLICT / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Unaligned Stall:
           avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom))
           min: MIN((SQ_LDS_UNALIGNED_STALL / $denom))
           max: MAX((SQ_LDS_UNALIGNED_STALL / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Mem Violations:
           avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom))
           min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom))
           max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom))
           unit: (Accesses + $normUnit)
+  metrics_description:
+    Utilization: Indicates what percent of the kernel's duration the LDS was actively
+      executing instructions (including, but not limited to, load, store, atomic and
+      HIP's __shfl operations). Calculated as the ratio of the total number of cycles
+      LDS was active over the total CU cycles.
+    Access Rate: Indicates the percentage of SIMDs in the VALU actively issuing LDS
+      instructions, averaged over the lifetime of the kernel. Calculated as the ratio
+      of the total number of cycles spent by the scheduler issuing LDS instructions
+      over the total CU cycles.
+    Theoretical Bandwidth Utilization: Indicates the maximum amount of bytes that
+      could have been loaded from, stored to, or atomically updated in the LDS, expressed
+      as a percentage of the theoretical peak. Does not take into account the execution
+      mask of the wavefront when the instruction was executed.
+    Theoretical Bandwidth: Indicates the maximum amount of bytes that could have been
+      loaded from, stored to, or atomically updated in the LDS divided by total duration.
+      Does not take into account the execution mask of the wavefront when the instruction
+      was executed.
+    Bank Conflict Rate: Indicates the percentage of active LDS cycles that were spent
+      servicing bank conflicts. Calculated as the ratio of LDS cycles spent servicing
+      bank conflicts over the number of LDS cycles that would have been required to
+      move the same amount of data in an uncontended access.
+    LDS Instructions: The total number of LDS instructions (including, but not limited
+      to, read/write/atomics and HIP's __shfl instructions) executed per normalization
+      unit.
+    LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return
+      / acknowledgment) required for an LDS instruction to complete.
+    Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS scheduler
+      due to bank conflicts (as determined by the conflict resolution hardware) to
+      the base number of cycles that would be spent in the LDS scheduler in a completely
+      uncontended case. This is the unnormalized form of the Bank Conflict Rate.
+    Index Accesses: The total number of cycles spent in the LDS scheduler over all
+      operations per normalization unit.
+    Atomic Return Cycles: The total number of cycles spent on LDS atomics with return
+      per normalization unit.
+    Bank Conflict: The total number of cycles spent in the LDS scheduler due to bank
+      conflicts (as determined by the conflict resolution hardware) per normalization
+      unit.
+    Addr Conflict: The total number of cycles spent in the LDS scheduler due to address
+      conflicts (as determined by the conflict resolution hardware) per normalization
+      unit.
+    Unaligned Stall: The total number of cycles spent in the LDS scheduler due to
+      stalls from non-dword aligned addresses per normalization unit.
+    Mem Violations: |-
+      The total number of out-of-bounds accesses made to the LDS, per normalization
+      unit. This is unused and expected to be zero in most configurations for
+      modern CDNA™ accelerators.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1300_instruction_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1300_instruction_cache.yaml
index aeda9bc6c7..35808d9d96 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1300_instruction_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1300_instruction_cache.yaml
@@ -2,28 +2,6 @@
 Panel Config:
   id: 1300
   title: Instruction Cache
-  metrics_description:
-    Bandwidth Utilization: The number of bytes looked up in the L1I cache, as a percent
-      of the peak theoretical bandwidth. Calculated as the ratio of L1I requests over
-      the total L1I cycles.
-    Cache Hit Rate: The percent of L1I requests that hit [#l1i-cache]_ on a previously
-      loaded line the cache. Calculated as the ratio of the number of L1I requests
-      that hit over the number of all L1I requests.
-    L1I-L2 Bandwidth Utilization: "The percent of the peak theoretical L1I \u2192\
-      \ L2 cache request bandwidth achieved. Calculated as the ratio of the total\
-      \ number of requests from the L1I to the L2 cache over the total L1I-L2 interface\
-      \ cycles."
-    L1I-L2 Bandwidth: Total number of bytes transferred across L1I - L2 interface
-      divided by total duration.
-    Req: The total number of requests made to the L1I per normalization-unit
-    Hits: The total number of L1I requests that hit on a previously loaded cache line,
-      per normalization-unit.
-    Misses - Non Duplicated: The total number of L1I requests that missed on a cache
-      line that were not already pending due to another request, per normalization-unit.
-    Misses - Duplicated: The total number of L1I requests that missed on a cache line
-      that were already pending due to another request, per normalization-unit.
-    Instruction Fetch Latency: The average number of cycles spent to fetch instructions
-      to a CU.
   data source:
   - metric_table:
       id: 1301
@@ -62,22 +40,22 @@ Panel Config:
           avg: AVG((SQC_ICACHE_REQ / $denom))
           min: MIN((SQC_ICACHE_REQ / $denom))
           max: MAX((SQC_ICACHE_REQ / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Hits:
           avg: AVG((SQC_ICACHE_HITS / $denom))
           min: MIN((SQC_ICACHE_HITS / $denom))
           max: MAX((SQC_ICACHE_HITS / $denom))
-          unit: (Hits  + $normUnit)
+          unit: (Hits + $normUnit)
         Misses - Non Duplicated:
           avg: AVG((SQC_ICACHE_MISSES / $denom))
           min: MIN((SQC_ICACHE_MISSES / $denom))
           max: MAX((SQC_ICACHE_MISSES / $denom))
-          unit: (Misses  + $normUnit)
+          unit: (Misses + $normUnit)
         Misses - Duplicated:
           avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom))
           min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom))
           max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom))
-          unit: (Misses  + $normUnit)
+          unit: (Misses + $normUnit)
         Cache Hit Rate:
           avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
             + SQC_ICACHE_MISSES_DUPLICATE)))
@@ -107,3 +85,25 @@ Panel Config:
           min: MIN(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
           max: MAX(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
           unit: Gbps
+  metrics_description:
+    Bandwidth Utilization: The number of bytes looked up in the L1I cache, as a percent
+      of the peak theoretical bandwidth. Calculated as the ratio of L1I requests over
+      the total L1I cycles.
+    Cache Hit Rate: The percent of L1I requests that hit [#l1i-cache]_ on a previously
+      loaded line in the cache. Calculated as the ratio of the number of L1I requests
+      that hit over the number of all L1I requests.
+    L1I-L2 Bandwidth Utilization: |-
+      The percent of the peak theoretical L1I → L2 cache request bandwidth
+      achieved. Calculated as the ratio of the total number of requests from the
+      L1I to the L2 cache over the total L1I-L2 interface cycles.
+    L1I-L2 Bandwidth: Total number of bytes transferred across L1I - L2 interface
+      divided by total duration.
+    Req: The total number of requests made to the L1I per normalization-unit.
+    Hits: The total number of L1I requests that hit on a previously loaded cache line,
+      per normalization-unit.
+    Misses - Non Duplicated: The total number of L1I requests that missed on a cache
+      line that were not already pending due to another request, per normalization-unit.
+    Misses - Duplicated: The total number of L1I requests that missed on a cache line
+      that were already pending due to another request, per normalization-unit.
+    Instruction Fetch Latency: The average number of cycles spent to fetch instructions
+      to a CU.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1400_scalar_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1400_scalar_l1_data_cache.yaml
index 282b97ad1f..6b73164848 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1400_scalar_l1_data_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1400_scalar_l1_data_cache.yaml
@@ -2,49 +2,6 @@
 Panel Config:
   id: 1400
   title: Scalar L1 Data Cache
-  metrics_description:
-    Bandwidth Utilization: The number of bytes looked up in the sL1D cache, as a percent
-      of the peak theoretical bandwidth. Calculated as the ratio of sL1D requests
-      over the total sL1D cycles.
-    Cache Hit Rate: Indicates the percent of sL1D requests that hit on a previously
-      loaded line the cache. The ratio of the number of sL1D requests that hit over
-      the number of all sL1D requests.
-    sL1D-L2 BW Utilization: The percentage of the peak theoretical sL1D - L2 interface
-      bandwidth acheived.\ \ Caclulated as total number of bytes read from, written
-      to, or atomically updated\ \ across the sL1D - L2 interface.
-    sL1D-L2 BW: "The total number of bytes read from, written to, or atomically updated\
-      \ across the sL1D\u2194L2 interface, divided by total duration. Note that sL1D\
-      \ writes and atomics are typically unused on current CDNA accelerators, so in\
-      \ the majority of cases this can be interpreted as an sL1D\u2192L2 read bandwidth."
-    Req: The total number of requests, of any size or type, made to the sL1D per normalization
-      unit.
-    Hits: The total number of sL1D requests that hit on a previously loaded cache
-      line, per normalization unit.
-    Misses - Non Duplicated: 'The total number of sL1D requests that missed on a cache
-      line that was not already pending due to another request, per normalization
-      unit. '
-    Misses- Duplicated: The total number of sL1D requests that missed on a cache line
-      that was already pending due to another request, per normalization unit.
-    Read Req (Total): The total number of sL1D read requests of any size, per normalization
-      unit.
-    Atomic Req: The total number of atomic requests from sL1D to the L2, per normalization
-      unit. Typically unused on current CDNA accelerators.
-    Read Req (1 DWord): The total number of sL1D read requests made for a single dword
-      of data (4B), per normalization unit.
-    Read Req (2 DWord): The total number of sL1D read requests made for a two dwords
-      of data (8B), per normalization unit.
-    Read Req (4 DWord): The total number of sL1D read requests made for a four dwords
-      of data (16B), per normalization unit.
-    Read Req (8 DWord): The total number of sL1D read requests made for a eight dwords
-      of data (32B), per normalization unit.
-    Read Req (16 DWord): The total number of sL1D read requests made for a sixteen
-      dwords of data (64B), per normalization unit.
-    Read Req: The total number of read requests from sL1D to the L2 per normalization
-      unit.
-    Write Req: The total number of write requests from sL1D to the L2, per normalization
-      unit. Typically unused on current CDNA accelerators.
-    Stall Cycles: "The total number of cycles the sL1D\u2194L2 interface was stalled,\
-      \ per normalization unit."
   data source:
   - metric_table:
       id: 1401
@@ -84,22 +41,22 @@ Panel Config:
           avg: AVG((SQC_DCACHE_REQ / $denom))
           min: MIN((SQC_DCACHE_REQ / $denom))
           max: MAX((SQC_DCACHE_REQ / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Hits:
           avg: AVG((SQC_DCACHE_HITS / $denom))
           min: MIN((SQC_DCACHE_HITS / $denom))
           max: MAX((SQC_DCACHE_HITS / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Misses - Non Duplicated:
           avg: AVG((SQC_DCACHE_MISSES / $denom))
           min: MIN((SQC_DCACHE_MISSES / $denom))
           max: MAX((SQC_DCACHE_MISSES / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Misses- Duplicated:
           avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom))
           min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom))
           max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Cache Hit Rate:
           avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
             + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
@@ -118,37 +75,37 @@ Panel Config:
             + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
           max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
             + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Atomic Req:
           avg: AVG((SQC_DCACHE_ATOMIC / $denom))
           min: MIN((SQC_DCACHE_ATOMIC / $denom))
           max: MAX((SQC_DCACHE_ATOMIC / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req (1 DWord):
           avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom))
           min: MIN((SQC_DCACHE_REQ_READ_1 / $denom))
           max: MAX((SQC_DCACHE_REQ_READ_1 / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req (2 DWord):
           avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom))
           min: MIN((SQC_DCACHE_REQ_READ_2 / $denom))
           max: MAX((SQC_DCACHE_REQ_READ_2 / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req (4 DWord):
           avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom))
           min: MIN((SQC_DCACHE_REQ_READ_4 / $denom))
           max: MAX((SQC_DCACHE_REQ_READ_4 / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req (8 DWord):
           avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom))
           min: MIN((SQC_DCACHE_REQ_READ_8 / $denom))
           max: MAX((SQC_DCACHE_REQ_READ_8 / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req (16 DWord):
           avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom))
           min: MIN((SQC_DCACHE_REQ_READ_16 / $denom))
           max: MAX((SQC_DCACHE_REQ_READ_16 / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
   - metric_table:
       id: 1403
       title: Scalar L1D Cache - L2 Interface
@@ -171,19 +128,65 @@ Panel Config:
           avg: AVG((SQC_TC_DATA_READ_REQ / $denom))
           min: MIN((SQC_TC_DATA_READ_REQ / $denom))
           max: MAX((SQC_TC_DATA_READ_REQ / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Write Req:
           avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom))
           min: MIN((SQC_TC_DATA_WRITE_REQ / $denom))
           max: MAX((SQC_TC_DATA_WRITE_REQ / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Atomic Req:
           avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom))
           min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom))
           max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Stall Cycles:
           avg: AVG((SQC_TC_STALL / $denom))
           min: MIN((SQC_TC_STALL / $denom))
           max: MAX((SQC_TC_STALL / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
+  metrics_description:
+    Bandwidth Utilization: The number of bytes looked up in the sL1D cache, as a percent
+      of the peak theoretical bandwidth. Calculated as the ratio of sL1D requests
+      over the total sL1D cycles.
+    Cache Hit Rate: Indicates the percent of sL1D requests that hit on a previously
+      loaded line in the cache. The ratio of the number of sL1D requests that hit over
+      the number of all sL1D requests.
+    sL1D-L2 BW Utilization: The percentage of the peak theoretical sL1D - L2 interface
+      bandwidth achieved. Calculated as total number of bytes read from, written to,
+      or atomically updated across the sL1D - L2 interface.
+    sL1D-L2 BW: |-
+      The total number of bytes read from, written to, or atomically updated
+      across the sL1D↔L2 interface, divided by total duration. Note that sL1D
+      writes and atomics are typically unused on current CDNA accelerators, so
+      in the majority of cases this can be interpreted as an sL1D→L2 read
+      bandwidth.
+    Req: The total number of requests, of any size or type, made to the sL1D per normalization
+      unit.
+    Hits: The total number of sL1D requests that hit on a previously loaded cache
+      line, per normalization unit.
+    Misses - Non Duplicated: |-
+      The total number of sL1D requests that missed on a cache line that was
+      not already pending due to another request, per normalization unit.
+    Misses- Duplicated: The total number of sL1D requests that missed on a cache line
+      that was already pending due to another request, per normalization unit.
+    Read Req (Total): The total number of sL1D read requests of any size, per normalization
+      unit.
+    Atomic Req: The total number of atomic requests from sL1D to the L2, per normalization
+      unit. Typically unused on current CDNA accelerators.
+    Read Req (1 DWord): The total number of sL1D read requests made for a single dword
+      of data (4B), per normalization unit.
+    Read Req (2 DWord): The total number of sL1D read requests made for two dwords
+      of data (8B), per normalization unit.
+    Read Req (4 DWord): The total number of sL1D read requests made for four dwords
+      of data (16B), per normalization unit.
+    Read Req (8 DWord): The total number of sL1D read requests made for eight dwords
+      of data (32B), per normalization unit.
+    Read Req (16 DWord): The total number of sL1D read requests made for sixteen
+      dwords of data (64B), per normalization unit.
+    Read Req: The total number of read requests from sL1D to the L2 per normalization
+      unit.
+    Write Req: The total number of write requests from sL1D to the L2, per normalization
+      unit. Typically unused on current CDNA accelerators.
+    Stall Cycles: |-
+      The total number of cycles the sL1D↔L2 interface was stalled, per
+      normalization unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1500_address_processing_unit_and_data_return_path_ta_td.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1500_address_processing_unit_and_data_return_path_ta_td.yaml
index cdbb5393aa..3fd1615719 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1500_address_processing_unit_and_data_return_path_ta_td.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1500_address_processing_unit_and_data_return_path_ta_td.yaml
@@ -2,70 +2,6 @@
 Panel Config:
   id: 1500
   title: Address Processing Unit and Data Return Path (TA/TD)
-  metrics_description:
-    Address Processing Unit Busy: Percent of the total CU cycles the address processor
-      was busy
-    Address Stall: Percent of the total CU cycles the address processor was stalled
-      from sending address requests further into the vL1D pipeline.
-    Data Stall: Percent of the total CU cycles the address processor was stalled from
-      sending write/atomic data further into the vL1D pipeline.
-    "Data-Processor \u2192 Address Stall": Percent of total CU cycles the address
-      processor was stalled waiting to send command data to the data processor.
-    Total Instructions: The total number of memory instructions executed by the address
-      processer over all compute units on the accelerator, per normalization unit.
-    Global/Generic Instructions: The total number of global & generic memory instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Global/Generic Read Instructions: The total number of global & generic memory
-      read instructions executed on all compute units on the accelerator, per normalization
-      unit.
-    Global/Generic Write Instructions: The total number of global & generic memory
-      write instructions executed on all compute units on the accelerator, per normalization
-      unit.
-    Global/Generic Atomic Instructions: The total number of global & generic memory
-      atomic (with and without return) instructions executed on all compute units
-      on the accelerator, per normalization unit.
-    Spill/Stack Instructions: The total number of spill/stack memory instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Read Instructions: The total number of spill/stack memory read instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Write Instructions: The total number of spill/stack memory write instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Atomic Instructions: The total number of spill/stack memory atomic
-      (with and without return) instructions executed on all compute units on the
-      accelerator, per normalization unit. Typically unused as these memory operations
-      are typically used to implement thread-local storage.
-    Spill/Stack Total Cycles: The number of cycles the address processing unit spent
-      working on spill/stack instructions, per normalization unit.
-    Spill/Stack Coalesced Read: The number of cycles the address processing unit spent
-      working on coalesced spill/stack read instructions, per normalization unit.
-    Spill/Stack Coalesced Write: The number of cycles the address processing unit
-      spent working on coalesced spill/stack write instructions, per normalization
-      unit.
-    Data-Return Busy: Percent of the total CU cycles the data-return unit was busy
-      processing or waiting on data to return to the CU.
-    "Cache RAM \u2192 Data-Return Stall": Percent of the total CU cycles the data-return
-      unit was stalled on data to be returned from the vL1D Cache RAM.
-    "Workgroup manager \u2192 Data-Return Stall": Percent of the total CU cycles the
-      data-return unit was stalled by the workgroup manager due to initialization
-      of registers as a part of launching new workgroups.
-    Coalescable Instructions: The number of instructions submitted to the data-return
-      unit by the address processor that were found to be coalescable, per normalization
-      unit.
-    Read Instructions: The number of read instructions submitted to the data-return
-      unit by the address processor summed over all compute units on the accelerator,
-      per normalization unit. This is expected to be the sum of global/generic and
-      spill/stack reads in the address processor.
-    Write Instructions: The number of store instructions submitted to the data-return
-      unit by the address processor summed over all compute units on the accelerator,
-      per normalization unit. This is expected to be the sum of global/generic and
-      spill/stack stores in the address processor.
-    Atomic Instructions: The number of atomic instructions submitted to the data-return
-      unit by the address processor summed over all compute units on the accelerator,
-      per normalization unit. This is expected to be the sum of global/generic and
-      spill/stack atomics in the address processor.
-    Write Ack Instructions: The total number of write acknowledgements submitted by
-      data-return unit to SQ, summed over all compute units on the accelerator, per
-      normalization unit.
   data source:
   - metric_table:
       id: 1501
@@ -135,47 +71,47 @@ Panel Config:
           avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom))
           min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom))
           max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Global/Generic Instructions:
           avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
           min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
           max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Global/Generic Read Instructions:
           avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
           min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
           max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Global/Generic Write Instructions:
           avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
           min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
           max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Global/Generic Atomic Instructions:
           avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
           min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
           max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Spill/Stack Instructions:
           avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
           min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
           max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Spill/Stack Read Instructions:
           avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
           min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
           max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Spill/Stack Write Instructions:
           avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
           min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
           max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Spill/Stack Atomic Instructions:
           avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
           min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
           max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
   - metric_table:
       id: 1503
       title: Spill and stack metrics
@@ -190,17 +126,17 @@ Panel Config:
           avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
           min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
           max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Spill/Stack Coalesced Read:
           avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
           min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
           max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Spill/Stack Coalesced Write:
           avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
           min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
           max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
   - metric_table:
       id: 1504
       title: Vector L1 data-return path or Texture Data (TD)
@@ -230,7 +166,7 @@ Panel Config:
           avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom))
           min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom))
           max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Read Instructions:
           avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
             / $denom))
@@ -238,14 +174,75 @@ Panel Config:
             / $denom))
           max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
             / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Write Instructions:
           avg: AVG((TD_STORE_WAVEFRONT_sum / $denom))
           min: MIN((TD_STORE_WAVEFRONT_sum / $denom))
           max: MAX((TD_STORE_WAVEFRONT_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Atomic Instructions:
           avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom))
           min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom))
           max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
+  metrics_description:
+    Address Processing Unit Busy: Percent of the total CU cycles the address processor
+      was busy
+    Address Stall: Percent of the total CU cycles the address processor was stalled
+      from sending address requests further into the vL1D pipeline.
+    Data Stall: Percent of the total CU cycles the address processor was stalled from
+      sending write/atomic data further into the vL1D pipeline.
+    "Data-Processor \u2192 Address Stall": Percent of total CU cycles the address
+      processor was stalled waiting to send command data to the data processor.
+    Total Instructions: The total number of memory instructions executed by the address
+      processor over all compute units on the accelerator, per normalization unit.
+    Global/Generic Instructions: The total number of global & generic memory instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Global/Generic Read Instructions: The total number of global & generic memory
+      read instructions executed on all compute units on the accelerator, per normalization
+      unit.
+    Global/Generic Write Instructions: The total number of global & generic memory
+      write instructions executed on all compute units on the accelerator, per normalization
+      unit.
+    Global/Generic Atomic Instructions: The total number of global & generic memory
+      atomic (with and without return) instructions executed on all compute units
+      on the accelerator, per normalization unit.
+    Spill/Stack Instructions: The total number of spill/stack memory instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Read Instructions: The total number of spill/stack memory read instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Write Instructions: The total number of spill/stack memory write instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Atomic Instructions: The total number of spill/stack memory atomic
+      (with and without return) instructions executed on all compute units on the
+      accelerator, per normalization unit. Typically unused, as these memory operations
+      are generally used to implement thread-local storage.
+    Spill/Stack Total Cycles: The number of cycles the address processing unit spent
+      working on spill/stack instructions, per normalization unit.
+    Spill/Stack Coalesced Read: The number of cycles the address processing unit spent
+      working on coalesced spill/stack read instructions, per normalization unit.
+    Spill/Stack Coalesced Write: The number of cycles the address processing unit
+      spent working on coalesced spill/stack write instructions, per normalization
+      unit.
+    Data-Return Busy: Percent of the total CU cycles the data-return unit was busy
+      processing or waiting on data to return to the CU.
+    "Cache RAM \u2192 Data-Return Stall": Percent of the total CU cycles the data-return
+      unit was stalled on data to be returned from the vL1D Cache RAM.
+    "Workgroup manager \u2192 Data-Return Stall": Percent of the total CU cycles the
+      data-return unit was stalled by the workgroup manager due to initialization
+      of registers as a part of launching new workgroups.
+    Coalescable Instructions: The number of instructions submitted to the data-return
+      unit by the address processor that were found to be coalescable, per normalization
+      unit.
+    Read Instructions: The number of read instructions submitted to the data-return
+      unit by the address processor summed over all compute units on the accelerator,
+      per normalization unit. This is expected to be the sum of global/generic and
+      spill/stack reads in the address processor.
+    Write Instructions: The number of store instructions submitted to the data-return
+      unit by the address processor summed over all compute units on the accelerator,
+      per normalization unit. This is expected to be the sum of global/generic and
+      spill/stack stores in the address processor.
+    Atomic Instructions: The number of atomic instructions submitted to the data-return
+      unit by the address processor summed over all compute units on the accelerator,
+      per normalization unit. This is expected to be the sum of global/generic and
+      spill/stack atomics in the address processor.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1600_vector_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1600_vector_l1_data_cache.yaml
index e5b5eb9e9c..3125397a30 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1600_vector_l1_data_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1600_vector_l1_data_cache.yaml
@@ -2,117 +2,6 @@
 Panel Config:
   id: 1600
   title: Vector L1 Data Cache
-  metrics_description:
-    Hit rate: The ratio of the number of vL1D cache line requests that hit in vL1D
-      cache over the total number of cache line requests to the vL1D Cache RAM.
-    Bandwidth Utilization: The number of bytes looked up in the vL1D cache as a result
-      of VMEM instructions, as a percent of the peak theoretical bandwidth achievable
-      on the specific accelerator. The number of bytes is calculated as the number
-      of cache lines requested multiplied by the cache line size. This value does
-      not consider partial requests, so for instance, if only a single value is requested
-      in a cache line, the data movement will still be counted as a full cache line.
-    Utilization: Indicates how busy the vL1D Cache RAM was during the kernel execution.
-      The number of cycles where the vL1D Cache RAM is actively processing any request
-      divided by the number of cycles where the vL1D is active.
-    Coalescing: Indicates how well memory instructions were coalesced by the address
-      processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated
-      as the average number of thread-requests generated per instruction divided by
-      the ideal number of thread-requests per instruction.
-    Stalled on L2 Data: The ratio of the number of cycles where the vL1D is stalled
-      waiting for requested data to return from the L2 cache divided by the number
-      of cycles where the vL1D is active.
-    Stalled on L2 Req: The ratio of the number of cycles where the vL1D is stalled
-      waiting to issue a request for data to the L2 cache divided by the number of
-      cycles where the vL1D is active.
-    Tag RAM Stall (Read): The ratio of the number of cycles where the vL1D is stalled
-      due to Read requests with conflicting tags being looked up concurrently, divided
-      by the number of cycles where the vL1D is active.
-    Tag RAM Stall (Write): The ratio of the number of cycles where the vL1D is stalled
-      due to Write requests with conflicting tags being looked up concurrently, divided
-      by the number of cycles where the vL1D is active.
-    Tag RAM Stall (Atomic): The ratio of the number of cycles where the vL1D is stalled
-      due to Atomic requests with conflicting tags being looked up concurrently, divided
-      by the number of cycles where the vL1D is active.
-    Total Req: The total number of incoming requests from the address processing unit
-      after coalescing.
-    Read Req: The total number of incoming read requests from the address processing
-      unit after coalescing per normalization unit.
-    Write Req: The total number of incoming write requests from the address processing
-      unit after coalescing per normalization unit.
-    Atomic Req: The total number of incoming atomic requests from the address processing
-      unit after coalescing per normalization unit.
-    Cache BW: The number of bytes looked up in the vL1D cache as a result of VMEM
-      instructions divided by total duration. The number of bytes is calculated as
-      the number of cache lines requested multiplied by the cache line size.  This
-      value does not consider partial requests, so for instance, if only a single
-      value is requested in a cache line, the data movement will still be counted
-      as a full cache line.
-    Cache Hit Rate: The ratio of the number of vL1D cache line requests that hit in
-      vL1D cache over the total number of cache line requests to the vL1D Cache RAM.
-    Cache Accesses: The total number of cache line lookups in the vL1D.
-    Cache Hits: The number of cache accesses minus the number of outgoing requests
-      to the L2 cache, that is, the number of cache line requests serviced by the
-      vL1D Cache RAM per normalization unit.
-    Invalidations: The number of times the vL1D was issued a write-back invalidate
-      command during the kernel's execution per normalization unit. This may be triggered
-      by, for instance, the buffer_wbinvl1 instruction.
-    L1-L2 BW: The number of bytes transferred across the vL1D-L2 interface as a result
-      of VMEM instructions, divided by total duration. The number of bytes is calculated
-      as the number of cache lines requested multiplied by the cache line size. This
-      value does not consider partial requests, so for instance, if only a single
-      value is requested in a cache line, the data movement will still be counted
-      as a full cache line.
-    L1-L2 Read: The number of read requests for a vL1D cache line that were not satisfied
-      by the vL1D and must be retrieved from the to the L2 Cache per normalization
-      unit.
-    L1-L2 Write: The number of write requests to a vL1D cache line that were sent
-      through the vL1D to the L2 cache, per normalization unit.
-    L1-L2 Atomic: The number of atomic requests that are sent through the vL1D to
-      the L2 cache, per normalization unit. This includes requests for atomics with,
-      and without return.
-    L1 Access Latency: Calculated as the average number of cycles that a vL1D cache
-      line request spent in the vL1D cache pipeline.
-    L1-L2 Read Latency: Calculated as the average number of cycles that the vL1D cache
-      took to issue and receive read requests from the L2 Cache. This number also
-      includes requests for atomics with return values.
-    L1-L2 Write Latency: Calculated as the average number of cycles that the vL1D
-      cache took to issue and receive acknowledgement of a write request to the L2
-      Cache. This number also includes requests for atomics without return values.
-    NC - Read: Total read requests with NC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    UC - Read: Total read requests with UC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    CC - Read: Total read requests with CC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    RW - Read: Total read requests with RW mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    RW - Write: Total write requests with RW mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    NC - Write: Total write requests with NC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    UC - Write: Total write requests with UC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    CC - Write: Total write requests with CC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    NC - Atomic: Total atomic requests with NC mtype from this TCP to all TCCs Sum
-      over TCP instances per normalization unit.
-    UC - Atomic: Total atomic requests with UC mtype from this TCP to all TCCs Sum
-      over TCP instances per normalization unit.
-    CC - Atomic: Total atomic requests with CC mtype from this TCP to all TCCs Sum
-      over TCP instances per normalization unit.
-    RW - Atomic: Total atomic requests with RW mtype from this TCP to all TCCs Sum
-      over TCP instances per normalization unit.
-    Req: The number of translation requests made to the UTCL1 per normalization unit.
-    Hit Ratio: The ratio of the number of translation requests that hit in the UTCL1
-      divided by the total number of translation requests made to the UTCL1.
-    Hits: The number of translation requests that hit in the UTCL1, and could be reused,
-      per normalization unit.
-    Translation Misses: The total number of translation requests that missed in the
-      UTCL1 due to  translation not being present in the cache, per normalization
-      unit.
-    Permission Misses: "The total number of translation requests that missed in the\
-      \ UTCL1 due to a permission error, per normalization unit. This is unused and\
-      \ expected to be zero in most configurations for modern CDNA\u2122 accelerators."
   data source:
   - metric_table:
       id: 1601
@@ -181,17 +70,17 @@ Panel Config:
           avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom))
           min: MIN((TCP_TOTAL_ACCESSES_sum / $denom))
           max: MAX((TCP_TOTAL_ACCESSES_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req:
           avg: AVG((TCP_TOTAL_READ_sum / $denom))
           min: MIN((TCP_TOTAL_READ_sum / $denom))
           max: MAX((TCP_TOTAL_READ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Write Req:
           avg: AVG((TCP_TOTAL_WRITE_sum / $denom))
           min: MIN((TCP_TOTAL_WRITE_sum / $denom))
           max: MAX((TCP_TOTAL_WRITE_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Atomic Req:
           avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
             / $denom))
@@ -199,7 +88,7 @@ Panel Config:
             / $denom))
           max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
             / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Cache BW:
           avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
           min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
@@ -223,7 +112,7 @@ Panel Config:
           avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
           min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
           max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Cache Hits:
           avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
             + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
@@ -234,7 +123,7 @@ Panel Config:
           max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
             + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
             / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Invalidations:
           avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
           min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
@@ -252,12 +141,12 @@ Panel Config:
           avg: AVG((TCP_TCC_READ_REQ_sum / $denom))
           min: MIN((TCP_TCC_READ_REQ_sum / $denom))
           max: MAX((TCP_TCC_READ_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         L1-L2 Write:
           avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom))
           min: MIN((TCP_TCC_WRITE_REQ_sum / $denom))
           max: MAX((TCP_TCC_WRITE_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         L1-L2 Atomic:
           avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
             / $denom))
@@ -265,7 +154,7 @@ Panel Config:
             / $denom))
           max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
             / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
   - metric_table:
       id: 1604
       title: L1D - L2 Transactions
@@ -284,84 +173,84 @@ Panel Config:
           avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom))
           min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom))
           max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         UC - Read:
           xfer: Read
           coherency: UC
           avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom))
           min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom))
           max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         CC - Read:
           xfer: Read
           coherency: CC
           avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom))
           min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom))
           max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         RW - Read:
           xfer: Read
           coherency: RW
           avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom))
           min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom))
           max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         RW - Write:
           xfer: Write
           coherency: RW
           avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom))
           min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom))
           max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         NC - Write:
           xfer: Write
           coherency: NC
           avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom))
           min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom))
           max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         UC - Write:
           xfer: Write
           coherency: UC
           avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom))
           min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom))
           max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         CC - Write:
           xfer: Write
           coherency: CC
           avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom))
           min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom))
           max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         NC - Atomic:
           xfer: Atomic
           coherency: NC
           avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
           min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
           max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         UC - Atomic:
           xfer: Atomic
           coherency: UC
           avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
           min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
           max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         CC - Atomic:
           xfer: Atomic
           coherency: CC
           avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
           min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
           max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         RW - Atomic:
           xfer: Atomic
           coherency: RW
           avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
           min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
           max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
   - metric_table:
       id: 1605
       title: L1 Unified Translation Cache (UTCL1)
@@ -410,3 +299,106 @@ Panel Config:
         max: Max
         units: Unit
       metric: {}
+  metrics_description:
+    Hit rate: The ratio of the number of vL1D cache line requests that hit in vL1D
+      cache over the total number of cache line requests to the vL1D Cache RAM.
+    Bandwidth Utilization: The number of bytes looked up in the vL1D cache as a result
+      of VMEM instructions, as a percent of the peak theoretical bandwidth achievable
+      on the specific accelerator. The number of bytes is calculated as the number
+      of cache lines requested multiplied by the cache line size. This value does
+      not consider partial requests, so for instance, if only a single value is requested
+      in a cache line, the data movement will still be counted as a full cache line.
+    Utilization: Indicates how busy the vL1D Cache RAM was during the kernel execution.
+      The number of cycles where the vL1D Cache RAM is actively processing any request
+      divided by the number of cycles where the vL1D is active.
+    Coalescing: Indicates how well memory instructions were coalesced by the address
+      processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated
+      as the average number of thread-requests generated per instruction divided by
+      the ideal number of thread-requests per instruction.
+    Stalled on L2 Data: The ratio of the number of cycles where the vL1D is stalled
+      waiting for requested data to return from the L2 cache divided by the number
+      of cycles where the vL1D is active.
+    Stalled on L2 Req: The ratio of the number of cycles where the vL1D is stalled
+      waiting to issue a request for data to the L2 cache divided by the number of
+      cycles where the vL1D is active.
+    Tag RAM Stall (Read): The ratio of the number of cycles where the vL1D is stalled
+      due to Read requests with conflicting tags being looked up concurrently, divided
+      by the number of cycles where the vL1D is active.
+    Tag RAM Stall (Write): The ratio of the number of cycles where the vL1D is stalled
+      due to Write requests with conflicting tags being looked up concurrently, divided
+      by the number of cycles where the vL1D is active.
+    Tag RAM Stall (Atomic): The ratio of the number of cycles where the vL1D is stalled
+      due to Atomic requests with conflicting tags being looked up concurrently, divided
+      by the number of cycles where the vL1D is active.
+    Total Req: The total number of incoming requests from the address processing unit
+      after coalescing per normalization unit.
+    Read Req: The total number of incoming read requests from the address processing
+      unit after coalescing per normalization unit.
+    Write Req: The total number of incoming write requests from the address processing
+      unit after coalescing per normalization unit.
+    Atomic Req: The total number of incoming atomic requests from the address processing
+      unit after coalescing per normalization unit.
+    Cache BW: The number of bytes looked up in the vL1D cache as a result of VMEM
+      instructions divided by total duration. The number of bytes is calculated as
+      the number of cache lines requested multiplied by the cache line size. This
+      value does not consider partial requests, so for instance, if only a single
+      value is requested in a cache line, the data movement will still be counted
+      as a full cache line.
+    Cache Hit Rate: The ratio of the number of vL1D cache line requests that hit in
+      vL1D cache over the total number of cache line requests to the vL1D Cache RAM.
+    Cache Accesses: The total number of cache line lookups in the vL1D per normalization unit.
+    Cache Hits: The number of cache accesses minus the number of outgoing requests
+      to the L2 cache, that is, the number of cache line requests serviced by the
+      vL1D Cache RAM per normalization unit.
+    Invalidations: The number of times the vL1D was issued a write-back invalidate
+      command during the kernel's execution per normalization unit. This may be triggered
+      by, for instance, the buffer_wbinvl1 instruction.
+    L1-L2 BW: The number of bytes transferred across the vL1D-L2 interface as a result
+      of VMEM instructions, divided by total duration. The number of bytes is calculated
+      as the number of cache lines requested multiplied by the cache line size. This
+      value does not consider partial requests, so for instance, if only a single
+      value is requested in a cache line, the data movement will still be counted
+      as a full cache line.
+    L1-L2 Read: The number of read requests for a vL1D cache line that were not satisfied
+      by the vL1D and must be retrieved from the L2 Cache per normalization
+      unit.
+    L1-L2 Write: The number of write requests to a vL1D cache line that were sent
+      through the vL1D to the L2 cache, per normalization unit.
+    L1-L2 Atomic: The number of atomic requests that are sent through the vL1D to
+      the L2 cache, per normalization unit. This includes requests for atomics with,
+      and without return.
+    NC - Read: Total read requests with NC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    UC - Read: Total read requests with UC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    CC - Read: Total read requests with CC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    RW - Read: Total read requests with RW mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    RW - Write: Total write requests with RW mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    NC - Write: Total write requests with NC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    UC - Write: Total write requests with UC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    CC - Write: Total write requests with CC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    NC - Atomic: Total atomic requests with NC mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    UC - Atomic: Total atomic requests with UC mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    CC - Atomic: Total atomic requests with CC mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    RW - Atomic: Total atomic requests with RW mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    Req: The number of translation requests made to the UTCL1 per normalization unit.
+    Hit Ratio: The ratio of the number of translation requests that hit in the UTCL1
+      divided by the total number of translation requests made to the UTCL1.
+    Hits: The number of translation requests that hit in the UTCL1, and could be reused,
+      per normalization unit.
+    Translation Misses: The total number of translation requests that missed in the
+      UTCL1 due to translation not being present in the cache, per normalization unit.
+    Permission Misses: "The total number of translation requests that missed in the
+      UTCL1 due to a permission error, per normalization unit. This is unused and
+      expected to be zero in most configurations for modern CDNA\u2122
+      accelerators."
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1700_l2_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1700_l2_cache.yaml
index 2bbed72079..e62391c6ff 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1700_l2_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1700_l2_cache.yaml
@@ -2,218 +2,6 @@
 Panel Config:
   id: 1700
   title: L2 Cache
-  metrics_description:
-    Utilization: The ratio of the number of cycles an L2 channel was active, summed
-      over all L2 channels on the accelerator over the total L2 cycles.
-    Peak Bandwidth: The number of bytes looked up in the L2 cache, as a percent of
-      the peak theoretical bandwidth achievable on the specific accelerator. The number
-      of bytes is calculated as the number of cache lines requested multiplied by
-      the cache line size. This value does not consider partial requests, so e.g.,
-      if only a single value is requested in a cache line, the data movement will
-      still be counted as a full cache line.
-    Hit Rate: The ratio of the number of L2 cache line requests that hit in the L2
-      cache over the total number of incoming cache line requests to the L2 cache.
-    L2-Fabric Read BW: The number of bytes read by the L2 over the Infinity Fabric
-      interface per unit time.
-    L2-Fabric Write and Atomic BW: The number of bytes sent by the L2 over the Infinity
-      Fabric interface by write and atomic operations per unit time.
-    HBM Bandwidth: Maximum theoretical bandwidth of the accelerator's local high-bandwidth
-      memory (HBM) per unit time. This value is calculated as the number of HBM channels
-      multiplied by the HBM channel width multiplied by the HBM clock frequency.
-    Read BW: The total number of bytes read by the L2 cache from Infinity Fabric divided
-      by total duration.
-    HBM Read Traffic: The percent of read requests generated by the L2 cache that
-      are routed to the accelerator's local high-bandwidth memory (HBM). This breakdown
-      does not consider the size of the request (meaning that 32B and 64B requests
-      are both counted as a single request), so this metric only approximates the
-      percent of the L2-Fabric Read bandwidth directed to the local HBM.
-    Remote Read Traffic: The percent of read requests generated by the L2 cache that
-      are routed to any memory location other than the accelerator's local high-bandwidth
-      memory (HBM) - for example, the CPU's DRAM or a remote accelerator's HBM. This
-      breakdown does not consider the size of the request (meaning that 32B and 64B
-      requests are both counted as a single request), so this metric only approximates
-      the percent of the L2-Fabric Read bandwidth directed to a remote location.
-    Uncached Read Traffic: The percent of read requests generated by the L2 cache
-      that are reading from an uncached memory allocation. Note, as described in the
-      request flow section, a single 64B read request is typically counted as two
-      uncached read requests. So, it is possible for the Uncached Read Traffic to
-      reach up to 200% of the total number of read requests. This breakdown does not
-      consider the size of the request (i.e., 32B and 64B requests are both counted
-      as a single request), so this metric only approximates the percent of the L2-Fabric
-      read bandwidth directed to an uncached memory location.
-    Write and Atomic BW: The total number of bytes written by the L2 over Infinity
-      Fabric by write and atomic operations divided by total duration. Note that on
-      current CDNA accelerators, such as the MI2XX, requests are only considered atomic
-      by Infinity Fabric if they are targeted at non-write-cacheable memory, for example,
-      fine-grained memory allocations or uncached memory allocations on the MI2XX.
-    HBM Write and Atomic Traffic: The percent of write and atomic requests generated
-      by the L2 cache that are routed to the accelerator's local high-bandwidth memory
-      (HBM). This breakdown does not consider the size of the request (meaning that
-      32B and 64B requests are both counted as a single request), so this metric only
-      approximates the percent of the L2-Fabric Write and Atomic bandwidth directed
-      to the local HBM. Note that on current CDNA accelerators, such as the MI2XX,
-      requests are only considered atomic by Infinity Fabric if they are targeted
-      at fine-grained memory allocations or uncached memory allocations.
-    Remote Write and Atomic Traffic: The percent of read requests generated by the
-      L2 cache that are routed to any memory location other than the accelerator's
-      local high-bandwidth memory (HBM) - for example, the CPU's DRAM or a remote
-      accelerator's HBM. This breakdown does not consider the size of the request
-      (meaning that 32B and 64B requests are both counted as a single request), so
-      this metric only approximates the percent of the L2-Fabric Read bandwidth directed
-      to a remote location. Note that on current CDNA accelerators, such as the MI2XX,
-      requests are only considered atomic by Infinity Fabric if they are targeted
-      at fine-grained memory allocations or uncached memory allocations.
-    Atomic Traffic: The percent of write requests generated by the L2 cache that are
-      atomic requests to any memory location. This breakdown does not consider the
-      size of the request (meaning that 32B and 64B requests are both counted as a
-      single request), so this metric only approximates the percent of the L2-Fabric
-      Read bandwidth directed to a remote location. Note that on current CDNA accelerators,
-      such as the MI2XX, requests are only considered atomic by Infinity Fabric if
-      they are targeted at fine-grained memory allocations or uncached memory allocations.
-    Uncached Write and Atomic Traffic: The percent of write and atomic requests generated
-      by the L2 cache that are targeting uncached memory allocations. This breakdown
-      does not consider the size of the request (meaning that 32B and 64B requests
-      are both counted as a single request), so this metric only approximates the
-      percent of the L2-Fabric read bandwidth directed to uncached memory allocations.
-    Read Latency: The time-averaged number of cycles read requests spent in Infinity
-      Fabric before data was returned to the L2.
-    Write and Atomic Latency: The time-averaged number of cycles write requests spent
-      in Infinity Fabric before a completion acknowledgement was returned to the L2.
-    Atomic Latency: The time-averaged number of cycles atomic requests spent in Infinity
-      Fabric before a completion acknowledgement (atomic without return value) or
-      data (atomic with return value) was returned to the L2.
-    Bandwidth: The number of bytes looked up in the L2 cache, divided by total duration.
-      The number of bytes is calculated as the number of cache lines requested multiplied
-      by the cache line size. This value does not consider partial requests, so for
-      example, if only a single value is requested in a cache line, the data movement
-      will still be counted as a full cache line.
-    Read Bandwidth: Total number of bytes looked up in the L2 cache for read requests,
-      divided by total duration.
-    Write Bandwidth: Total number of bytes looked up in the L2 cache for write requests,
-      divided by total duration.
-    Atomic Bandwidth: Total number of bytes looked up in the L2 cache for atomic requests,
-      divided by total duration.
-    Req: The total number of incoming requests to the L2 from all clients for all
-      request types, per normalization unit.
-    Read Req: The total number of read requests to the L2 from all clients.
-    Write Req: The total number of write requests to the L2 from all clients.
-    Atomic Req: The total number of atomic requests (with and without return) to the
-      L2 from all clients.
-    Streaming Req: The total number of incoming requests to the L2 that are marked
-      as streaming. The exact meaning of this may differ depending on the targeted
-      accelerator, however on an MI2XX this corresponds to non-temporal load or stores.
-      The L2 cache attempts to evict streaming requests before normal requests when
-      the L2 is at capacity.
-    Probe Req: The number of coherence probe requests made to the L2 cache from outside
-      the accelerator. On an MI2XX, probe requests may be generated by, for example,
-      writes to fine-grained device memory or by writes to coarse-grained device memory.
-    Cache Hit: The ratio of the number of L2 cache line requests that hit in the L2
-      cache over the total number of incoming cache line requests to the L2 cache.
-    Hits: The total number of requests to the L2 from all clients that hit in the
-      cache. As noted in the Speed-of-Light section, this includes hit-on-miss requests.
-    Misses: The total number of requests to the L2 from all clients that miss in the
-      cache. As noted in the Speed-of-Light section, these do not include hit-on-miss
-      requests.
-    Writeback: The total number of L2 cache lines written back to memory for any reason.
-      Write-backs may occur due to user code (such as HIP kernel calls to _threadfence_system
-      or atomic built-ins) by the command processor's memory acquire/release fences,
-      or for other internal hardware reasons.
-    Writeback (Internal): The total number of L2 cache lines written back to memory
-      for internal hardware reasons, per normalization unit.
-    Writeback (vL1D Req): The total number of L2 cache lines written back to memory
-      due to requests initiated by the vL1D cache, per normalization unit.
-    Evict (Internal): The total number of L2 cache lines evicted from the cache due
-      to capacity limits, per normalization unit.
-    Evict (vL1D Req): The total number of L2 cache lines evicted from the cache due
-      to invalidation requests initiated by the vL1D cache, per normalization unit.
-    NC Req: The total number of requests to the L2 to Not-hardware-Coherent (NC) memory
-      allocations, per normalization unit.
-    UC Req: The total number of requests to the L2 that go to Uncached (UC) memory
-      allocations.
-    CC Req: The total number of requests to the L2 that go to Coherently Cacheable
-      (CC) memory allocations.
-    RW Req: The total number of requests to the L2 that go to Read-Write coherent
-      memory (RW) allocations.
-    Write - Credit Starvation: The number of cycles the L2-Fabric interface was stalled
-      on write or atomic requests to any memory location because too many write/atomic
-      requests were currently in flight, as a percent of the total active L2 cycles.
-    Read (32B): The total number of L2 requests to Infinity Fabric to read 32B of
-      data from any memory location, per normalization unit.
-    Read (64B): The total number of L2 requests to Infinity Fabric to read 64B of
-      data from any memory location, per normalization unit.
-    Read (Uncached): The total number of L2 requests to Infinity Fabric to read uncached
-      data from any memory location, per normalization unit. 64B requests for uncached
-      data are counted as two 32B uncached data requests.
-    HBM Read: The total number of L2 requests to Infinity Fabric to read 32B or 64B
-      of data from the accelerator's local HBM, per normalization unit.
-    Remote Read: The total number of L2 requests to Infinity Fabric to read 32B or
-      64B of data from any source other than the accelerator's local HBM, per normalization
-      unit.
-    Read Bandwidth - PCIe: Total number of bytes due to L2 read requests due to PCIe
-      traffic, divided by total duration.
-    "Read Bandwidth - Infinity Fabric\u2122": Total number of bytes due to L2 read
-      requests due to Infinity Fabric traffic, divided by total duration.
-    Read Bandwidth - HBM: Total number of bytes due to L2 read requests due to HBM
-      traffic, divided by total duration.
-    Write and Atomic (32B): The total number of L2 requests to Infinity Fabric to
-      write or atomically update 32B of data to any memory location, per normalization
-      unit.
-    Write and Atomic (Uncached): The total number of L2 requests to Infinity Fabric
-      to write or atomically update 32B or 64B of uncached data, per normalization
-      unit.
-    Write and Atomic (64B): The total number of L2 requests to Infinity Fabric to
-      write or atomically update 64B of data in any memory location, per normalization
-      unit.
-    HBM Write and Atomic: The total number of L2 requests to Infinity Fabric to write
-      or atomically update 32B or 64B of data in the accelerator's local HBM, per
-      normalization unit.
-    Remote Write and Atomic: The total number of L2 requests to Infinity Fabric to
-      write or atomically update 32B or 64B of data in any memory location other than
-      the accelerator's local HBM, per normalization unit.
-    Write Bandwidth - PCIe: Total number of bytes due to L2 write requests due to
-      PCIe traffic, divided by total duration.
-    "Write Bandwidth - Infinity Fabric\u2122": Total number of bytes due to L2 write
-      requests due to Infinity Fabric traffic, divided by total duration.
-    Write Bandwidth - HBM: Total number of bytes due to L2 write requests due to HBM
-      traffic, divided by total duration.
-    Atomic Bandwidth - PCIe: Total number of bytes due to L2 atomic requests due to
-      PCIe traffic, divided by total duration.
-    "Atomic Bandwidth - Infinity Fabric\u2122": Total number of bytes due to L2 atomic
-      requests due to Infinity Fabric traffic, divided by total duration.
-    Atomic Bandwidth - HBM: Total number of bytes due to L2 atomic requests due to
-      HBM traffic, divided by total duration.
-    Atomic: The total number of L2 requests to Infinity Fabric to atomically update
-      32B or 64B of data in any memory location, per normalization unit. See Request
-      flow for more detail. Note that on current CDNA accelerators, such as the MI2XX,
-      requests are only considered atomic by Infinity Fabric if they are targeted
-      at non-write-cacheable memory, such as fine-grained memory allocations or uncached
-      memory allocations on the MI2XX.
-    Read Stall: "The ratio of the total number of cycles the L2-Fabric interface was\
-      \ stalled on a read request to any destination (local HBM, remote PCIe\xAE connected\
-      \ accelerator or CPU, or remote Infinity Fabric connected accelerator or CPU)\
-      \ over the total active L2 cycles."
-    Write Stall: The ratio of the total number of cycles the L2-Fabric interface was
-      stalled on a write or atomic request to any destination (local HBM, remote accelerator
-      or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected
-      accelerator or CPU) over the total active L2 cycles.
-    Read - PCIe Stall: The number of cycles the L2-Fabric interface was stalled on
-      read requests to remote PCIe connected accelerators or CPUs as a percent of
-      the total active L2 cycles.
-    Read - Infinity Fabric Stall: The number of cycles the L2-Fabric interface was
-      stalled on read requests to remote Infinity Fabric connected accelerators or
-      CPUs as a percent of the total active L2 cycles.
-    Read - HBM Stall: The number of cycles the L2-Fabric interface was stalled on
-      read requests to the accelerator's local HBM as a percent of the total active
-      L2 cycles.
-    Write - PCIe Stall: The number of cycles the L2-Fabric interface was stalled on
-      write or atomic requests to remote PCIe connected accelerators or CPUs as a
-      percent of the total active L2 cycles.
-    Write - Infinity Fabric Stall: The number of cycles the L2-Fabric interface was
-      stalled on write or atomic requests to remote Infinity Fabric connected accelerators
-      or CPUs as a percent of the total active L2 cycles.
-    Write - HBM Stall: The number of cycles the L2-Fabric interface was stalled on
-      write or atomic requests to accelerator's local HBM as a percent of the total
-      active L2 cycles.
   data source:
   - metric_table:
       id: 1701
@@ -370,32 +158,32 @@ Panel Config:
           avg: AVG((TCC_REQ_sum / $denom))
           min: MIN((TCC_REQ_sum / $denom))
           max: MAX((TCC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req:
           avg: AVG((TCC_READ_sum / $denom))
           min: MIN((TCC_READ_sum / $denom))
           max: MAX((TCC_READ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Write Req:
           avg: AVG((TCC_WRITE_sum / $denom))
           min: MIN((TCC_WRITE_sum / $denom))
           max: MAX((TCC_WRITE_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Atomic Req:
           avg: AVG((TCC_ATOMIC_sum / $denom))
           min: MIN((TCC_ATOMIC_sum / $denom))
           max: MAX((TCC_ATOMIC_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Streaming Req:
           avg: AVG((TCC_STREAMING_REQ_sum / $denom))
           min: MIN((TCC_STREAMING_REQ_sum / $denom))
           max: MAX((TCC_STREAMING_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Probe Req:
           avg: AVG((TCC_PROBE_sum / $denom))
           min: MIN((TCC_PROBE_sum / $denom))
           max: MAX((TCC_PROBE_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Cache Hit:
           avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
             + TCC_MISS_sum) != 0) else None))
@@ -408,17 +196,17 @@ Panel Config:
           avg: AVG((TCC_HIT_sum / $denom))
           min: MIN((TCC_HIT_sum / $denom))
           max: MAX((TCC_HIT_sum / $denom))
-          unit: (Hits  + $normUnit)
+          unit: (Hits + $normUnit)
         Misses:
           avg: AVG((TCC_MISS_sum / $denom))
           min: MIN((TCC_MISS_sum / $denom))
           max: MAX((TCC_MISS_sum / $denom))
-          unit: (Misses  + $normUnit)
+          unit: (Misses + $normUnit)
         Writeback:
           avg: AVG((TCC_WRITEBACK_sum / $denom))
           min: MIN((TCC_WRITEBACK_sum / $denom))
           max: MAX((TCC_WRITEBACK_sum / $denom))
-          unit: (Cachelines  + $normUnit)
+          unit: (Cachelines + $normUnit)
         Writeback (Internal):
           avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom))
           min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom))
@@ -443,22 +231,22 @@ Panel Config:
           avg: AVG((TCC_NC_REQ_sum / $denom))
           min: MIN((TCC_NC_REQ_sum / $denom))
           max: MAX((TCC_NC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         UC Req:
           avg: AVG((TCC_UC_REQ_sum / $denom))
           min: MIN((TCC_UC_REQ_sum / $denom))
           max: MAX((TCC_UC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         CC Req:
           avg: AVG((TCC_CC_REQ_sum / $denom))
           min: MIN((TCC_CC_REQ_sum / $denom))
           max: MAX((TCC_CC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         RW Req:
           avg: AVG((TCC_RW_REQ_sum / $denom))
           min: MIN((TCC_RW_REQ_sum / $denom))
           max: MAX((TCC_RW_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
   - metric_table:
       id: 1704
       title: L2 Cache Stalls
@@ -507,54 +295,216 @@ Panel Config:
           avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom))
           min: MIN((TCC_EA0_RDREQ_32B_sum / $denom))
           max: MAX((TCC_EA0_RDREQ_32B_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read (64B):
           avg: AVG(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
           min: MIN(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
           max: MAX(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read (Uncached):
           avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
           min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
           max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         HBM Read:
           avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom))
           min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom))
           max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Remote Read:
           avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
           min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
           max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Write and Atomic (32B):
           avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
           min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
           max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Write and Atomic (Uncached):
           avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
           min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
           max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Write and Atomic (64B):
           avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom))
           min: MIN((TCC_EA0_WRREQ_64B_sum / $denom))
           max: MAX((TCC_EA0_WRREQ_64B_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         HBM Write and Atomic:
           avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom))
           min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom))
           max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Remote Write and Atomic:
           avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
           min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
           max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Atomic:
           avg: AVG((TCC_EA0_ATOMIC_sum / $denom))
           min: MIN((TCC_EA0_ATOMIC_sum / $denom))
           max: MAX((TCC_EA0_ATOMIC_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
+  metrics_description:
+    Utilization: The ratio of the number of cycles an L2 channel was active, summed
+      over all L2 channels on the accelerator over the total L2 cycles.
+    Peak Bandwidth: The number of bytes looked up in the L2 cache, as a percent of
+      the peak theoretical bandwidth achievable on the specific accelerator. The number
+      of bytes is calculated as the number of cache lines requested multiplied by
+      the cache line size. This value does not consider partial requests, so e.g.,
+      if only a single value is requested in a cache line, the data movement will
+      still be counted as a full cache line.
+    Hit Rate: The ratio of the number of L2 cache line requests that hit in the L2
+      cache over the total number of incoming cache line requests to the L2 cache.
+    L2-Fabric Read BW: The number of bytes read by the L2 over the Infinity Fabric
+      interface per unit time.
+    L2-Fabric Write and Atomic BW: The number of bytes sent by the L2 over the Infinity
+      Fabric interface by write and atomic operations per unit time.
+    HBM Bandwidth: Maximum theoretical bandwidth of the accelerator's local high-bandwidth
+      memory (HBM) per unit time. This value is calculated as the number of HBM channels
+      multiplied by the HBM channel width multiplied by the HBM clock frequency.
+    Read BW: The total number of bytes read by the L2 cache from Infinity Fabric divided
+      by total duration.
+    HBM Read Traffic: The percent of read requests generated by the L2 cache that
+      are routed to the accelerator's local high-bandwidth memory (HBM). This breakdown
+      does not consider the size of the request (meaning that 32B and 64B requests
+      are both counted as a single request), so this metric only approximates the
+      percent of the L2-Fabric Read bandwidth directed to the local HBM.
+    Remote Read Traffic: The percent of read requests generated by the L2 cache that
+      are routed to any memory location other than the accelerator's local high-bandwidth
+      memory (HBM) - for example, the CPU's DRAM or a remote accelerator's HBM. This
+      breakdown does not consider the size of the request (meaning that 32B and 64B
+      requests are both counted as a single request), so this metric only approximates
+      the percent of the L2-Fabric Read bandwidth directed to a remote location.
+    Uncached Read Traffic: The percent of read requests generated by the L2 cache
+      that are reading from an uncached memory allocation. Note, as described in the
+      request flow section, a single 64B read request is typically counted as two
+      uncached read requests. So, it is possible for the Uncached Read Traffic to
+      reach up to 200% of the total number of read requests. This breakdown does not
+      consider the size of the request (i.e., 32B and 64B requests are both counted
+      as a single request), so this metric only approximates the percent of the L2-Fabric
+      read bandwidth directed to an uncached memory location.
+    Write and Atomic BW: The total number of bytes written by the L2 over Infinity
+      Fabric by write and atomic operations divided by total duration. Note that on
+      current CDNA accelerators, such as the MI2XX, requests are only considered atomic
+      by Infinity Fabric if they are targeted at non-write-cacheable memory, for example,
+      fine-grained memory allocations or uncached memory allocations on the MI2XX.
+    HBM Write and Atomic Traffic: The percent of write and atomic requests generated
+      by the L2 cache that are routed to the accelerator's local high-bandwidth memory
+      (HBM). This breakdown does not consider the size of the request (meaning that
+      32B and 64B requests are both counted as a single request), so this metric only
+      approximates the percent of the L2-Fabric Write and Atomic bandwidth directed
+      to the local HBM. Note that on current CDNA accelerators, such as the MI2XX,
+      requests are only considered atomic by Infinity Fabric if they are targeted
+      at fine-grained memory allocations or uncached memory allocations.
+    Remote Write and Atomic Traffic: The percent of write and atomic requests generated
+      by the L2 cache that are routed to any memory location other than the accelerator's
+      local high-bandwidth memory (HBM) - for example, the CPU's DRAM or a remote
+      accelerator's HBM. This breakdown does not consider the size of the request
+      (meaning that 32B and 64B requests are both counted as a single request), so
+      this metric only approximates the percent of the L2-Fabric Write and Atomic bandwidth
+      directed to a remote location. Note that on current CDNA accelerators, such as the MI2XX,
+      requests are only considered atomic by Infinity Fabric if they are targeted
+      at fine-grained memory allocations or uncached memory allocations.
+    Atomic Traffic: The percent of write requests generated by the L2 cache that are
+      atomic requests to any memory location. This breakdown does not consider the
+      size of the request (meaning that 32B and 64B requests are both counted as a
+      single request), so this metric only approximates the percent of the L2-Fabric
+      Write and Atomic bandwidth used by atomic requests. Note that on current CDNA accelerators,
+      such as the MI2XX, requests are only considered atomic by Infinity Fabric if
+      they are targeted at fine-grained memory allocations or uncached memory allocations.
+    Uncached Write and Atomic Traffic: The percent of write and atomic requests generated
+      by the L2 cache that are targeting uncached memory allocations. This breakdown
+      does not consider the size of the request (meaning that 32B and 64B requests
+      are both counted as a single request), so this metric only approximates the
+      percent of the L2-Fabric Write and Atomic bandwidth directed to uncached memory allocations.
+    Read Latency: The time-averaged number of cycles read requests spent in Infinity
+      Fabric before data was returned to the L2.
+    Write and Atomic Latency: The time-averaged number of cycles write requests spent
+      in Infinity Fabric before a completion acknowledgement was returned to the L2.
+    Atomic Latency: The time-averaged number of cycles atomic requests spent in Infinity
+      Fabric before a completion acknowledgement (atomic without return value) or
+      data (atomic with return value) was returned to the L2.
+    Bandwidth: The number of bytes looked up in the L2 cache, divided by total duration.
+      The number of bytes is calculated as the number of cache lines requested multiplied
+      by the cache line size. This value does not consider partial requests, so for
+      example, if only a single value is requested in a cache line, the data movement
+      will still be counted as a full cache line.
+    Req: The total number of incoming requests to the L2 from all clients for all
+      request types, per normalization unit.
+    Read Req: The total number of read requests to the L2 from all clients.
+    Write Req: The total number of write requests to the L2 from all clients.
+    Atomic Req: The total number of atomic requests (with and without return) to the
+      L2 from all clients.
+    Streaming Req: The total number of incoming requests to the L2 that are marked
+      as streaming. The exact meaning of this may differ depending on the targeted
+      accelerator, however on an MI2XX this corresponds to non-temporal loads or stores.
+      The L2 cache attempts to evict streaming requests before normal requests when
+      the L2 is at capacity.
+    Probe Req: The number of coherence probe requests made to the L2 cache from outside
+      the accelerator. On an MI2XX, probe requests may be generated by, for example,
+      writes to fine-grained device memory or by writes to coarse-grained device memory.
+    Cache Hit: The ratio of the number of L2 cache line requests that hit in the L2
+      cache over the total number of incoming cache line requests to the L2 cache.
+    Hits: The total number of requests to the L2 from all clients that hit in the
+      cache. As noted in the Speed-of-Light section, this includes hit-on-miss requests.
+    Misses: The total number of requests to the L2 from all clients that miss in the
+      cache. As noted in the Speed-of-Light section, these do not include hit-on-miss
+      requests.
+    Writeback: The total number of L2 cache lines written back to memory for any reason.
+      Write-backs may occur due to user code (such as HIP kernel calls to __threadfence_system
+      or atomic built-ins), by the command processor's memory acquire/release fences,
+      or for other internal hardware reasons.
+    Writeback (Internal): The total number of L2 cache lines written back to memory
+      for internal hardware reasons, per normalization unit.
+    Writeback (vL1D Req): The total number of L2 cache lines written back to memory
+      due to requests initiated by the vL1D cache, per normalization unit.
+    Evict (Internal): The total number of L2 cache lines evicted from the cache due
+      to capacity limits, per normalization unit.
+    Evict (vL1D Req): The total number of L2 cache lines evicted from the cache due
+      to invalidation requests initiated by the vL1D cache, per normalization unit.
+    NC Req: The total number of requests to the L2 to Not-hardware-Coherent (NC) memory
+      allocations, per normalization unit.
+    UC Req: The total number of requests to the L2 that go to Uncached (UC) memory
+      allocations.
+    CC Req: The total number of requests to the L2 that go to Coherently Cacheable
+      (CC) memory allocations.
+    RW Req: The total number of requests to the L2 that go to Read-Write coherent
+      memory (RW) allocations.
+    Write - Credit Starvation: The number of cycles the L2-Fabric interface was stalled
+      on write or atomic requests to any memory location because too many write/atomic
+      requests were currently in flight, as a percent of the total active L2 cycles.
+    Read (32B): The total number of L2 requests to Infinity Fabric to read 32B of
+      data from any memory location, per normalization unit.
+    Read (64B): The total number of L2 requests to Infinity Fabric to read 64B of
+      data from any memory location, per normalization unit.
+    Read (Uncached): The total number of L2 requests to Infinity Fabric to read uncached
+      data from any memory location, per normalization unit. 64B requests for uncached
+      data are counted as two 32B uncached data requests.
+    HBM Read: The total number of L2 requests to Infinity Fabric to read 32B or 64B
+      of data from the accelerator's local HBM, per normalization unit.
+    Remote Read: The total number of L2 requests to Infinity Fabric to read 32B or
+      64B of data from any source other than the accelerator's local HBM, per normalization
+      unit.
+    Write and Atomic (32B): The total number of L2 requests to Infinity Fabric to
+      write or atomically update 32B of data to any memory location, per normalization
+      unit.
+    Write and Atomic (Uncached): The total number of L2 requests to Infinity Fabric
+      to write or atomically update 32B or 64B of uncached data, per normalization
+      unit.
+    Write and Atomic (64B): The total number of L2 requests to Infinity Fabric to
+      write or atomically update 64B of data in any memory location, per normalization
+      unit.
+    HBM Write and Atomic: The total number of L2 requests to Infinity Fabric to write
+      or atomically update 32B or 64B of data in the accelerator's local HBM, per
+      normalization unit.
+    Remote Write and Atomic: The total number of L2 requests to Infinity Fabric to
+      write or atomically update 32B or 64B of data in any memory location other than
+      the accelerator's local HBM, per normalization unit.
+    Atomic: The total number of L2 requests to Infinity Fabric to atomically update
+      32B or 64B of data in any memory location, per normalization unit. See Request
+      flow for more detail. Note that on current CDNA accelerators, such as the MI2XX,
+      requests are only considered atomic by Infinity Fabric if they are targeted
+      at non-write-cacheable memory, such as fine-grained memory allocations or uncached
+      memory allocations on the MI2XX.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1800_l2_cache_per_channel.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1800_l2_cache_per_channel.yaml
index 849662871e..75ce281b57 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1800_l2_cache_per_channel.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1800_l2_cache_per_channel.yaml
@@ -2,10 +2,6 @@
 Panel Config:
   id: 1800
   title: L2 Cache (per Channel)
-  metrics_description:
-    L2 Cache Hit Rate: The percent of total number of requests to the L2 from all
-      clients that hit in the cache. As noted in the Speed-of-Light section, this
-      includes hit-on-miss requests.
   data source:
   - metric_table:
       id: 1801
@@ -249,3 +245,7 @@ Panel Config:
           ::_1: $total_l2_chan
       cli_style: simple_box
       tui_style: simple_box
+  metrics_description:
+    L2 Cache Hit Rate: The percent of total number of requests to the L2 from all
+      clients that hit in the cache. As noted in the Speed-of-Light section, this
+      includes hit-on-miss requests.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/2100_pc_sampling.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/2100_pc_sampling.yaml
index e94471d7dc..16e4d01e7e 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/2100_pc_sampling.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/2100_pc_sampling.yaml
@@ -2,10 +2,10 @@
 Panel Config:
   id: 2100
   title: PC Sampling
-  metrics_description: {}
   data source:
   - pc_sampling_table:
       id: 2101
       title: PC Sampling
       source: ps_file
       comparable: false
+  metrics_description: {}
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/config_delta/gfx950_diff.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/config_delta/gfx950_diff.yaml
new file mode 100644
index 0000000000..d4c0cb307a
--- /dev/null
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/config_delta/gfx950_diff.yaml
@@ -0,0 +1,755 @@
+# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated by tools/config_management/generate_config_deltas.py
+Addition:
+  - Panel Config:
+      id: 200
+      title: System Speed-of-Light
+    metric_tables:
+      - metric_table:
+          id: 201
+          title: System Speed-of-Light
+          metrics:
+            - MFMA FLOPs (F6F4):
+                value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp)))
+                unit: GFLOP/s
+                peak: ((($max_sclk * $cu_per_gpu) * 16834) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 16834) / 1000))
+  - Panel Config:
+      id: 300
+      title: Memory Chart
+    metric_tables:
+      - metric_table:
+          id: 301
+          title: Memory Chart
+          metrics:
+            - L2 Wr Lat:
+                value: |
+                  ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else None)), 0)
+            - L2 Rd Lat:
+                value: |
+                  ROUND(AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else None)), 0)
+  - Panel Config:
+      id: 400
+      title: Roofline
+    metric_tables:
+      - metric_table:
+          id: 401
+          title: Roofline Performance Rates
+          metrics:
+            - MFMA FLOPs (F6F4):
+                value: |
+                  AVG((((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
+                unit: GFLOP/s
+                peak: $MFMA_FLOPs_F6F4_empirical_peak
+  - Panel Config:
+      id: 500
+      title: Command Processor (CPC/CPF)
+    metric_tables:
+      - metric_table:
+          id: 502
+          title: Command processor packet processor (CPC)
+          metrics:
+            - CPC SYNC FIFO Full Rate:
+                avg: |
+                  AVG((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY != 0) else None)
+                min: |
+                  MIN((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY != 0) else None)
+                max: |
+                  MAX((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY != 0) else None)
+                unit: pct
+            - CPC CANE Stall Rate:
+                avg: AVG((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None)
+                min: MIN((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None)
+                max: MAX((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None)
+                unit: pct
+            - CPC ADC Utilization:
+                avg: AVG((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None)
+                min: MIN((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None)
+                max: MAX((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None)
+                unit: pct
+  - Panel Config:
+      id: 600
+      title: Workgroup Manager (SPI)
+    metric_tables:
+      - metric_table:
+          id: 601
+          title: Workgroup manager utilizations
+          metrics:
+            - Scheduler-Pipe Wave Utilization:
+                avg: |
+                  AVG(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
+                min: |
+                  MIN(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
+                max: |
+                  MAX(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
+                unit: Pct
+            - Schedule-Pipe Wave Occupancy:
+                avg: |
+                  AVG(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY + SPI_CSQ_P3_OCCUPANCY)
+                min: |
+                  MIN(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY + SPI_CSQ_P3_OCCUPANCY)
+                max: |
+                  MAX(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY + SPI_CSQ_P3_OCCUPANCY)
+                unit: Wave
+      - metric_table:
+          id: 602
+          title: Workgroup Manager - Resource Allocation
+          metrics:
+            - Scheduler-Pipe FIFO Full Rate:
+                avg: |
+                  AVG((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + SPI_CS2_CRAWLER_STALL + SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
+                min: |
+                  MIN((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + SPI_CS2_CRAWLER_STALL + SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
+                max: |
+                  MAX((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + SPI_CS2_CRAWLER_STALL + SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
+                unit: Pct
+  - Panel Config:
+      id: 1000
+      title: Compute Units - Instruction Mix
+    metric_tables:
+      - metric_table:
+          id: 1003
+          title: VMEM Instruction Mix
+          metrics:
+            - Spill/Stack Coalesceable Instr:
+                avg: AVG((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom))
+                min: MIN((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom))
+                max: MAX((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom))
+                unit: (instr + $normUnit)
+      - metric_table:
+          id: 1004
+          title: MFMA Arithmetic Instruction Mix
+          metrics:
+            - MFMA-F6F4:
+                avg: AVG((SQ_INSTS_VALU_MFMA_F6F4 / $denom))
+                min: MIN((SQ_INSTS_VALU_MFMA_F6F4 / $denom))
+                max: MAX((SQ_INSTS_VALU_MFMA_F6F4 / $denom))
+                unit: (instr + $normUnit)
+  - Panel Config:
+      id: 1100
+      title: Compute Units - Compute Pipeline
+    metric_tables:
+      - metric_table:
+          id: 1101
+          title: Compute Speed-of-Light
+          metrics:
+            - MFMA FLOPs (F6F4):
+                value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp)))
+                unit: GFLOP
+                peak: ((($max_sclk * $cu_per_gpu) * 16834) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 16834) / 1000))
+      - metric_table:
+          id: 1102
+          title: Pipeline Statistics
+          metrics:
+            - VALU Co-Issue Efficiency:
+                avg: AVG((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2))
+                min: MIN((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2))
+                max: MAX((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2))
+                unit: pct
+      - metric_table:
+          id: 1103
+          title: Arithmetic Operations
+          metrics:
+            - F6F4 OPs:
+                avg: AVG((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom)
+                min: MIN((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom)
+                max: MAX((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom)
+                unit: (OPs + $normUnit)
+  - Panel Config:
+      id: 1200
+      title: Local Data Share (LDS)
+    metric_tables:
+      - metric_table:
+          id: 1202
+          title: LDS Statistics
+          metrics:
+            - LDS STORE:
+                avg: AVG((SQ_INSTS_LDS_STORE / $denom))
+                min: MIN((SQ_INSTS_LDS_STORE / $denom))
+                max: MAX((SQ_INSTS_LDS_STORE / $denom))
+                unit: (instr + $normUnit)
+            - LDS LOAD Bandwidth:
+                avg: AVG(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                min: MIN(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                max: MAX(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                units: Gbps
+            - LDS ATOMIC:
+                avg: AVG((SQ_INSTS_LDS_ATOMIC / $denom))
+                min: MIN((SQ_INSTS_LDS_ATOMIC / $denom))
+                max: MAX((SQ_INSTS_LDS_ATOMIC / $denom))
+                unit: (instr + $normUnit)
+            - LDS STORE Bandwidth:
+                avg: AVG(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                min: MIN(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                max: MAX(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                units: Gbps
+            - LDS Command FIFO Full Rate:
+                avg: AVG((SQ_LDS_CMD_FIFO_FULL / $denom))
+                min: MIN((SQ_LDS_CMD_FIFO_FULL / $denom))
+                max: MAX((SQ_LDS_CMD_FIFO_FULL / $denom))
+                unit: (Cycles + $normUnit)
+            - LDS LOAD:
+                avg: AVG((SQ_INSTS_LDS_LOAD / $denom))
+                min: MIN((SQ_INSTS_LDS_LOAD / $denom))
+                max: MAX((SQ_INSTS_LDS_LOAD / $denom))
+                unit: (instr + $normUnit)
+            - LDS ATOMIC Bandwidth:
+                avg: AVG(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                min: MIN(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                max: MAX(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                units: Gbps
+            - LDS Data FIFO Full Rate:
+                avg: AVG((SQ_LDS_DATA_FIFO_FULL / $denom))
+                min: MIN((SQ_LDS_DATA_FIFO_FULL / $denom))
+                max: MAX((SQ_LDS_DATA_FIFO_FULL / $denom))
+                unit: (Cycles + $normUnit)
+  - Panel Config:
+      id: 1500
+      title: Address Processing Unit and Data Return Path (TA/TD)
+    metric_tables:
+      - metric_table:
+          id: 1504
+          title: Vector L1 data-return path or Texture Data (TD)
+          metrics:
+            - Write Ack Instructions:
+                avg: AVG((TD_WRITE_ACKT_WAVEFRONT_sum / $denom))
+                min: MIN((TD_WRITE_ACKT_WAVEFRONT_sum / $denom))
+                max: MAX((TD_WRITE_ACKT_WAVEFRONT_sum / $denom))
+                unit: (Instructions + $normUnit)
+      - metric_table:
+          id: 1502
+          title: Instruction counts
+          metrics:
+            - Spill/Stack Read Instructions for LDS:
+                avg: AVG((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom))
+                min: MIN((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom))
+                max: MAX((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom))
+                unit: (Instructions + $normUnit)
+            - Global/Generic Read Instructions for LDS:
+                avg: AVG((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom))
+                min: MIN((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom))
+                max: MAX((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom))
+                unit: (Instructions + $normUnit)
+  - Panel Config:
+      id: 1600
+      title: Vector L1 Data Cache
+    metric_tables:
+      - metric_table:
+          id: 1602
+          title: vL1D cache stall metrics
+          metrics:
+            - Stalled on Address:
+                expr: |
+                  (((100 * TCP_TCP_TA_ADDR_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None)
+            - Stalled on Read Return:
+                expr: |
+                  (((100 * TCP_TCR_RDRET_STALL_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None)
+            - Stalled on Request FIFO:
+                expr: |
+                  (((100 * TCP_RFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None)
+            - Stalled on Data:
+                expr: |
+                  (((100 * TCP_TCP_TA_DATA_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None)
+            - Stalled on Latency FIFO:
+                expr: |
+                  (((100 * TCP_LFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None)
+      - metric_table:
+          id: 1603
+          title: vL1D cache access metrics
+          metrics:
+            - Tag RAM 3 Req:
+                avg: AVG((TCP_TAGRAM3_REQ_sum / $denom))
+                min: MIN((TCP_TAGRAM3_REQ_sum / $denom))
+                max: MAX((TCP_TAGRAM3_REQ_sum / $denom))
+                unit: (Req + $normUnit)
+            - L1-L2 Read Latency:
+                avg: AVG((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
+                min: MIN((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
+                max: MAX((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
+                unit: (Cycles + $normUnit)
+            - Tag RAM 2 Req:
+                avg: AVG((TCP_TAGRAM2_REQ_sum / $denom))
+                min: MIN((TCP_TAGRAM2_REQ_sum / $denom))
+                max: MAX((TCP_TAGRAM2_REQ_sum / $denom))
+                unit: (Req + $normUnit)
+            - Tag RAM 0 Req:
+                avg: AVG((TCP_TAGRAM0_REQ_sum / $denom))
+                min: MIN((TCP_TAGRAM0_REQ_sum / $denom))
+                max: MAX((TCP_TAGRAM0_REQ_sum / $denom))
+                unit: (Req + $normUnit)
+            - L1-L2 Write Latency:
+                avg: AVG((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
+                min: MIN((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
+                max: MAX((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
+                unit: (Cycles + $normUnit)
+            - L1 Access Latency:
+                avg: AVG((TCP_TCP_LATENCY_sum / $denom))
+                min: MIN((TCP_TCP_LATENCY_sum / $denom))
+                max: MAX((TCP_TCP_LATENCY_sum / $denom))
+                unit: (Cycles + $normUnit)
+            - Tag RAM 1 Req:
+                avg: AVG((TCP_TAGRAM1_REQ_sum / $denom))
+                min: MIN((TCP_TAGRAM1_REQ_sum / $denom))
+                max: MAX((TCP_TAGRAM1_REQ_sum / $denom))
+                unit: (Req + $normUnit)
+      - metric_table:
+          id: 1605
+          title: L1 Unified Translation Cache (UTCL1)
+          metrics:
+            - Misses under Translation Miss:
+                avg: AVG((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom))
+                min: MIN((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom))
+                max: MAX((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom))
+                units: (Req + $normUnit)
+            - Inflight Req:
+                avg: AVG((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom))
+                min: MIN((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom))
+                max: MAX((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom))
+                units: (Req + $normUnit)
+      - metric_table:
+          id: 1606
+          title: L1D Addr Translation Stalls
+          metrics:
+            - Latency FIFO Stall:
+                avg: AVG((TCP_UTCL1_LFIFO_FULL_sum / $denom))
+                min: MIN((TCP_UTCL1_LFIFO_FULL_sum / $denom))
+                max: MAX((TCP_UTCL1_LFIFO_FULL_sum / $denom))
+                units: (Cycles + $normUnit)
+            - Serialization Stall:
+                avg: AVG((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom))
+                min: MIN((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom))
+                max: MAX((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom))
+                units: (Cycles + $normUnit)
+            - Cache Full Stall:
+                avg: AVG((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom))
+                min: MIN((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom))
+                max: MAX((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom))
+                units: (Cycles + $normUnit)
+            - UTCL2 Stall:
+                avg: AVG((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom))
+                min: MIN((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom))
+                max: MAX((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom))
+                units: (Cycles + $normUnit)
+            - Cache Miss Stall:
+                avg: AVG((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom))
+                min: MIN((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom))
+                max: MAX((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom))
+                units: (Cycles + $normUnit)
+            - Resident Page Full Stall:
+                avg: AVG((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom))
+                min: MIN((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom))
+                max: MAX((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom))
+                units: (Cycles + $normUnit)
+            - Thrashing Stall:
+                avg: AVG((TCP_UTCL1_THRASHING_STALL_sum / $denom))
+                min: MIN((TCP_UTCL1_THRASHING_STALL_sum / $denom))
+                max: MAX((TCP_UTCL1_THRASHING_STALL_sum / $denom))
+                units: (Cycles + $normUnit)
+  - Panel Config:
+      id: 1700
+      title: L2 Cache
+    metric_tables:
+      - metric_table:
+          id: 1702
+          title: L2-Fabric interface metrics
+          metrics:
+            - Write Stall:
+                avg: |
+                  AVG(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+            - Read Stall:
+                avg: |
+                  AVG((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+      - metric_table:
+          id: 1703
+          title: L2 Cache Accesses
+          metrics:
+            - Input Buffer Req:
+                avg: AVG((TCC_IB_REQ_sum / $denom))
+                min: MIN((TCC_IB_REQ_sum / $denom))
+                max: MAX((TCC_IB_REQ_sum / $denom))
+                unit: (Req + $normUnit)
+            - Bypasss Req:
+                avg: AVG((TCC_BYPASS_REQ_sum / $denom))
+                min: MIN((TCC_BYPASS_REQ_sum / $denom))
+                max: MAX((TCC_BYPASS_REQ_sum / $denom))
+                unit: (Req + $normUnit)
+            - Atomic Bandwidth:
+                avg: AVG(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Write Bandwidth:
+                avg: AVG(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Read Bandwidth:
+                avg: AVG(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+      - metric_table:
+          id: 1704
+          title: L2 Cache Stalls
+          metrics:
+            - Input Buffer Stalled on L2:
+                avg: AVG(TCC_IB_STALL_sum / $denom)
+                min: MIN(TCC_IB_STALL_sum / $denom)
+                max: MAX(TCC_IB_STALL_sum / $denom)
+                unit: (Cycles + $normUnit)
+            - Stalled on Latency FIFO:
+                avg: AVG(TCC_LATENCY_FIFO_FULL_sum / $denom)
+                min: MIN(TCC_LATENCY_FIFO_FULL_sum / $denom)
+                max: MAX(TCC_LATENCY_FIFO_FULL_sum / $denom)
+                unit: (Cycles + $normUnit)
+            - Stalled on Write Data FIFO:
+                avg: AVG(TCC_SRC_FIFO_FULL_sum / $denom)
+                min: MIN(TCC_SRC_FIFO_FULL_sum / $denom)
+                max: MAX(TCC_SRC_FIFO_FULL_sum / $denom)
+                unit: (Cycles + $normUnit)
+      - metric_table:
+          id: 1705
+          title: L2 - Fabric Interface stalls
+          metrics:
+            - Write - HBM Stall:
+                type: HBM Stall
+                transaction: Write
+                avg: |
+                  AVG(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+            - Read - HBM Stall:
+                type: HBM Stall
+                transaction: Read
+                avg: |
+                  AVG(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+            - Write - PCIe Stall:
+                type: PCIe Stall
+                transaction: Write
+                avg: |
+                  AVG(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+            - Write - Infinity Fabric Stall:
+                type: Infinity Fabric™ Stall
+                transaction: Write
+                avg: |
+                  AVG(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+            - Read - Infinity Fabric Stall:
+                type: Infinity Fabric™ Stall
+                transaction: Read
+                avg: |
+                  AVG(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+            - Read - PCIe Stall:
+                type: PCIe Stall
+                transaction: Read
+                avg: |
+                  AVG(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+      - metric_table:
+          id: 1706
+          title: L2 - Fabric interface detailed metrics
+          metrics:
+            - Read Bandwidth - PCIe:
+                avg: AVG(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Write Bandwidth - Infinity Fabric™:
+                avg: AVG(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Atomic Bandwidth - HBM:
+                avg: AVG(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Atomic - HBM:
+                avg: AVG((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom))
+                min: MIN((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom))
+                max: MAX((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom))
+                unit: (Req + $normUnit)
+            - Read Bandwidth - HBM:
+                avg: AVG(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Atomic Bandwidth - Infinity Fabric™:
+                avg: AVG(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Write Bandwidth - HBM:
+                avg: AVG(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Atomic Bandwidth - PCIe:
+                avg: AVG(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Read (128B):
+                avg: AVG((TCC_EA0_RDREQ_128B_sum / $denom))
+                min: MIN((TCC_EA0_RDREQ_128B_sum / $denom))
+                max: MAX((TCC_EA0_RDREQ_128B_sum / $denom))
+                unit: (Req + $normUnit)
+            - Read Bandwidth - Infinity Fabric™:
+                avg: AVG(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Write Bandwidth - PCIe:
+                avg: AVG(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+
+Deletion:
+  []
+
+Modification:
+  - Panel Config:
+      id: 200
+      title: System Speed-of-Light
+    metric_tables:
+      - metric_table:
+          id: 201
+          title: System Speed-of-Light
+          metrics:
+            - MFMA FLOPs (F8):
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
+                peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
+            - MFMA FLOPs (F64):
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000))
+                peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000)
+            - MFMA IOPs (Int8):
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
+                peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
+            - MFMA FLOPs (F16):
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
+                peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
+            - MFMA FLOPs (BF16):
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
+                peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
+  - Panel Config:
+      id: 300
+      title: Memory Chart
+    metric_tables:
+      - metric_table:
+          id: 301
+          title: Memory Chart
+          metrics:
+            - Wavefronts:
+                value: ROUND(AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE), 0)
+            - Workgroups:
+                value: |
+                  ROUND(AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS), 0)
+  - Panel Config:
+      id: 400
+      title: Roofline
+    metric_tables:
+      - metric_table:
+          id: 402
+          title: Roofline Plot Points
+          metrics:
+            - Performance (GFLOPs):
+                value: |
+                  ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / (SUM(End_Timestamp - Start_Timestamp) / 1e9) ) / 1e9
+            - AI L2:
+                value: |
+                  ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64 ) )
+            - AI L1:
+                value: |
+                  ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64) )
+            - AI HBM:
+                value: |
+                  ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) )
+  - Panel Config:
+      id: 600
+      title: Workgroup Manager (SPI)
+    metric_tables:
+      - metric_table:
+          id: 601
+          title: Workgroup manager utilizations
+          metrics:
+            - SGPR Writes:
+                max: |
+                  MAX((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+                min: |
+                  MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+                avg: |
+                  AVG((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+            - Dispatched Wavefronts:
+                max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
+                min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
+                avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
+            - Dispatched Workgroups:
+                max: |
+                  MAX(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
+                min: |
+                  MIN(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
+                avg: |
+                  AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
+            - Scheduler-Pipe Utilization:
+                max: |
+                  MAX(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
+                min: |
+                  MIN(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
+                avg: |
+                  AVG(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
+            - VGPR Writes:
+                max: |
+                  MAX((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+                min: |
+                  MIN((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+                avg: |
+                  AVG((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+  - Panel Config:
+      id: 700
+      title: Wavefront
+    metric_tables:
+      - metric_table:
+          id: 701
+          title: Wavefront Launch Stats
+          metrics:
+            - Total Wavefronts:
+                max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
+                min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
+                avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
+  - Panel Config:
+      id: 1100
+      title: Compute Units - Compute Pipeline
+    metric_tables:
+      - metric_table:
+          id: 1101
+          title: Compute Speed-of-Light
+          metrics:
+            - MFMA FLOPs (F16):
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
+                peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
+            - MFMA FLOPs (F64):
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000))
+                peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000)
+            - MFMA IOPs (INT8):
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
+                peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
+            - MFMA FLOPs (BF16):
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
+                peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
+            - MFMA FLOPs (F8):
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
+                peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
+      - metric_table:
+          id: 1103
+          title: Arithmetic Operations
+          metrics:
+            - FLOPs (Total):
+                max: |
+                  MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom))
+                min: |
+                  MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom))
+                avg: |
+                  AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom))
+  - Panel Config:
+      id: 1700
+      title: L2 Cache
+    metric_tables:
+      - metric_table:
+          id: 1701
+          title: L2 Speed-of-Light
+          metrics:
+            - L2-Fabric Read BW:
+                value: |
+                  AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
+      - metric_table:
+          id: 1702
+          title: L2-Fabric interface metrics
+          metrics:
+            - Read BW:
+                max: |
+                  MAX((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
+                min: |
+                  MIN((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
+                avg: |
+                  AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
+      - metric_table:
+          id: 1706
+          title: L2 - Fabric interface detailed metrics
+          metrics:
+            - Read (64B):
+                max: MAX((TCC_EA0_RDREQ_64B_sum / $denom))
+                min: MIN((TCC_EA0_RDREQ_64B_sum / $denom))
+                avg: AVG((TCC_EA0_RDREQ_64B_sum / $denom))
+            - HBM Write and Atomic:
+                max: MAX((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
+                min: MIN((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
+                avg: AVG((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
+  - Panel Config:
+      id: 1800
+      title: L2 Cache (per Channel)
+    metric_tables:
+      - metric_table:
+          id: 1809
+          title: L2-Fabric Read Stall (Cycles per normUnit)
+          metrics:
+            - ::_1:
+                ea read stall - pcie: AVG((TO_INT(TCC_EA0_RDREQ_IO_CREDIT_STALL[::_1]) / $denom))
+                ea read stall - hbm: AVG((TO_INT(TCC_EA0_RDREQ_DRAM_CREDIT_STALL[::_1]) / $denom))
+                ea read stall - if: AVG((TO_INT(TCC_EA0_RDREQ_GMI_CREDIT_STALL[::_1]) / $denom))
+      - metric_table:
+          id: 1810
+          title: L2-Fabric Write and Atomic Stall (Cycles per normUnit)
+          metrics:
+            - ::_1:
+                ea write stall - hbm: AVG((TO_INT(TCC_EA0_WRREQ_DRAM_CREDIT_STALL[::_1]) / $denom))
+                ea write stall - pcie: AVG((TO_INT(TCC_EA0_WRREQ_IO_CREDIT_STALL[::_1]) / $denom))
+                ea write stall - if: AVG((TO_INT(TCC_EA0_WRREQ_GMI_CREDIT_STALL[::_1]) / $denom))
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0000_top_stats.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0000_top_stats.yaml
index 55c6f6bb24..5ce5aeeb28 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0000_top_stats.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0000_top_stats.yaml
@@ -2,7 +2,6 @@
 Panel Config:
   id: 0
   title: Top Stats
-  metrics_description: {}
   data source:
   - raw_csv_table:
       id: 1
@@ -12,3 +11,4 @@ Panel Config:
       id: 2
       title: Dispatch List
       source: pmc_dispatch_info.csv
+  metrics_description: {}
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0100_system_info.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0100_system_info.yaml
index 23d024fde3..8b48c2253b 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0100_system_info.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0100_system_info.yaml
@@ -2,10 +2,10 @@
 Panel Config:
   id: 100
   title: System Info
-  metrics_description: {}
   data source:
   - raw_csv_table:
       id: 101
       title: System Info
       source: sysinfo.csv
       columnwise: true
+  metrics_description: {}
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0200_system_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0200_system_speed_of_light.yaml
index e8aa26a3e1..7943f891b1 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0200_system_speed_of_light.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0200_system_speed_of_light.yaml
@@ -2,124 +2,6 @@
 Panel Config:
   id: 200
   title: System Speed-of-Light
-  metrics_description:
-    VALU FLOPs: 'The total floating-point operations executed per second on the VALU.
-      This is also presented as a percent of the peak theoretical FLOPs achievable
-      on the specific accelerator. Note: this does not include any floating-point
-      operations from MFMA instructions.'
-    VALU IOPs: 'The total integer operations executed per second on the VALU. This
-      is also presented as a percent of the peak theoretical IOPs achievable on the
-      specific accelerator. Note: this does not include any integer operations from
-      MFMA instructions.'
-    MFMA FLOPs (F8): The total number of 8-bit brain floating point MFMA operations
-      executed per second. This does not include any 16-bit brain floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F8 MFMA operations achievable on the specific accelerator. It is supported on
-      AMD Instinct MI300 series and later only.
-    MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations
-      executed per second. Note: this does not include any 16-bit brain floating point
-      operations from VALU instructions. This is also presented as a percent of the
-      peak theoretical BF16 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed
-      per second. Note: this does not include any 16-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F16 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed
-      per second. Note: this does not include any 32-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F32 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed
-      per second. Note: this does not include any 64-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F64 MFMA operations achievable on the specific accelerator.'
-    MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed
-      per second. Note: this does not include any 8-bit integer operations from VALU
-      instructions. This is also presented as a percent of the peak theoretical INT8
-      MFMA operations achievable on the specific accelerator.'
-    Active CUs: Total number of active compute units (CUs) on the accelerator during
-      the kernel execution.
-    SALU Utilization: Indicates what percent of the kernel's duration the SALU was
-      busy executing instructions. Computed as the ratio of the total number of cycles
-      spent by the scheduler issuing SALU or SMEM instructions over the total CU cycles.
-    VALU Utilization: Indicates what percent of the kernel's duration the VALU was
-      busy executing instructions. Does not include VMEM operations. Computed as the
-      ratio of the total number of cycles spent by the scheduler issuing VALU instructions
-      over the total CU cycles.
-    MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit
-      was busy executing instructions. Computed as the ratio of the total number of
-      cycles the MFMA was busy over the total CU cycles.
-    VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit
-      was busy executing instructions, including both global/generic and spill/scratch
-      operations (see the VMEM instruction count metrics) for more detail). Does not
-      include VALU operations. Computed as the ratio of the total number of cycles
-      spent by the scheduler issuing VMEM instructions over the total CU cycles.
-    Branch Utilization: Indicates what percent of the kernel's duration the branch
-      unit was busy executing instructions. Computed as the ratio of the total number
-      of cycles spent by the scheduler issuing branch instructions over the total
-      CU cycles
-    VALU Active Threads: Indicates the average level of divergence within a wavefront
-      over the lifetime of the kernel. The number of work-items that were active in
-      a wavefront during execution of each VALU instruction, time-averaged over all
-      VALU instructions run on all wavefronts in the kernel.
-    IPC: The ratio of the total number of instructions executed on the CU over the
-      total active CU cycles. This is also presented as a percent of the peak theoretical
-      bandwidth achievable on the specific accelerator.
-    Wavefront Occupancy: 'The time-averaged number of wavefronts resident on the accelerator
-      over the lifetime of the kernel. Note: this metric may be inaccurate for short-running
-      kernels (less than 1ms). This is also presented as a percent of the peak theoretical
-      occupancy achievable on the specific accelerator.'
-    Theoretical LDS Bandwidth: Indicates the maximum amount of bytes that could have
-      been loaded from, stored to, or atomically updated in the LDS per unit time
-      (see LDS Bandwidth example for more detail). This is also presented as a percent
-      of the peak theoretical F64 MFMA operations achievable on the specific accelerator.
-    LDS Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS
-      scheduler due to bank conflicts (as determined by the conflict resolution hardware)
-      to the base number of cycles that would be spent in the LDS scheduler in a completely
-      uncontended case. This is also presented in normalized form (i.e., the Bank
-      Conflict Rate).
-    vL1D Cache Hit Rate: The ratio of the number of vL1D cache line requests that
-      hit in vL1D cache over the total number of cache line requests to the vL1D cache
-      RAM.
-    vL1D Cache BW: The number of bytes looked up in the vL1D cache as a result of
-      VMEM instructions per unit time. The number of bytes is calculated as the number
-      of cache lines requested multiplied by the cache line size. This value does
-      not consider partial requests, so e.g., if only a single value is requested
-      in a cache line, the data movement will still be counted as a full cache line.
-      This is also presented as a percent of the peak theoretical bandwidth achievable
-      on the specific accelerator.
-    L2 Cache Hit Rate: The ratio of the number of L2 cache line requests that hit
-      in the L2 cache over the total number of incoming cache line requests to the
-      L2 cache.
-    L2 Cache BW: The number of bytes looked up in the L2 cache per unit time. The
-      number of bytes is calculated as the number of cache lines requested multiplied
-      by the cache line size. This value does not consider partial requests, so e.g.,
-      if only a single value is requested in a cache line, the data movement will
-      still be counted as a full cache line. This is also presented as a percent of
-      the peak theoretical bandwidth achievable on the specific accelerator.
-    L2-Fabric Read BW: "The number of bytes read by the L2 over the Infinity Fabric\u2122\
-      \ interface per unit time. This is also presented as a percent of the peak theoretical\
-      \ bandwidth achievable on the specific accelerator."
-    L2-Fabric Write BW: The number of bytes sent by the L2 over the Infinity Fabric
-      interface by write and atomic operations per unit time. This is also presented
-      as a percent of the peak theoretical bandwidth achievable on the specific accelerator.
-    L2-Fabric Read Latency: The time-averaged number of cycles read requests spent
-      in Infinity Fabric before data was returned to the L2.
-    L2-Fabric Write Latency: The time-averaged number of cycles write requests spent
-      in Infinity Fabric before a completion acknowledgement was returned to the L2.
-    sL1D Cache Hit Rate: The percent of sL1D requests that hit on a previously loaded
-      line the cache. Calculated as the ratio of the number of sL1D requests that
-      hit over the number of all sL1D requests.
-    sL1D Cache BW: The number of bytes looked up in the sL1D cache per unit time.
-      This is also presented as a percent of the peak theoretical bandwidth achievable
-      on the specific accelerator.
-    L1I Hit Rate: The number of bytes looked up in the L1I cache per unit time. This
-      is also presented as a percent of the peak theoretical bandwidth achievable
-      on the specific accelerator.
-    L1I BW: The percent of L1I requests that hit on a previously loaded line the cache.
-      Calculated as the ratio of the number of L1I requests that hit over the number
-      of all L1I requests.
-    L1I Fetch Latency: The average number of cycles spent to fetch instructions to
-      a CU.
   data source:
   - metric_table:
       id: 201
@@ -344,3 +226,130 @@ Panel Config:
           peak: None
           pop: None
           coll_level: SQ_IFETCH_LEVEL
+  metrics_description:
+    VALU FLOPs: |-
+      The total floating-point operations executed per second on the VALU.
+      This is also presented as a percent of the peak theoretical FLOPs achievable
+      on the specific accelerator. Note: this does not include any floating-point
+      operations from MFMA instructions.
+    VALU IOPs: |-
+      The total integer operations executed per second on the VALU. This is
+      also presented as a percent of the peak theoretical IOPs achievable on the
+      specific accelerator. Note: this does not include any integer operations from
+      MFMA instructions.
+    MFMA FLOPs (F8): The total number of 8-bit floating point MFMA operations
+      executed per second. This does not include any 8-bit floating point operations
+      from VALU instructions. This is also presented as a percent of the peak theoretical
+      F8 MFMA operations achievable on the specific accelerator. It is supported on
+      AMD Instinct MI300 series and later only.
+    MFMA FLOPs (BF16): |-
+      The total number of 16-bit brain floating point MFMA operations executed
+      per second. Note: this does not include any 16-bit brain floating point operations
+      from VALU instructions. This is also presented as a percent of the peak theoretical
+      BF16 MFMA operations achievable on the specific accelerator.
+    MFMA FLOPs (F16): |-
+      The total number of 16-bit floating point MFMA operations executed per
+      second. Note: this does not include any 16-bit floating point operations from
+      VALU instructions. This is also presented as a percent of the peak theoretical
+      F16 MFMA operations achievable on the specific accelerator.
+    MFMA FLOPs (F32): |-
+      The total number of 32-bit floating point MFMA operations executed per
+      second. Note: this does not include any 32-bit floating point operations from
+      VALU instructions. This is also presented as a percent of the peak theoretical
+      F32 MFMA operations achievable on the specific accelerator.
+    MFMA FLOPs (F64): |-
+      The total number of 64-bit floating point MFMA operations executed per
+      second. Note: this does not include any 64-bit floating point operations from
+      VALU instructions. This is also presented as a percent of the peak theoretical
+      F64 MFMA operations achievable on the specific accelerator.
+    MFMA IOPs (Int8): |-
+      The total number of 8-bit integer MFMA operations executed per second.
+      Note: this does not include any 8-bit integer operations from VALU instructions.
+      This is also presented as a percent of the peak theoretical INT8 MFMA operations
+      achievable on the specific accelerator.
+    Active CUs: Total number of active compute units (CUs) on the accelerator during
+      the kernel execution.
+    SALU Utilization: Indicates what percent of the kernel's duration the SALU was
+      busy executing instructions. Computed as the ratio of the total number of cycles
+      spent by the scheduler issuing SALU or SMEM instructions over the total CU cycles.
+    VALU Utilization: Indicates what percent of the kernel's duration the VALU was
+      busy executing instructions. Does not include VMEM operations. Computed as the
+      ratio of the total number of cycles spent by the scheduler issuing VALU instructions
+      over the total CU cycles.
+    MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit
+      was busy executing instructions. Computed as the ratio of the total number of
+      cycles the MFMA was busy over the total CU cycles.
+    VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit
+      was busy executing instructions, including both global/generic and spill/scratch
+      operations (see the VMEM instruction count metrics for more detail). Does not
+      include VALU operations. Computed as the ratio of the total number of cycles
+      spent by the scheduler issuing VMEM instructions over the total CU cycles.
+    Branch Utilization: Indicates what percent of the kernel's duration the branch
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles spent by the scheduler issuing branch instructions over the total
+      CU cycles.
+    VALU Active Threads: Indicates the average level of divergence within a wavefront
+      over the lifetime of the kernel. The number of work-items that were active in
+      a wavefront during execution of each VALU instruction, time-averaged over all
+      VALU instructions run on all wavefronts in the kernel.
+    IPC: The ratio of the total number of instructions executed on the CU over the
+      total active CU cycles. This is also presented as a percent of the peak theoretical
+      bandwidth achievable on the specific accelerator.
+    Wavefront Occupancy: |-
+      The time-averaged number of wavefronts resident on the accelerator over
+      the lifetime of the kernel. Note: this metric may be inaccurate for short-running
+      kernels (less than 1ms). This is also presented as a percent of the peak theoretical
+      occupancy achievable on the specific accelerator.
+    Theoretical LDS Bandwidth: Indicates the maximum amount of bytes that could have
+      been loaded from, stored to, or atomically updated in the LDS per unit time
+      (see LDS Bandwidth example for more detail). This is also presented as a percent
+      of the peak theoretical bandwidth achievable on the specific accelerator.
+    LDS Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS
+      scheduler due to bank conflicts (as determined by the conflict resolution hardware)
+      to the base number of cycles that would be spent in the LDS scheduler in a completely
+      uncontended case. This is also presented in normalized form (i.e., the Bank
+      Conflict Rate).
+    vL1D Cache Hit Rate: The ratio of the number of vL1D cache line requests that
+      hit in vL1D cache over the total number of cache line requests to the vL1D cache
+      RAM.
+    vL1D Cache BW: The number of bytes looked up in the vL1D cache as a result of
+      VMEM instructions per unit time. The number of bytes is calculated as the number
+      of cache lines requested multiplied by the cache line size. This value does
+      not consider partial requests, so e.g., if only a single value is requested
+      in a cache line, the data movement will still be counted as a full cache line.
+      This is also presented as a percent of the peak theoretical bandwidth achievable
+      on the specific accelerator.
+    L2 Cache Hit Rate: The ratio of the number of L2 cache line requests that hit
+      in the L2 cache over the total number of incoming cache line requests to the
+      L2 cache.
+    L2 Cache BW: The number of bytes looked up in the L2 cache per unit time. The
+      number of bytes is calculated as the number of cache lines requested multiplied
+      by the cache line size. This value does not consider partial requests, so e.g.,
+      if only a single value is requested in a cache line, the data movement will
+      still be counted as a full cache line. This is also presented as a percent of
+      the peak theoretical bandwidth achievable on the specific accelerator.
+    L2-Fabric Read BW: |-
+      The number of bytes read by the L2 over the Infinity Fabric™ interface
+      per unit time. This is also presented as a percent of the peak theoretical
+      bandwidth achievable on the specific accelerator.
+    L2-Fabric Write BW: The number of bytes sent by the L2 over the Infinity Fabric
+      interface by write and atomic operations per unit time. This is also presented
+      as a percent of the peak theoretical bandwidth achievable on the specific accelerator.
+    L2-Fabric Read Latency: The time-averaged number of cycles read requests spent
+      in Infinity Fabric before data was returned to the L2.
+    L2-Fabric Write Latency: The time-averaged number of cycles write requests spent
+      in Infinity Fabric before a completion acknowledgement was returned to the L2.
+    sL1D Cache Hit Rate: The percent of sL1D requests that hit on a previously loaded
+      line in the cache. Calculated as the ratio of the number of sL1D requests that
+      hit over the number of all sL1D requests.
+    sL1D Cache BW: The number of bytes looked up in the sL1D cache per unit time.
+      This is also presented as a percent of the peak theoretical bandwidth achievable
+      on the specific accelerator.
+    L1I Hit Rate: The percent of L1I requests that hit on a previously loaded line
+      in the cache. Calculated as the ratio of the number of L1I requests that hit
+      over the number of all L1I requests.
+    L1I BW: The number of bytes looked up in the L1I cache per unit time. This is
+      also presented as a percent of the peak theoretical bandwidth achievable on
+      the specific accelerator.
+    L1I Fetch Latency: The average number of cycles spent to fetch instructions to
+      a CU.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0300_memory_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0300_memory_chart.yaml
index 03b5606ad7..b13053c1f7 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0300_memory_chart.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0300_memory_chart.yaml
@@ -2,122 +2,6 @@
 Panel Config:
   id: 300
   title: Memory Chart
-  metrics_description:
-    Wavefront Occupancy: Wavefronts per active CU.
-    Wave Life: Average number of cycles executing a wave.
-    SALU: Total Number of SALU (Scalar ALU) instructions issued per normalization
-      unit.
-    SMEM: Total number of SMEM (Scalar Memory Read) instructions issued normalization
-      unit.
-    VALU: The number of VALU (Vector ALU) instructions issued per normalization unit.
-    MFMA: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued per
-      normalization unit.
-    VMEM: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch
-      memory) per normalization unit.
-    LDS: The total number of LDS instructions (including, but not limited to, read/write/atomics
-      and HIP's __shfl instructions) executed per normalization unit.
-    GWS: Total number of GDS (global data sync) instructions issued per normalization
-      unit.
-    BR: Total number of BRANCH instructions issued per normalization unit.
-    Active CUs: Total number of active compute units (CUs) on the accelerator during
-      the kernel execution.
-    Num CUs: Total number of compute units (CUs) on the accelerator.
-    VGPR: 'The number of architected vector general-purpose registers allocated for
-      the kernel, see VALU. Note: this may not exactly match the number of VGPRs requested
-      by the compiler due to allocation granularity.'
-    SGPR: 'The number of scalar general-purpose registers allocated for the kernel,
-      see SALU. Note: this may not exactly match the number of SGPRs requested by
-      the compiler due to allocation granularity.'
-    LDS Allocation: 'The number of bytes of LDS memory (or, shared memory) allocated
-      for this kernel. Note: This may also be larger than what was requested at compile
-      time due to both allocation granularity and dynamic per-dispatch LDS allocations.'
-    Scratch Allocation: The number of bytes of scratch memory requested per work-item
-      for this kernel. Scratch memory is used for stack memory on the accelerator,
-      as well as for register spills and restores.
-    Wavefronts: The total number of wavefronts, summed over all workgroups, forming
-      this kernel launch.
-    Workgroups: The total number of workgroups forming this kernel launch.
-    LDS Req: The total number of LDS instructions (including, but not limited to,
-      read/write/atomics and HIP's __shfl instructions) executed per normalization
-      unit.
-    LDS Util: Indicates what percent of the kernel's duration the LDS was actively
-      executing instructions (including, but not limited to, load, store, atomic and
-      HIP's __shfl operations). Calculated as the ratio of the total number of cycles
-      LDS was active over the total CU cycles.
-    LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return
-      / acknowledgment) required for an LDS instruction to complete.
-    VL1 Rd: The total number of incoming read requests from the address processing
-      unit after coalescing per normalization unit
-    VL1 Wr: The total number of incoming write requests from the address processing
-      unit after coalescing per normalization unit
-    VL1 Atomic: The total number of incoming atomic requests from the address processing
-      unit after coalescing per normalization unit
-    VL1 Hit: The ratio of the number of vL1D cache line requests that hit in vL1D
-      cache over the total number of cache line requests to the vL1D Cache RAM.
-    VL1 Lat: Calculated as the average number of cycles that a vL1D cache line request
-      spent in the vL1D cache pipeline.
-    VL1 Coalesce: Indicates how well memory instructions were coalesced by the address
-      processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated
-      as the average number of thread-requests generated per instruction divided by
-      the ideal number of thread-requests per instruction.
-    VL1 Stall: The ratio of the number of cycles where the vL1D is stalled waiting
-      to issue a request for data to the L2 cache divided by the number of cycles
-      where the vL1D is active.
-    VL1_L2 Rd: The number of read requests for a vL1D cache line that were not satisfied
-      by the vL1D and must be retrieved from the to the L2 Cache per normalization
-      unit.
-    VL1_L2 Wr: The number of write requests to a vL1D cache line that were sent through
-      the vL1D to the L2 cache, per normalization unit.
-    VL1_L2 Atomic: The number of atomic requests that are sent through the vL1D to
-      the L2 cache, per normalization unit. This includes requests for atomics with,
-      and without return.
-    sL1D Rd: The total number of requests, of any size or type, made to the sL1D per
-      normalization unit.
-    sL1D Hit: The total number of sL1D requests that hit on a previously loaded cache
-      line, per normalization unit.
-    sL1D_L2 Rd: The total number of read requests from sL1D to the L2, per normalization
-      unit.
-    sL1D_L2 Wr: The total number of write requests from sL1D to the L2, per normalization
-      unit. Typically unused on current CDNA accelerators.
-    sL1D_L2 Atomic: The total number of atomic requests from sL1D to the L2, per normalization
-      unit. Typically unused on current CDNA accelerators.
-    IL1 Fetch: The total number of requests made to the L1I per normalization-unit.
-    IL1 Hit: The percent of L1I requests that hit on a previously loaded line the
-      cache. Calculated as the ratio of the number of L1I requests that hit over the
-      number of all L1I requests.
-    IL1 Lat: The average number of cycles spent to fetch instructions to a CU.
-    IL1_L2 Rd: The total number of requests across the L1I - L2 interface per normalization-unit.
-    L2 Rd: The total number of read requests to the L2 from all clients.
-    L2 Wr: The total number of write requests to the L2 from all clients.
-    L2 Atomic: The total number of atomic requests (with and without return) to the
-      L2 from all clients.
-    L2 Hit: The ratio of the number of L2 cache line requests that hit in the L2 cache
-      over the total number of incoming cache line requests to the L2 cache.
-    L2 Rd Lat: Calculated as the average number of cycles that the vL1D cache took
-      to issue and receive read requests from the L2 Cache. This number also includes
-      requests for atomics with return values.
-    L2 Wr Lat: Calculated as the average number of cycles that the vL1D cache took
-      to issue and receive acknowledgement of a write request to the L2 Cache. This
-      number also includes requests for atomics without return values.
-    Fabric_L2 Rd: Number of L2 cache - Infinity Fabric read requests (either 32-byte
-      or 64-byte) summed over TCC instances per normalization unit.
-    Fabric_L2 Wr: Number of L2 cache - Infinity Fabric write requests (either 32-byte
-      or 64-byte) summed over TCC instances per normalization unit.
-    Fabric_L2 Atomic: Number of L2 cache - Infinity Fabric write requests (either
-      32-byte or 64-byte) that are actually atomic requests summed over TCC instances
-      per normalization unit.
-    Fabric Rd Lat: The time-averaged number of cycles read requests spent in Infinity
-      Fabric before data was returned to the L2.
-    Fabric Wr Lat: The time-averaged number of cycles write requests spent in Infinity
-      Fabric before a completion acknowledgement was returned to the L2.
-    Fabric Atomic Lat: The time-averaged number of cycles atomic requests spent in
-      Infinity Fabric before a completion acknowledgement (atomic without return value)
-      or data (atomic with return value) was returned to the L2.
-    HBM Rd: The total number of L2 requests to Infinity Fabric to read 32B or 64B
-      of data from the accelerator's local HBM, per normalization unit.
-    HBM Wr: 'The total number of L2 requests to Infinity Fabric to write or atomically
-      update 32B or 64B of data in the accelerator''s local HBM, per normalization
-      unit. '
   data source:
   - metric_table:
       id: 301
@@ -244,13 +128,13 @@ Panel Config:
           value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0)
         Fabric Rd Lat:
           value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-            != 0) else  0)), 0)
+            != 0) else 0)), 0)
         Fabric Wr Lat:
           value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-            != 0) else  0)), 0)
+            != 0) else 0)), 0)
         Fabric Atomic Lat:
           value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
-            != 0) else  0)), 0)
+            != 0) else 0)), 0)
         HBM Rd:
           value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0)
         HBM Wr:
@@ -258,3 +142,117 @@ Panel Config:
       comparable: false
       cli_style: mem_chart
       tui_style: mem_chart
+  metrics_description:
+    Wavefront Occupancy: Wavefronts per active CU.
+    Wave Life: Average number of cycles executing a wave.
+    SALU: Total Number of SALU (Scalar ALU) instructions issued per normalization
+      unit.
+    SMEM: Total number of SMEM (Scalar Memory Read) instructions issued per normalization
+      unit.
+    VALU: The number of VALU (Vector ALU) instructions issued per normalization unit.
+    MFMA: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued per
+      normalization unit.
+    VMEM: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch
+      memory) per normalization unit.
+    LDS: The total number of LDS instructions (including, but not limited to, read/write/atomics
+      and HIP's __shfl instructions) executed per normalization unit.
+    GWS: Total number of GDS (global data sync) instructions issued per normalization
+      unit.
+    BR: Total number of BRANCH instructions issued per normalization unit.
+    Active CUs: Total number of active compute units (CUs) on the accelerator during
+      the kernel execution.
+    Num CUs: Total number of compute units (CUs) on the accelerator.
+    VGPR: |-
+      The number of architected vector general-purpose registers allocated
+      for the kernel, see VALU. Note: this may not exactly match the number of VGPRs
+      requested by the compiler due to allocation granularity.
+    SGPR: |-
+      The number of scalar general-purpose registers allocated for the kernel,
+      see SALU. Note: this may not exactly match the number of SGPRs requested by
+      the compiler due to allocation granularity.
+    LDS Allocation: |-
+      The number of bytes of LDS memory (or, shared memory) allocated for
+      this kernel. Note: This may also be larger than what was requested at compile
+      time due to both allocation granularity and dynamic per-dispatch LDS allocations.
+    Scratch Allocation: The number of bytes of scratch memory requested per work-item
+      for this kernel. Scratch memory is used for stack memory on the accelerator,
+      as well as for register spills and restores.
+    Wavefronts: The total number of wavefronts, summed over all workgroups, forming
+      this kernel launch.
+    Workgroups: The total number of workgroups forming this kernel launch.
+    LDS Req: The total number of LDS instructions (including, but not limited to,
+      read/write/atomics and HIP's __shfl instructions) executed per normalization
+      unit.
+    LDS Util: Indicates what percent of the kernel's duration the LDS was actively
+      executing instructions (including, but not limited to, load, store, atomic and
+      HIP's __shfl operations). Calculated as the ratio of the total number of cycles
+      LDS was active over the total CU cycles.
+    LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return
+      / acknowledgment) required for an LDS instruction to complete.
+    VL1 Rd: The total number of incoming read requests from the address processing
+      unit after coalescing per normalization unit.
+    VL1 Wr: The total number of incoming write requests from the address processing
+      unit after coalescing per normalization unit.
+    VL1 Atomic: The total number of incoming atomic requests from the address processing
+      unit after coalescing per normalization unit.
+    VL1 Hit: The ratio of the number of vL1D cache line requests that hit in vL1D
+      cache over the total number of cache line requests to the vL1D Cache RAM.
+    VL1 Lat: Calculated as the average number of cycles that a vL1D cache line request
+      spent in the vL1D cache pipeline.
+    VL1 Coalesce: Indicates how well memory instructions were coalesced by the address
+      processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated
+      as the average number of thread-requests generated per instruction divided by
+      the ideal number of thread-requests per instruction.
+    VL1 Stall: The ratio of the number of cycles where the vL1D is stalled waiting
+      to issue a request for data to the L2 cache divided by the number of cycles
+      where the vL1D is active.
+    VL1_L2 Rd: The number of read requests for a vL1D cache line that were not satisfied
+      by the vL1D and must be retrieved from the L2 Cache per normalization
+      unit.
+    VL1_L2 Wr: The number of write requests to a vL1D cache line that were sent through
+      the vL1D to the L2 cache, per normalization unit.
+    VL1_L2 Atomic: The number of atomic requests that are sent through the vL1D to
+      the L2 cache, per normalization unit. This includes requests for atomics with,
+      and without return.
+    sL1D Rd: The total number of requests, of any size or type, made to the sL1D per
+      normalization unit.
+    sL1D Hit: The total number of sL1D requests that hit on a previously loaded cache
+      line, per normalization unit.
+    sL1D_L2 Rd: The total number of read requests from sL1D to the L2, per normalization
+      unit.
+    sL1D_L2 Wr: The total number of write requests from sL1D to the L2, per normalization
+      unit. Typically unused on current CDNA accelerators.
+    sL1D_L2 Atomic: The total number of atomic requests from sL1D to the L2, per normalization
+      unit. Typically unused on current CDNA accelerators.
+    IL1 Fetch: The total number of requests made to the L1I per normalization-unit.
+    IL1 Hit: The percent of L1I requests that hit on a previously loaded line in the
+      cache. Calculated as the ratio of the number of L1I requests that hit over the
+      number of all L1I requests.
+    IL1 Lat: The average number of cycles spent to fetch instructions to a CU.
+    IL1_L2 Rd: The total number of requests across the L1I - L2 interface per normalization-unit.
+    L2 Rd: The total number of read requests to the L2 from all clients.
+    L2 Wr: The total number of write requests to the L2 from all clients.
+    L2 Atomic: The total number of atomic requests (with and without return) to the
+      L2 from all clients.
+    L2 Hit: The ratio of the number of L2 cache line requests that hit in the L2 cache
+      over the total number of incoming cache line requests to the L2 cache.
+    Fabric_L2 Rd: Number of L2 cache - Infinity Fabric read requests (either 32-byte
+      or 64-byte) summed over TCC instances per normalization unit.
+    Fabric_L2 Wr: Number of L2 cache - Infinity Fabric write requests (either 32-byte
+      or 64-byte) summed over TCC instances per normalization unit.
+    Fabric_L2 Atomic: Number of L2 cache - Infinity Fabric write requests (either
+      32-byte or 64-byte) that are actually atomic requests summed over TCC instances
+      per normalization unit.
+    Fabric Rd Lat: The time-averaged number of cycles read requests spent in Infinity
+      Fabric before data was returned to the L2.
+    Fabric Wr Lat: The time-averaged number of cycles write requests spent in Infinity
+      Fabric before a completion acknowledgement was returned to the L2.
+    Fabric Atomic Lat: The time-averaged number of cycles atomic requests spent in
+      Infinity Fabric before a completion acknowledgement (atomic without return value)
+      or data (atomic with return value) was returned to the L2.
+    HBM Rd: The total number of L2 requests to Infinity Fabric to read 32B or 64B
+      of data from the accelerator's local HBM, per normalization unit.
+    HBM Wr: |-
+      The total number of L2 requests to Infinity Fabric to write or atomically
+      update 32B or 64B of data in the accelerator's local HBM, per normalization
+      unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline.yaml
index d95178b92b..536938f700 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline.yaml
@@ -2,85 +2,6 @@
 Panel Config:
   id: 400
   title: Roofline
-  metrics_description:
-    VALU FLOPs (F16): 'The total 16-bit floating-point operations executed per second
-      on the VALU. This is presented with the value of the peak empirical F16 FLOPs
-      achievable on the specific accelerator. Note: this does not include any F16
-      operations from MFMA instructions.'
-    VALU FLOPs (F32): 'The total 32-bit floating-point operations executed per second
-      on the VALU. This is presented with the value of the peak empirical F32 FLOPs
-      achievable on the specific accelerator. Note: this does not include any F32
-      operations from MFMA instructions.'
-    VALU FLOPs (F64): 'The total 64-bit floating-point operations executed per second
-      on the VALU. This is presented with the value of the peak empirical F64 FLOPs
-      achievable on the specific accelerator. Note: this does not include any F64
-      operations from MFMA instructions.'
-    MFMA FLOPs (F8): The total number of 8-bit brain floating point MFMA operations
-      executed per second. This does not include any 16-bit brain floating point operations
-      from VALU instructions. The peak empirically measured F8 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison. It is supported
-      on AMD Instinct MI300 series and later only.
-    MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations
-      executed per second. Note: this does not include any 16-bit brain floating point
-      operations from VALU instructions. The peak empirically measured BF16 MFMA operations
-      achievable on the specific accelerator is displayed alongside for comparison.'
-    MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed
-      per second. Note: this does not include any 16-bit floating point operations
-      from VALU instructions. The peak empirically measured F16 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison.'
-    MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed
-      per second. Note: this does not include any 32-bit floating point operations
-      from VALU instructions. The peak empirically measured F32 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison.'
-    MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed
-      per second. Note: this does not include any 64-bit floating point operations
-      from VALU instructions. The peak empirically measured F64 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison.'
-    MFMA FLOPs (F6F4): 'The total number of 4-bit and 6-bit floating point MFMA operations
-      executed per second. Note: this does not include any floating point operations
-      from VALU instructions. The peak empirically measured F6F4 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison. It is supported
-      on AMD Instinct MI350 series (gfx950) and later only.'
-    MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed
-      per second. Note: this does not include any 8-bit integer operations from VALU
-      instructions. The peak empirically measured INT8 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison.'
-    HBM Bandwidth: The total number of bytes read from and written to High-Bandwidth
-      Memory (HBM) per second. The peak empirically measured bandwidth achievable
-      on the specific accelerator is displayed alongside for comparison.
-    L2 Cache Bandwidth: The number of bytes looked up in the L2 cache per unit time.
-      The number of bytes is calculated as the number of cache lines requested multiplied
-      by the cache line size. This value does not consider partial requests, so e.g.,
-      if only a single value is requested in a cache line, the data movement will
-      still be counted as a full cache line. The peak empirically measured bandwidth
-      achievable on the specific accelerator is displayed alongside for comparison.
-    L1 Cache Bandwidth: The number of bytes looked up in the vL1D cache as a result
-      of VMEM instructions per unit time. The number of bytes is calculated as the
-      number of cache lines requested multiplied by the cache line size. This value
-      does not consider partial requests, so e.g., if only a single value is requested
-      in a cache line, the data movement will still be counted as a full cache line.
-      The peak empirically measured bandwidth achievable on the specific accelerator
-      is displayed alongside for comparison.
-    LDS Bandwidth: Indicates the maximum amount of bytes that could have been loaded
-      from, stored to, or atomically updated in the LDS per unit time (see LDS Bandwidth
-      example for more detail). The peak empirically measured LDS bandwidth achievable
-      on the specific accelerator is displayed alongside for comparison.
-    AI L1: The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
-      of total floating-point operations (FLOPs) to total bytes transferred between
-      the L1 cache and the processing units. This value is used as the x-coordinate
-      for the L1 roofline.
-    AI L2: The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
-      of total floating-point operations (FLOPs) to total bytes transferred between
-      the L2 cache and the L1 cache. This value is used as the x-coordinate for the
-      L2 roofline.
-    AI HBM: The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
-      It is the ratio of total floating-point operations (FLOPs) to total bytes transferred
-      between HBM and the L2 cache. This value is used as the x-coordinate for the
-      HBM roofline.
-    Performance (GFLOPs): The overall achieved performance, measured in GigaFLOPs
-      per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
-      operations divided by the total execution time. This value is used as the y-coordinate
-      for the kernel's point on the Roofline plot.
   data source:
   - metric_table:
       id: 401
@@ -218,3 +139,91 @@ Panel Config:
             512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8
             * 512) ) / (SUM(End_Timestamp - Start_Timestamp) / 1e9) ) / 1e9
           unit: GFLOP/s
+  metrics_description:
+    VALU FLOPs (F16): |-
+      The total 16-bit floating-point operations executed per second on the VALU.
+      This is presented with the value of the peak empirical F16 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F16 operations
+      from MFMA instructions.
+    VALU FLOPs (F32): |-
+      The total 32-bit floating-point operations executed per second on the VALU.
+      This is presented with the value of the peak empirical F32 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F32 operations
+      from MFMA instructions.
+    VALU FLOPs (F64): |-
+      The total 64-bit floating-point operations executed per second on the VALU.
+      This is presented with the value of the peak empirical F64 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F64 operations
+      from MFMA instructions.
+    MFMA FLOPs (F8): The total number of 8-bit floating point MFMA operations
+      executed per second. This does not include any 8-bit floating point operations
+      from VALU instructions. The peak empirically measured F8 MFMA operations achievable
+      on the specific accelerator is displayed alongside for comparison. It is supported
+      on AMD Instinct MI300 series and later only.
+    MFMA FLOPs (BF16): |-
+      The total number of 16-bit brain floating point MFMA operations executed
+      per second. Note: this does not include any 16-bit brain floating point
+      operations from VALU instructions. The peak empirically measured BF16 MFMA
+      operations achievable on the specific accelerator is displayed alongside
+      for comparison.
+    MFMA FLOPs (F16): |-
+      The total number of 16-bit floating point MFMA operations executed per
+      second. Note: this does not include any 16-bit floating point operations from
+      VALU instructions. The peak empirically measured F16 MFMA operations
+      achievable on the specific accelerator is displayed alongside for comparison.
+    MFMA FLOPs (F32): |-
+      The total number of 32-bit floating point MFMA operations executed per
+      second. Note: this does not include any 32-bit floating point operations from
+      VALU instructions. The peak empirically measured F32 MFMA operations
+      achievable on the specific accelerator is displayed alongside for comparison.
+    MFMA FLOPs (F64): |-
+      The total number of 64-bit floating point MFMA operations executed per
+      second. Note: this does not include any 64-bit floating point operations from
+      VALU instructions. The peak empirically measured F64 MFMA operations
+      achievable on the specific accelerator is displayed alongside for comparison.
+    MFMA IOPs (Int8): |-
+      The total number of 8-bit integer MFMA operations executed per second.
+      Note: this does not include any 8-bit integer operations from VALU instructions.
+      The peak empirically measured INT8 MFMA operations achievable on the specific
+      accelerator is displayed alongside for comparison.
+    HBM Bandwidth: |-
+      The total number of bytes read from and written to High-Bandwidth
+      Memory (HBM) per second. The peak empirically measured bandwidth achievable
+      on the specific accelerator is displayed alongside for comparison.
+    L2 Cache Bandwidth: The number of bytes looked up in the L2 cache per unit time.
+      The number of bytes is calculated as the number of cache lines requested multiplied
+      by the cache line size. This value does not consider partial requests, so e.g.,
+      if only a single value is requested in a cache line, the data movement will
+      still be counted as a full cache line. The peak empirically measured bandwidth
+      achievable on the specific accelerator is displayed alongside for comparison.
+    L1 Cache Bandwidth: The number of bytes looked up in the vL1D cache as a result
+      of VMEM instructions per unit time. The number of bytes is calculated as the
+      number of cache lines requested multiplied by the cache line size. This value
+      does not consider partial requests, so e.g., if only a single value is requested
+      in a cache line, the data movement will still be counted as a full cache line.
+      The peak empirically measured bandwidth achievable on the specific accelerator
+      is displayed alongside for comparison.
+    LDS Bandwidth: Indicates the maximum amount of bytes that could have been loaded
+      from, stored to, or atomically updated in the LDS per unit time (see LDS Bandwidth
+      example for more detail). The peak empirically measured LDS bandwidth achievable
+      on the specific accelerator is displayed alongside for comparison.
+    AI L1: |-
+      The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
+      of total floating-point operations (FLOPs) to total bytes transferred between
+      the L1 cache and the processing units. This value is used as the x-coordinate
+      for the L1 roofline.
+    AI L2: |-
+      The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
+      of total floating-point operations (FLOPs) to total bytes transferred between
+      the L2 cache and the L1 cache. This value is used as the x-coordinate for
+      the L2 roofline.
+    AI HBM: |-
+      The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
+      It is the ratio of total floating-point operations (FLOPs) to total bytes
+      transferred between HBM and the L2 cache. This value is used as the x-coordinate
+      for the HBM roofline.
+    Performance (GFLOPs): |-
+      The overall achieved performance, measured in GigaFLOPs
+      per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
+      operations divided by the total execution time. This value is used as the y-coordinate
+      for the kernel's point on the Roofline plot.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0500_command_processor_cpc_cpf.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0500_command_processor_cpc_cpf.yaml
index c4d2cabf52..118ce18331 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0500_command_processor_cpc_cpf.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0500_command_processor_cpc_cpf.yaml
@@ -2,30 +2,6 @@
 Panel Config:
   id: 500
   title: Command Processor (CPC/CPF)
-  metrics_description:
-    CPF Utilization: Percent of total cycles where the CPF was busy actively doing
-      any work. The ratio of CPF busy cycles over total cycles counted by the CPF.
-    CPF Stall: Percent of CPF busy cycles where the CPF was stalled for any reason.
-    CPF-L2 Utilization: Percent of total cycles counted by the CPF-L2 interface where
-      the CPF-L2 interface was active doing any work. The ratio of CPF-L2 busy cycles
-      over total cycles counted by the CPF-L2.
-    CPF-L2 Stall: Percent of CPF-L2 L2 busy cycles where the CPF-L2 interface was
-      stalled for any reason.
-    CPF-UTCL1 Stall: Percent of CPF busy cycles where the CPF was stalled by address
-      translation.
-    CPC Utilization: Percent of total cycles where the CPC was busy actively doing
-      any work. The ratio of CPC busy cycles over total cycles counted by the CPC.
-    CPC Stall Rate: Percent of CPC busy cycles where the CPC was stalled for any reason.
-    CPC Packet Decoding Utilization: Percent of CPC busy cycles spent decoding commands
-      for processing.
-    CPC-Workgroup Manager Utilization: Percent of CPC busy cycles spent dispatching
-      workgroups to the workgroup manager.
-    CPC-L2 Utilization: Percent of total cycles counted by the CPC-L2 interface where
-      the CPC-L2 interface was active doing any work.
-    CPC-UTCL1 Stall: Percent of CPC busy cycles where the CPC was stalled by address
-      translation
-    CPC-UTCL2 Utilization: 'Percent of total cycles counted by the CPC''s L2 address
-      translation interface where the CPC was busy doing address translation work.  '
   data source:
   - metric_table:
       id: 501
@@ -143,3 +119,28 @@ Panel Config:
           max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
             if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
           unit: pct
+  metrics_description:
+    CPF Utilization: Percent of total cycles where the CPF was busy actively doing
+      any work. The ratio of CPF busy cycles over total cycles counted by the CPF.
+    CPF Stall: Percent of CPF busy cycles where the CPF was stalled for any reason.
+    CPF-L2 Utilization: Percent of total cycles counted by the CPF-L2 interface where
+      the CPF-L2 interface was active doing any work. The ratio of CPF-L2 busy cycles
+      over total cycles counted by the CPF-L2.
+    CPF-L2 Stall: Percent of CPF-L2 busy cycles where the CPF-L2 interface was
+      stalled for any reason.
+    CPF-UTCL1 Stall: Percent of CPF busy cycles where the CPF was stalled by address
+      translation.
+    CPC Utilization: Percent of total cycles where the CPC was busy actively doing
+      any work. The ratio of CPC busy cycles over total cycles counted by the CPC.
+    CPC Stall Rate: Percent of CPC busy cycles where the CPC was stalled for any reason.
+    CPC Packet Decoding Utilization: Percent of CPC busy cycles spent decoding commands
+      for processing.
+    CPC-Workgroup Manager Utilization: Percent of CPC busy cycles spent dispatching
+      workgroups to the workgroup manager.
+    CPC-L2 Utilization: Percent of total cycles counted by the CPC-L2 interface where
+      the CPC-L2 interface was active doing any work.
+    CPC-UTCL1 Stall: Percent of CPC busy cycles where the CPC was stalled by address
+      translation.
+    CPC-UTCL2 Utilization: |-
+      Percent of total cycles counted by the CPC's L2 address translation
+      interface where the CPC was busy doing address translation work.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0600_workgroup_manager_spi.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0600_workgroup_manager_spi.yaml
index f6bf13d8b8..eb9845aa82 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0600_workgroup_manager_spi.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0600_workgroup_manager_spi.yaml
@@ -2,61 +2,6 @@
 Panel Config:
   id: 600
   title: Workgroup Manager (SPI)
-  metrics_description:
-    Accelerator Utilization: The percent of cycles in the kernel where the accelerator
-      was actively doing any work.
-    Scheduler-Pipe Utilization: The percent of total scheduler-pipe cycles in the
-      kernel where the scheduler-pipes were actively doing any work.
-    Workgroup Manager Utilization: The percent of cycles in the kernel where the workgroup
-      manager was actively doing any work.
-    Shader Engine Utilization: The percent of total shader engine cycles in the kernel
-      where any CU in a shader-engine was actively doing any work, normalized over
-      all shader-engines. Low values (e.g., << 100%) indicate that the accelerator
-      was not fully saturated by the kernel, or a potential load-imbalance issue.
-    SIMD Utilization: The percent of total SIMD cycles in the kernel where any SIMD
-      on a CU was actively doing any work, summed over all CUs. Low values (less than
-      100%) indicate that the accelerator was not fully saturated by the kernel, or
-      a potential load-imbalance issue.
-    Dispatched Workgroups: The total number of workgroups forming this kernel launch.
-    Dispatched Wavefronts: The total number of wavefronts, summed over all workgroups,
-      forming this kernel launch.
-    VGPR Writes: The average number of cycles spent initializing VGPRs at wave creation.
-    SGPR Writes: The average number of cycles spent initializing SGPRs at wave creation.
-    Not-scheduled Rate (Workgroup Manager): The percent of total scheduler-pipe cycles
-      in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck
-      within the workgroup manager rather than a lack of a CU or SIMD with sufficient
-      resources.
-    Not-scheduled Rate (Scheduler-Pipe): 'The percent of total scheduler-pipe cycles
-      in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck
-      within the scheduler-pipes rather than a lack of a CU or SIMD with sufficient
-      resources. '
-    Scheduler-Pipe Stall Rate: The percent of total scheduler-pipe cycles in the kernel
-      where a workgroup could not be scheduled to a CU due to occupancy limitations
-      (like a lack of a CU or SIMD with sufficient resources).
-    Scratch Stall Rate: The percent of total shader-engine cycles in the kernel where
-      a workgroup could not be scheduled to a CU due to lack of private (a.k.a., scratch)
-      memory slots. While this can reach up to 100%, note that the actual occupancy
-      limitations on a kernel using private memory are typically quite small (for
-      example, less than 1% of the total number of waves that can be scheduled to
-      an accelerator).
-    Insufficient SIMD Waveslots: The percent of total SIMD cycles in the kernel where
-      a workgroup could not be scheduled to a SIMD due to lack of available waveslots.
-    Insufficient SIMD VGPRs: The percent of total SIMD cycles in the kernel where
-      a workgroup could not be scheduled to a SIMD due to lack of available VGPRs.
-    Insufficient SIMD SGPRs: The percent of total SIMD cycles in the kernel where
-      a workgroup could not be scheduled to a SIMD due to lack of available SGPRs.
-    Insufficient CU LDS: The percent of total CU cycles in the kernel where a workgroup
-      could not be scheduled to a CU due to lack of available LDS.
-    Insufficient CU Barriers: The percent of total CU cycles in the kernel where a
-      workgroup could not be scheduled to a CU due to lack of available barriers.
-    Reached CU Workgroup Limit: The percent of total CU cycles in the kernel where
-      a workgroup could not be scheduled to a CU due to limits within the workgroup
-      manager. This is expected to be always be zero on CDNA2 or newer accelerators
-      (and small for previous accelerators).
-    Reached CU Wavefront Limit: The percent of total CU cycles in the kernel where
-      a wavefront could not be scheduled to a CU due to limits within the workgroup
-      manager. This is expected to be always be zero on CDNA2 or newer accelerators
-      (and small for previous accelerators).
   data source:
   - metric_table:
       id: 601
@@ -199,3 +144,58 @@ Panel Config:
           min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
           max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
           unit: Pct
+  metrics_description:
+    Accelerator Utilization: The percent of cycles in the kernel where the accelerator
+      was actively doing any work.
+    Scheduler-Pipe Utilization: The percent of total scheduler-pipe cycles in the
+      kernel where the scheduler-pipes were actively doing any work.
+    Workgroup Manager Utilization: The percent of cycles in the kernel where the workgroup
+      manager was actively doing any work.
+    Shader Engine Utilization: The percent of total shader engine cycles in the kernel
+      where any CU in a shader-engine was actively doing any work, normalized over
+      all shader-engines. Low values (e.g., << 100%) indicate that the accelerator
+      was not fully saturated by the kernel, or a potential load-imbalance issue.
+    SIMD Utilization: The percent of total SIMD cycles in the kernel where any SIMD
+      on a CU was actively doing any work, summed over all CUs. Low values (less than
+      100%) indicate that the accelerator was not fully saturated by the kernel, or
+      a potential load-imbalance issue.
+    Dispatched Workgroups: The total number of workgroups forming this kernel launch.
+    Dispatched Wavefronts: The total number of wavefronts, summed over all workgroups,
+      forming this kernel launch.
+    VGPR Writes: The average number of cycles spent initializing VGPRs at wave creation.
+    SGPR Writes: The average number of cycles spent initializing SGPRs at wave creation.
+    Not-scheduled Rate (Workgroup Manager): The percent of total scheduler-pipe cycles
+      in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck
+      within the workgroup manager rather than a lack of a CU or SIMD with sufficient
+      resources.
+    Not-scheduled Rate (Scheduler-Pipe): |-
+      The percent of total scheduler-pipe cycles in the kernel where a workgroup
+      could not be scheduled to a CU due to a bottleneck within the scheduler-pipes
+      rather than a lack of a CU or SIMD with sufficient resources.
+    Scheduler-Pipe Stall Rate: The percent of total scheduler-pipe cycles in the kernel
+      where a workgroup could not be scheduled to a CU due to occupancy limitations
+      (like a lack of a CU or SIMD with sufficient resources).
+    Scratch Stall Rate: The percent of total shader-engine cycles in the kernel where
+      a workgroup could not be scheduled to a CU due to lack of private (a.k.a., scratch)
+      memory slots. While this can reach up to 100%, note that the actual occupancy
+      limitations on a kernel using private memory are typically quite small (for
+      example, less than 1% of the total number of waves that can be scheduled to
+      an accelerator).
+    Insufficient SIMD Waveslots: The percent of total SIMD cycles in the kernel where
+      a workgroup could not be scheduled to a SIMD due to lack of available waveslots.
+    Insufficient SIMD VGPRs: The percent of total SIMD cycles in the kernel where
+      a workgroup could not be scheduled to a SIMD due to lack of available VGPRs.
+    Insufficient SIMD SGPRs: The percent of total SIMD cycles in the kernel where
+      a workgroup could not be scheduled to a SIMD due to lack of available SGPRs.
+    Insufficient CU LDS: The percent of total CU cycles in the kernel where a workgroup
+      could not be scheduled to a CU due to lack of available LDS.
+    Insufficient CU Barriers: The percent of total CU cycles in the kernel where a
+      workgroup could not be scheduled to a CU due to lack of available barriers.
+    Reached CU Workgroup Limit: The percent of total CU cycles in the kernel where
+      a workgroup could not be scheduled to a CU due to limits within the workgroup
+      manager. This is expected to always be zero on CDNA2 or newer accelerators
+      (and small for previous accelerators).
+    Reached CU Wavefront Limit: The percent of total CU cycles in the kernel where
+      a wavefront could not be scheduled to a CU due to limits within the workgroup
+      manager. This is expected to always be zero on CDNA2 or newer accelerators
+      (and small for previous accelerators).
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0700_wavefront.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0700_wavefront.yaml
index 5e332c0b8f..e9e9407cfc 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0700_wavefront.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0700_wavefront.yaml
@@ -2,63 +2,6 @@
 Panel Config:
   id: 700
   title: Wavefront
-  metrics_description:
-    Grid Size: The total number of work-items (or, threads) launched as a part of
-      the kernel dispatch. In HIP, this is equivalent to the total grid size multiplied
-      by the total workgroup (or, block) size.
-    Workgroup Size: The total number of work-items (or, threads) in each workgroup
-      (or, block) launched as part of the kernel dispatch. In HIP, this is equivalent
-      to the total block size.
-    Total Wavefronts: "The total number of wavefronts launched as part of the kernel\
-      \ dispatch. On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs,\
-      \ the wavefront size is always 64 work-items. Thus, the total number of wavefronts\
-      \ should be equivalent to the ceiling of grid size divided by 64."
-    Saved Wavefronts: The total number of wavefronts saved at a context-save.
-    Restored Wavefronts: The total number of wavefronts restored from a context-save.
-    VGPRs: 'The number of architected vector general-purpose registers allocated for
-      the kernel, see VALU. Note: this may not exactly match the number of VGPRs requested
-      by the compiler due to allocation granularity.'
-    AGPRs: 'The number of accumulation vector general-purpose registers allocated
-      for the kernel, see AGPRs. Note: this may not exactly match the number of AGPRs
-      requested by the compiler due to allocation granularity.'
-    SGPRs: 'The number of scalar general-purpose registers allocated for the kernel,
-      see SALU. Note: this may not exactly match the number of SGPRs requested by
-      the compiler due to allocation granularity.'
-    LDS Allocation: 'The number of bytes of LDS memory (or, shared memory) allocated
-      for this kernel. Note: This may also be larger than what was requested at compile
-      time due to both allocation granularity and dynamic per-dispatch LDS allocations.'
-    Scratch Allocation: The number of bytes of scratch memory requested per work-item
-      for this kernel. Scratch memory is used for stack memory on the accelerator,
-      as well as for register spills and restores.
-    Kernel Time: The total duration of the executed kernel.
-    Kernel Time (Cycles): The total duration of the executed kernel in cycles.
-    Instructions per wavefront: The average number of instructions (of all types)
-      executed per wavefront. This is averaged over all wavefronts in a kernel dispatch.
-    Wave Cycles: The number of cycles a wavefront in the kernel dispatch spent resident
-      on a compute unit per normalization unit. This is averaged over all wavefronts
-      in a kernel dispatch.
-    Dependency Wait Cycles: The number of cycles a wavefront in the kernel dispatch
-      spent resident on a compute unit per normalization unit. This is averaged over
-      all wavefronts in a kernel dispatch.
-    Issue Wait Cycles: The number of cycles a wavefront in the kernel dispatch was
-      unable to issue an instruction for any reason (e.g., execution pipe back-pressure,
-      arbitration loss, etc.) per normalization unit. This counter is incremented
-      at every cycle by all wavefronts on a CU unable to issue an instruction. As
-      such, it is most useful to get a sense of how waves were spending their time,
-      rather than identification of a precise limiter because another wave could be
-      actively executing while a wave is issue stalled. The sum of this metric, Dependency
-      Wait Cycles and Active Cycles should be equal to the total Wave Cycles metric.
-    Active Cycles: The average number of cycles a wavefront in the kernel dispatch
-      was actively executing instructions per normalization unit. This measurement
-      is made on a per-wavefront basis, and may include cycles that another wavefront
-      spent actively executing (on another execution unit, for example) or was stalled.
-      As such, it is most useful to get a sense of how waves were spending their time,
-      rather than identification of a precise limiter. The sum of this metric, Issue
-      Wait Cycles and Active Wait Cycles should be equal to the total Wave Cycles
-      metric.
-    Wavefront Occupancy: 'The time-averaged number of wavefronts resident on the accelerator
-      over the lifetime of the kernel. Note: this metric may be inaccurate for short-running
-      kernels (less than 1ms).'
   data source:
   - metric_table:
       id: 701
@@ -171,3 +114,66 @@ Panel Config:
           max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
           unit: Wavefronts
           coll_level: SQ_LEVEL_WAVES
+  metrics_description:
+    Grid Size: The total number of work-items (or, threads) launched as a part of
+      the kernel dispatch. In HIP, this is equivalent to the total grid size multiplied
+      by the total workgroup (or, block) size.
+    Workgroup Size: The total number of work-items (or, threads) in each workgroup
+      (or, block) launched as part of the kernel dispatch. In HIP, this is equivalent
+      to the total block size.
+    Total Wavefronts: |-
+      The total number of wavefronts launched as part of the kernel dispatch.
+      On AMD Instinct™ CDNA™ accelerators and GCN™ GPUs, the wavefront
+      size is always 64 work-items. Thus, the total number of wavefronts should
+      be equivalent to the ceiling of grid size divided by 64.
+    Saved Wavefronts: The total number of wavefronts saved at a context-save.
+    Restored Wavefronts: The total number of wavefronts restored from a context-save.
+    VGPRs: |-
+      The number of architected vector general-purpose registers allocated
+      for the kernel, see VALU. Note: this may not exactly match the number of VGPRs
+      requested by the compiler due to allocation granularity.
+    AGPRs: |-
+      The number of accumulation vector general-purpose registers allocated
+      for the kernel, see AGPRs. Note: this may not exactly match the number of
+      AGPRs requested by the compiler due to allocation granularity.
+    SGPRs: |-
+      The number of scalar general-purpose registers allocated for the kernel,
+      see SALU. Note: this may not exactly match the number of SGPRs requested by
+      the compiler due to allocation granularity.
+    LDS Allocation: |-
+      The number of bytes of LDS memory (or, shared memory) allocated for
+      this kernel. Note: This may also be larger than what was requested at compile
+      time due to both allocation granularity and dynamic per-dispatch LDS allocations.
+    Scratch Allocation: The number of bytes of scratch memory requested per work-item
+      for this kernel. Scratch memory is used for stack memory on the accelerator,
+      as well as for register spills and restores.
+    Kernel Time: The total duration of the executed kernel.
+    Kernel Time (Cycles): The total duration of the executed kernel in cycles.
+    Instructions per wavefront: The average number of instructions (of all types)
+      executed per wavefront. This is averaged over all wavefronts in a kernel dispatch.
+    Wave Cycles: The number of cycles a wavefront in the kernel dispatch spent resident
+      on a compute unit per normalization unit. This is averaged over all wavefronts
+      in a kernel dispatch.
+    Dependency Wait Cycles: The number of cycles a wavefront in the kernel dispatch
+      stalled waiting on memory of any kind (e.g., instruction fetch, vector or scalar memory)
+      per normalization unit. This is averaged over all wavefronts in a kernel dispatch.
+    Issue Wait Cycles: The number of cycles a wavefront in the kernel dispatch was
+      unable to issue an instruction for any reason (e.g., execution pipe back-pressure,
+      arbitration loss, etc.) per normalization unit. This counter is incremented
+      at every cycle by all wavefronts on a CU unable to issue an instruction. As
+      such, it is most useful to get a sense of how waves were spending their time,
+      rather than identification of a precise limiter because another wave could be
+      actively executing while a wave is issue stalled. The sum of this metric, Dependency
+      Wait Cycles and Active Cycles should be equal to the total Wave Cycles metric.
+    Active Cycles: The average number of cycles a wavefront in the kernel dispatch
+      was actively executing instructions per normalization unit. This measurement
+      is made on a per-wavefront basis, and may include cycles that another wavefront
+      spent actively executing (on another execution unit, for example) or was stalled.
+      As such, it is most useful to get a sense of how waves were spending their time,
+      rather than identification of a precise limiter. The sum of this metric, Issue
+      Wait Cycles and Dependency Wait Cycles should be equal to the total Wave Cycles
+      metric.
+    Wavefront Occupancy: |-
+      The time-averaged number of wavefronts resident on the accelerator over
+      the lifetime of the kernel. Note: this metric may be inaccurate for short-running
+      kernels (less than 1ms).
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1000_compute_units_instruction_mix.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1000_compute_units_instruction_mix.yaml
index 9c923d7bb7..768fe6548b 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1000_compute_units_instruction_mix.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1000_compute_units_instruction_mix.yaml
@@ -2,90 +2,6 @@
 Panel Config:
   id: 1000
   title: Compute Units - Instruction Mix
-  metrics_description:
-    VALU: The total number of vector arithmetic logic unit (VALU) operations issued.
-      These are the workhorses of the compute unit, and are used to execute a wide
-      range of instruction types including floating point operations, non-uniform
-      address calculations, transcendental operations, integer operations, shifts,
-      conditional evaluation, etc.
-    VMEM: The total number of vector memory operations issued. These include most
-      loads, stores and atomic operations and all accesses to generic, global, private
-      and texture memory.
-    LDS: The total number of LDS (also known as shared memory) operations issued.
-      These include loads, stores, atomics, and HIP's __shfl operations.
-    MFMA: The total number of matrix fused multiply-add instructions issued.
-    SALU: The total number of scalar arithmetic logic unit (SALU) operations issued.
-      Typically these are used for address calculations, literal constants, and other
-      operations that are provably uniform across a wavefront. Although scalar memory
-      (SMEM) operations are issued by the SALU, they are counted separately in this
-      section.
-    SMEM: The total number of scalar memory (SMEM) operations issued. These are typically
-      used for loading kernel arguments, base-pointers and loads from HIP's __constant__
-      memory.
-    Branch: The total number of branch operations issued. These typically consist
-      of jump or branch operations and are used to implement control flow.
-    INT32: The total number of instructions operating on 32-bit integer operands issued
-      to the VALU per normalization unit.
-    INT64: The total number of instructions operating on 64-bit integer operands issued
-      to the VALU per normalization unit.
-    F16-ADD: The total number of addition instructions operating on 16-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F16-MUL: The total number of multiplication instructions operating on 16-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F16-FMA: The total number of fused multiply-add instructions operating on 16-bit
-      floating-point operands issued to the VALU per normalization unit.
-    F16-Trans: The total number of transcendental instructions (e.g., sqrt) operating
-      on 16-bit floating-point operands issued to the VALU per normalization unit.
-    F32-ADD: The total number of addition instructions operating on 32-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F32-MUL: The total number of multiplication instructions operating on 32-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F32-FMA: The total number of fused multiply-add instructions operating on 32-bit
-      floating-point operands issued to the VALU per normalization unit.
-    F32-Trans: The total number of transcendental instructions (such as sqrt) operating
-      on 32-bit floating-point operands issued to the VALU per normalization unit.
-    F64-ADD: The total number of addition instructions operating on 64-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F64-MUL: The total number of multiplication instructions operating on 64-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F64-FMA: The total number of fused multiply-add instructions operating on 64-bit
-      floating-point operands issued to the VALU per normalization unit.
-    F64-Trans: The total number of transcendental instructions (such as sqrt) operating
-      on 64-bit floating-point operands issued to the VALU per normalization unit.
-    Conversion: "The total number of type conversion instructions (such as converting\
-      \ data to or from F32\u2194F64) issued to the VALU per normalization unit."
-    Global/Generic Instr: The total number of global & generic memory instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Global/Generic Read: The total number of global & generic memory read instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Global/Generic Write: The total number of global & generic memory write instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Global/Generic Atomic: The total number of global & generic memory atomic (with
-      and without return) instructions executed on all compute units on the accelerator,
-      per normalization unit.
-    Spill/Stack Instr: The total number of spill/stack memory instructions executed
-      on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Read: The total number of spill/stack memory read instructions executed
-      on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Write: The total number of spill/stack memory write instructions executed
-      on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Atomic: The total number of spill/stack memory atomic (with and without
-      return) instructions executed on all compute units on the accelerator, per normalization
-      unit. Typically unused as these memory operations are typically used to implement
-      thread-local storage.
-    MFMA-I8: The total number of 8-bit integer MFMA instructions issued per normalization
-      unit.
-    MFMA-F8: The total number of 8-bit floating point MFMA instructions issued per
-      normalization unit. This is supported in AMD Instinct MI300 series and later
-      only.
-    MFMA-F16: The total number of 16-bit floating point MFMA instructions issued per
-      normalization unit.
-    MFMA-BF16: The total number of 16-bit brain floating point MFMA instructions issued
-      per normalization unit.
-    MFMA-F32: The total number of 32-bit floating-point MFMA instructions issued per
-      normalization unit.
-    MFMA-F64: The total number of 64-bit floating-point MFMA instructions issued per
-      normalization unit.
   data source:
   - metric_table:
       id: 1001
@@ -307,3 +223,88 @@ Panel Config:
           min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom))
           max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom))
           unit: (instr + $normUnit)
+  metrics_description:
+    VALU: The total number of vector arithmetic logic unit (VALU) operations issued.
+      These are the workhorses of the compute unit, and are used to execute a wide
+      range of instruction types including floating point operations, non-uniform
+      address calculations, transcendental operations, integer operations, shifts,
+      conditional evaluation, etc.
+    VMEM: The total number of vector memory operations issued. These include most
+      loads, stores and atomic operations and all accesses to generic, global, private
+      and texture memory.
+    LDS: The total number of LDS (also known as shared memory) operations issued.
+      These include loads, stores, atomics, and HIP's __shfl operations.
+    MFMA: The total number of matrix fused multiply-add instructions issued.
+    SALU: The total number of scalar arithmetic logic unit (SALU) operations issued.
+      Typically these are used for address calculations, literal constants, and other
+      operations that are provably uniform across a wavefront. Although scalar memory
+      (SMEM) operations are issued by the SALU, they are counted separately in this
+      section.
+    SMEM: The total number of scalar memory (SMEM) operations issued. These are typically
+      used for loading kernel arguments, base-pointers and loads from HIP's __constant__
+      memory.
+    Branch: The total number of branch operations issued. These typically consist
+      of jump or branch operations and are used to implement control flow.
+    INT32: The total number of instructions operating on 32-bit integer operands issued
+      to the VALU per normalization unit.
+    INT64: The total number of instructions operating on 64-bit integer operands issued
+      to the VALU per normalization unit.
+    F16-ADD: The total number of addition instructions operating on 16-bit floating-point
+      operands issued to the VALU per normalization unit.
+    F16-MUL: The total number of multiplication instructions operating on 16-bit floating-point
+      operands issued to the VALU per normalization unit.
+    F16-FMA: The total number of fused multiply-add instructions operating on 16-bit
+      floating-point operands issued to the VALU per normalization unit.
+    F16-Trans: The total number of transcendental instructions (e.g., sqrt) operating
+      on 16-bit floating-point operands issued to the VALU per normalization unit.
+    F32-ADD: The total number of addition instructions operating on 32-bit floating-point
+      operands issued to the VALU per normalization unit.
+    F32-MUL: The total number of multiplication instructions operating on 32-bit floating-point
+      operands issued to the VALU per normalization unit.
+    F32-FMA: The total number of fused multiply-add instructions operating on 32-bit
+      floating-point operands issued to the VALU per normalization unit.
+    F32-Trans: The total number of transcendental instructions (such as sqrt) operating
+      on 32-bit floating-point operands issued to the VALU per normalization unit.
+    F64-ADD: The total number of addition instructions operating on 64-bit floating-point
+      operands issued to the VALU per normalization unit.
+    F64-MUL: The total number of multiplication instructions operating on 64-bit floating-point
+      operands issued to the VALU per normalization unit.
+    F64-FMA: The total number of fused multiply-add instructions operating on 64-bit
+      floating-point operands issued to the VALU per normalization unit.
+    F64-Trans: The total number of transcendental instructions (such as sqrt) operating
+      on 64-bit floating-point operands issued to the VALU per normalization unit.
+    Conversion: |-
+      The total number of type conversion instructions (such as converting
+      data to or from F32↔F64) issued to the VALU per normalization unit.
+    Global/Generic Instr: The total number of global & generic memory instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Global/Generic Read: The total number of global & generic memory read instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Global/Generic Write: The total number of global & generic memory write instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Global/Generic Atomic: The total number of global & generic memory atomic (with
+      and without return) instructions executed on all compute units on the accelerator,
+      per normalization unit.
+    Spill/Stack Instr: The total number of spill/stack memory instructions executed
+      on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Read: The total number of spill/stack memory read instructions executed
+      on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Write: The total number of spill/stack memory write instructions executed
+      on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Atomic: The total number of spill/stack memory atomic (with and without
+      return) instructions executed on all compute units on the accelerator, per normalization
+      unit. Typically unused as these memory operations are typically used to implement
+      thread-local storage.
+    MFMA-I8: The total number of 8-bit integer MFMA instructions issued per normalization
+      unit.
+    MFMA-F8: The total number of 8-bit floating point MFMA instructions issued per
+      normalization unit. This is supported in AMD Instinct MI300 series and later
+      only.
+    MFMA-F16: The total number of 16-bit floating point MFMA instructions issued per
+      normalization unit.
+    MFMA-BF16: The total number of 16-bit brain floating point MFMA instructions issued
+      per normalization unit.
+    MFMA-F32: The total number of 32-bit floating-point MFMA instructions issued per
+      normalization unit.
+    MFMA-F64: The total number of 64-bit floating-point MFMA instructions issued per
+      normalization unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1100_compute_units_compute_pipeline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1100_compute_units_compute_pipeline.yaml
index 5285c6b279..5e6ceb654f 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1100_compute_units_compute_pipeline.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1100_compute_units_compute_pipeline.yaml
@@ -2,84 +2,6 @@
 Panel Config:
   id: 1100
   title: Compute Units - Compute Pipeline
-  metrics_description:
-    VALU FLOPs: 'The total floating-point operations executed per second on the VALU.
-      This is also presented as a percent of the peak theoretical FLOPs achievable
-      on the specific accelerator. Note: this does not include any floating-point
-      operations from MFMA instructions.'
-    VALU IOPs: 'The total integer operations executed per second on the VALU. This
-      is also presented as a percent of the peak theoretical IOPs achievable on the
-      specific accelerator. Note: this does not include any integer operations from
-      MFMA instructions.'
-    MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations
-      executed per second. Note: this does not include any 16-bit brain floating point
-      operations from VALU instructions. This is also presented as a percent of the
-      peak theoretical BF16 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed
-      per second. Note: this does not include any 16-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F16 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed
-      per second. Note: this does not include any 32-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F32 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed
-      per second. Note: this does not include any 64-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F64 MFMA operations achievable on the specific accelerator.'
-    MFMA IOPs (INT8): 'The total number of 8-bit integer MFMA operations executed
-      per second. Note: this does not include any 8-bit integer operations from VALU
-      instructions. This is also presented as a percent of the peak theoretical INT8
-      MFMA operations achievable on the specific accelerator.'
-    IPC: The ratio of the total number of instructions executed on the CU over the
-      total active CU cycles.
-    IPC (Issued): The ratio of the total number of (non-internal) instructions issued
-      over the number of cycles where the scheduler was actively working on issuing
-      instructions.
-    SALU Utilization: Indicates what percent of the kernel's duration the SALU was
-      busy executing instructions. Computed as the ratio of the total number of cycles
-      spent by the scheduler issuing SALU / SMEM instructions over the total CU cycles.
-    VALU Utilization: Indicates what percent of the kernel's duration the VALU was
-      busy executing instructions. Does not include VMEM operations. Computed as the
-      ratio of the total number of cycles spent by the scheduler issuing VALU instructions
-      over the total CU cycles.
-    VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit
-      was busy executing instructions, including both global/generic and spill/scratch
-      operations (see the VMEM instruction count metrics for more detail). Does not
-      include VALU operations. Computed as the ratio of the total number of cycles
-      spent by the scheduler issuing VMEM instructions over the total CU cycles.
-    Branch Utilization: Indicates what percent of the kernel's duration the branch
-      unit was busy executing instructions. Computed as the ratio of the total number
-      of cycles spent by the scheduler issuing branch instructions over the total
-      CU cycles.
-    VALU Active Threads: Indicates the average level of divergence within a wavefront
-      over the lifetime of the kernel. The number of work-items that were active in
-      a wavefront during execution of each VALU instruction, time-averaged over all
-      VALU instructions run on all wavefronts in the kernel
-    MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit
-      was busy executing instructions. Computed as the ratio of the total number of
-      cycles spent by the MFMA was busy over the total CU cycles.
-    MFMA Instruction Cycles: The average duration of MFMA instructions in this kernel
-      in cycles. Computed as the ratio of the total number of cycles the MFMA unit
-      was busy over the total number of MFMA instructions.
-    VMEM Latency: The average number of round-trip cycles (that is, from issue to
-      data return / acknowledgment) required for a VMEM instruction to complete.
-    SMEM Latency: The average number of round-trip cycles (that is, from issue to
-      data return / acknowledgment) required for a SMEM instruction to complete.
-    FLOPs (Total): The total number of floating-point operations executed on either
-      the VALU or MFMA units, per normalization unit.
-    IOPs (Total): The total number of integer operations executed on either the VALU
-      or MFMA units, per normalization unit.
-    F16 OPs: The total number of 16-bit floating-point operations executed on either
-      the VALU or MFMA units, per normalization unit.
-    BF16 OPs: The total number of 16-bit brain floating-point operations executed
-      on either the VALU or MFMA units, per normalization unit.
-    F32 OPs: The total number of 32-bit floating-point operations executed on either
-      the VALU or MFMA units, per normalization unit.
-    F64 OPs: The total number of 64-bit floating-point operations executed on either
-      the VALU or MFMA units, per normalization unit.
-    INT8 OPs: The total number of 8-bit integer operations executed on either the
-      VALU or MFMA units, per normalization unit.
   data source:
   - metric_table:
       id: 1101
@@ -165,13 +87,13 @@ Panel Config:
           unit: Instr/cycle
         IPC (Issued):
           avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-            + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED  + SQ_INSTS_LDS)
+            + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
             / SQ_ACTIVE_INST_ANY))
           min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
             + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
             / SQ_ACTIVE_INST_ANY))
           max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-            + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED  + SQ_INSTS_LDS)
+            + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
             / SQ_ACTIVE_INST_ANY))
           unit: Instr/cycle
         SALU Utilization:
@@ -271,7 +193,7 @@ Panel Config:
             + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
             + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
             / $denom))
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
         IOPs (Total):
           avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
             * 512)) / $denom)
@@ -279,12 +201,12 @@ Panel Config:
             * 512)) / $denom)
           max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
             * 512)) / $denom)
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
         F8 OPs:
           avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
           min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
           max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
         F16 OPs:
           avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
             + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512
@@ -295,12 +217,12 @@ Panel Config:
           max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
             + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512
             * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
         BF16 OPs:
           avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
           min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
           max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
         F32 OPs:
           avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
             + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
@@ -311,7 +233,7 @@ Panel Config:
           max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
             + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
             / $denom))
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
         F64 OPs:
           avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
             + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
@@ -322,9 +244,94 @@ Panel Config:
           max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
             + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
             / $denom))
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
         INT8 OPs:
           avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
           min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
           max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
+  metrics_description:
+    VALU FLOPs: |-
+      The total floating-point operations executed per second on the VALU.
+      This is also presented as a percent of the peak theoretical FLOPs achievable
+      on the specific accelerator. Note: this does not include any floating-point
+      operations from MFMA instructions.
+    VALU IOPs: |-
+      The total integer operations executed per second on the VALU. This is
+      also presented as a percent of the peak theoretical IOPs achievable on the
+      specific accelerator. Note: this does not include any integer operations from
+      MFMA instructions.
+    MFMA FLOPs (BF16): |-
+      The total number of 16-bit brain floating point MFMA operations executed
+      per second. Note: this does not include any 16-bit brain floating point operations
+      from VALU instructions. This is also presented as a percent of the peak theoretical
+      BF16 MFMA operations achievable on the specific accelerator.
+    MFMA FLOPs (F16): |-
+      The total number of 16-bit floating point MFMA operations executed per
+      second. Note: this does not include any 16-bit floating point operations from
+      VALU instructions. This is also presented as a percent of the peak theoretical
+      F16 MFMA operations achievable on the specific accelerator.
+    MFMA FLOPs (F32): |-
+      The total number of 32-bit floating point MFMA operations executed per
+      second. Note: this does not include any 32-bit floating point operations from
+      VALU instructions. This is also presented as a percent of the peak theoretical
+      F32 MFMA operations achievable on the specific accelerator.
+    MFMA FLOPs (F64): |-
+      The total number of 64-bit floating point MFMA operations executed per
+      second. Note: this does not include any 64-bit floating point operations from
+      VALU instructions. This is also presented as a percent of the peak theoretical
+      F64 MFMA operations achievable on the specific accelerator.
+    MFMA IOPs (INT8): |-
+      The total number of 8-bit integer MFMA operations executed per second.
+      Note: this does not include any 8-bit integer operations from VALU instructions.
+      This is also presented as a percent of the peak theoretical INT8 MFMA operations
+      achievable on the specific accelerator.
+    IPC: The ratio of the total number of instructions executed on the CU over the
+      total active CU cycles.
+    IPC (Issued): The ratio of the total number of (non-internal) instructions issued
+      over the number of cycles where the scheduler was actively working on issuing
+      instructions.
+    SALU Utilization: Indicates what percent of the kernel's duration the SALU was
+      busy executing instructions. Computed as the ratio of the total number of cycles
+      spent by the scheduler issuing SALU / SMEM instructions over the total CU cycles.
+    VALU Utilization: Indicates what percent of the kernel's duration the VALU was
+      busy executing instructions. Does not include VMEM operations. Computed as the
+      ratio of the total number of cycles spent by the scheduler issuing VALU instructions
+      over the total CU cycles.
+    VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit
+      was busy executing instructions, including both global/generic and spill/scratch
+      operations (see the VMEM instruction count metrics for more detail). Does not
+      include VALU operations. Computed as the ratio of the total number of cycles
+      spent by the scheduler issuing VMEM instructions over the total CU cycles.
+    Branch Utilization: Indicates what percent of the kernel's duration the branch
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles spent by the scheduler issuing branch instructions over the total
+      CU cycles.
+    VALU Active Threads: Indicates the average level of divergence within a wavefront
+      over the lifetime of the kernel. The number of work-items that were active in
+      a wavefront during execution of each VALU instruction, time-averaged over all
+      VALU instructions run on all wavefronts in the kernel.
+    MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit
+      was busy executing instructions. Computed as the ratio of the total number of
+      cycles the MFMA unit was busy over the total CU cycles.
+    MFMA Instruction Cycles: The average duration of MFMA instructions in this kernel
+      in cycles. Computed as the ratio of the total number of cycles the MFMA unit
+      was busy over the total number of MFMA instructions.
+    VMEM Latency: The average number of round-trip cycles (that is, from issue to
+      data return / acknowledgment) required for a VMEM instruction to complete.
+    SMEM Latency: The average number of round-trip cycles (that is, from issue to
+      data return / acknowledgment) required for a SMEM instruction to complete.
+    FLOPs (Total): The total number of floating-point operations executed on either
+      the VALU or MFMA units, per normalization unit.
+    IOPs (Total): The total number of integer operations executed on either the VALU
+      or MFMA units, per normalization unit.
+    F16 OPs: The total number of 16-bit floating-point operations executed on either
+      the VALU or MFMA units, per normalization unit.
+    BF16 OPs: The total number of 16-bit brain floating-point operations executed
+      on either the VALU or MFMA units, per normalization unit.
+    F32 OPs: The total number of 32-bit floating-point operations executed on either
+      the VALU or MFMA units, per normalization unit.
+    F64 OPs: The total number of 64-bit floating-point operations executed on either
+      the VALU or MFMA units, per normalization unit.
+    INT8 OPs: The total number of 8-bit integer operations executed on either the
+      VALU or MFMA units, per normalization unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1200_local_data_share_lds.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1200_local_data_share_lds.yaml
index 2718654ad4..b7767fea16 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1200_local_data_share_lds.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1200_local_data_share_lds.yaml
@@ -2,51 +2,6 @@
 Panel Config:
   id: 1200
   title: Local Data Share (LDS)
-  metrics_description:
-    Utilization: Indicates what percent of the kernel's duration the LDS was actively
-      executing instructions (including, but not limited to, load, store, atomic and
-      HIP's __shfl operations). Calculated as the ratio of the total number of cycles
-      LDS was active over the total CU cycles.
-    Access Rate: Indicates the percentage of SIMDs in the VALU actively issuing LDS
-      instructions, averaged over the lifetime of the kernel. Calculated as the ratio
-      of the total number of cycles spent by the scheduler issuing LDS instructions
-      over the total CU cycles.
-    Theoretical Bandwidth Utilization: Indicates the maximum amount of bytes that
-      could have been loaded from, stored to, or atomically updated in the LDS divided
-      as percentage of theoretical peak. Does not take into account the execution
-      mask of the wavefront when the instruction was executed.
-    Theoretical Bandwidth: Indicates the maximum amount of bytes that could have been
-      loaded from, stored to, or atomically updated in the LDS divided by total duration.
-      Does not take into account the execution mask of the wavefront when the instruction
-      was executed.
-    Bank Conflict Rate: Indicates the percentage of active LDS cycles that were spent
-      servicing bank conflicts. Calculated as the ratio of LDS cycles spent servicing
-      bank conflicts over the number of LDS cycles that would have been required to
-      move the same amount of data in an uncontended access.
-    LDS Instructions: The total number of LDS instructions (including, but not limited
-      to, read/write/atomics and HIP's __shfl instructions) executed per normalization
-      unit.
-    LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return
-      / acknowledgment) required for an LDS instruction to complete.
-    Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS scheduler
-      due to bank conflicts (as determined by the conflict resolution hardware) to
-      the base number of cycles that would be spent in the LDS scheduler in a completely
-      uncontended case. This is the unnormalized form of the Bank Conflict Rate.
-    Index Accesses: The total number of cycles spent in the LDS scheduler over all
-      operations per normalization unit.
-    Atomic Return Cycles: The total number of cycles spent on LDS atomics with return
-      per normalization unit.
-    Bank Conflict: The total number of cycles spent in the LDS scheduler due to bank
-      conflicts (as determined by the conflict resolution hardware) per normalization
-      unit.
-    Addr Conflict: The total number of cycles spent in the LDS scheduler due to address
-      conflicts (as determined by the conflict resolution hardware) per normalization
-      unit.
-    Unaligned Stall: The total number of cycles spent in the LDS scheduler due to
-      stalls from non-dword aligned addresses per normalization unit.
-    Mem Violations: "The total number of out-of-bounds accesses made to the LDS, per\
-      \ normalization unit. This is unused and expected to be zero in most configurations\
-      \ for modern CDNA\u2122 accelerators."
   data source:
   - metric_table:
       id: 1201
@@ -87,7 +42,7 @@ Panel Config:
           avg: AVG((SQ_INSTS_LDS / $denom))
           min: MIN((SQ_INSTS_LDS / $denom))
           max: MAX((SQ_INSTS_LDS / $denom))
-          unit: (Instr  + $normUnit)
+          unit: (Instr + $normUnit)
         Theoretical Bandwidth:
           avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
             / (End_Timestamp - Start_Timestamp)))
@@ -117,29 +72,75 @@ Panel Config:
           avg: AVG((SQ_LDS_IDX_ACTIVE / $denom))
           min: MIN((SQ_LDS_IDX_ACTIVE / $denom))
           max: MAX((SQ_LDS_IDX_ACTIVE / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Atomic Return Cycles:
           avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom))
           min: MIN((SQ_LDS_ATOMIC_RETURN / $denom))
           max: MAX((SQ_LDS_ATOMIC_RETURN / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Bank Conflict:
           avg: AVG((SQ_LDS_BANK_CONFLICT / $denom))
           min: MIN((SQ_LDS_BANK_CONFLICT / $denom))
           max: MAX((SQ_LDS_BANK_CONFLICT / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Addr Conflict:
           avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom))
           min: MIN((SQ_LDS_ADDR_CONFLICT / $denom))
           max: MAX((SQ_LDS_ADDR_CONFLICT / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Unaligned Stall:
           avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom))
           min: MIN((SQ_LDS_UNALIGNED_STALL / $denom))
           max: MAX((SQ_LDS_UNALIGNED_STALL / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Mem Violations:
           avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom))
           min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom))
           max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom))
           unit: (Accesses + $normUnit)
+  metrics_description:
+    Utilization: Indicates what percent of the kernel's duration the LDS was actively
+      executing instructions (including, but not limited to, load, store, atomic and
+      HIP's __shfl operations). Calculated as the ratio of the total number of cycles
+      LDS was active over the total CU cycles.
+    Access Rate: Indicates the percentage of SIMDs in the VALU actively issuing LDS
+      instructions, averaged over the lifetime of the kernel. Calculated as the ratio
+      of the total number of cycles spent by the scheduler issuing LDS instructions
+      over the total CU cycles.
+    Theoretical Bandwidth Utilization: Indicates the maximum amount of bytes that
+      could have been loaded from, stored to, or atomically updated in the LDS, expressed
+      as a percentage of the theoretical peak. Does not take into account the execution
+      mask of the wavefront when the instruction was executed.
+    Theoretical Bandwidth: Indicates the maximum amount of bytes that could have been
+      loaded from, stored to, or atomically updated in the LDS divided by total duration.
+      Does not take into account the execution mask of the wavefront when the instruction
+      was executed.
+    Bank Conflict Rate: Indicates the percentage of active LDS cycles that were spent
+      servicing bank conflicts. Calculated as the ratio of LDS cycles spent servicing
+      bank conflicts over the number of LDS cycles that would have been required to
+      move the same amount of data in an uncontended access.
+    LDS Instructions: The total number of LDS instructions (including, but not limited
+      to, read/write/atomics and HIP's __shfl instructions) executed per normalization
+      unit.
+    LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return
+      / acknowledgment) required for an LDS instruction to complete.
+    Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS scheduler
+      due to bank conflicts (as determined by the conflict resolution hardware) to
+      the base number of cycles that would be spent in the LDS scheduler in a completely
+      uncontended case. This is the unnormalized form of the Bank Conflict Rate.
+    Index Accesses: The total number of cycles spent in the LDS scheduler over all
+      operations per normalization unit.
+    Atomic Return Cycles: The total number of cycles spent on LDS atomics with return
+      per normalization unit.
+    Bank Conflict: The total number of cycles spent in the LDS scheduler due to bank
+      conflicts (as determined by the conflict resolution hardware) per normalization
+      unit.
+    Addr Conflict: The total number of cycles spent in the LDS scheduler due to address
+      conflicts (as determined by the conflict resolution hardware) per normalization
+      unit.
+    Unaligned Stall: The total number of cycles spent in the LDS scheduler due to
+      stalls from non-dword aligned addresses per normalization unit.
+    Mem Violations: |-
+      The total number of out-of-bounds accesses made to the LDS, per normalization
+      unit. This is unused and expected to be zero in most configurations for
+      modern CDNA™ accelerators.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1300_instruction_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1300_instruction_cache.yaml
index aeda9bc6c7..35808d9d96 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1300_instruction_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1300_instruction_cache.yaml
@@ -2,28 +2,6 @@
 Panel Config:
   id: 1300
   title: Instruction Cache
-  metrics_description:
-    Bandwidth Utilization: The number of bytes looked up in the L1I cache, as a percent
-      of the peak theoretical bandwidth. Calculated as the ratio of L1I requests over
-      the total L1I cycles.
-    Cache Hit Rate: The percent of L1I requests that hit [#l1i-cache]_ on a previously
-      loaded line the cache. Calculated as the ratio of the number of L1I requests
-      that hit over the number of all L1I requests.
-    L1I-L2 Bandwidth Utilization: "The percent of the peak theoretical L1I \u2192\
-      \ L2 cache request bandwidth achieved. Calculated as the ratio of the total\
-      \ number of requests from the L1I to the L2 cache over the total L1I-L2 interface\
-      \ cycles."
-    L1I-L2 Bandwidth: Total number of bytes transferred across L1I - L2 interface
-      divided by total duration.
-    Req: The total number of requests made to the L1I per normalization-unit
-    Hits: The total number of L1I requests that hit on a previously loaded cache line,
-      per normalization-unit.
-    Misses - Non Duplicated: The total number of L1I requests that missed on a cache
-      line that were not already pending due to another request, per normalization-unit.
-    Misses - Duplicated: The total number of L1I requests that missed on a cache line
-      that were already pending due to another request, per normalization-unit.
-    Instruction Fetch Latency: The average number of cycles spent to fetch instructions
-      to a CU.
   data source:
   - metric_table:
       id: 1301
@@ -62,22 +40,22 @@ Panel Config:
           avg: AVG((SQC_ICACHE_REQ / $denom))
           min: MIN((SQC_ICACHE_REQ / $denom))
           max: MAX((SQC_ICACHE_REQ / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Hits:
           avg: AVG((SQC_ICACHE_HITS / $denom))
           min: MIN((SQC_ICACHE_HITS / $denom))
           max: MAX((SQC_ICACHE_HITS / $denom))
-          unit: (Hits  + $normUnit)
+          unit: (Hits + $normUnit)
         Misses - Non Duplicated:
           avg: AVG((SQC_ICACHE_MISSES / $denom))
           min: MIN((SQC_ICACHE_MISSES / $denom))
           max: MAX((SQC_ICACHE_MISSES / $denom))
-          unit: (Misses  + $normUnit)
+          unit: (Misses + $normUnit)
         Misses - Duplicated:
           avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom))
           min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom))
           max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom))
-          unit: (Misses  + $normUnit)
+          unit: (Misses + $normUnit)
         Cache Hit Rate:
           avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
             + SQC_ICACHE_MISSES_DUPLICATE)))
@@ -107,3 +85,25 @@ Panel Config:
           min: MIN(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
           max: MAX(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
           unit: Gbps
+  metrics_description:
+    Bandwidth Utilization: The number of bytes looked up in the L1I cache, as a percent
+      of the peak theoretical bandwidth. Calculated as the ratio of L1I requests over
+      the total L1I cycles.
+    Cache Hit Rate: The percent of L1I requests that hit [#l1i-cache]_ on a previously
+      loaded line in the cache. Calculated as the ratio of the number of L1I requests
+      that hit over the number of all L1I requests.
+    L1I-L2 Bandwidth Utilization: |-
+      The percent of the peak theoretical L1I → L2 cache request bandwidth
+      achieved. Calculated as the ratio of the total number of requests from the
+      L1I to the L2 cache over the total L1I-L2 interface cycles.
+    L1I-L2 Bandwidth: Total number of bytes transferred across L1I - L2 interface
+      divided by total duration.
+    Req: The total number of requests made to the L1I per normalization-unit.
+    Hits: The total number of L1I requests that hit on a previously loaded cache line,
+      per normalization-unit.
+    Misses - Non Duplicated: The total number of L1I requests that missed on a cache
+      line that were not already pending due to another request, per normalization-unit.
+    Misses - Duplicated: The total number of L1I requests that missed on a cache line
+      that were already pending due to another request, per normalization-unit.
+    Instruction Fetch Latency: The average number of cycles spent to fetch instructions
+      to a CU.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1400_scalar_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1400_scalar_l1_data_cache.yaml
index 282b97ad1f..6b73164848 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1400_scalar_l1_data_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1400_scalar_l1_data_cache.yaml
@@ -2,49 +2,6 @@
 Panel Config:
   id: 1400
   title: Scalar L1 Data Cache
-  metrics_description:
-    Bandwidth Utilization: The number of bytes looked up in the sL1D cache, as a percent
-      of the peak theoretical bandwidth. Calculated as the ratio of sL1D requests
-      over the total sL1D cycles.
-    Cache Hit Rate: Indicates the percent of sL1D requests that hit on a previously
-      loaded line the cache. The ratio of the number of sL1D requests that hit over
-      the number of all sL1D requests.
-    sL1D-L2 BW Utilization: The percentage of the peak theoretical sL1D - L2 interface
-      bandwidth acheived.\ \ Caclulated as total number of bytes read from, written
-      to, or atomically updated\ \ across the sL1D - L2 interface.
-    sL1D-L2 BW: "The total number of bytes read from, written to, or atomically updated\
-      \ across the sL1D\u2194L2 interface, divided by total duration. Note that sL1D\
-      \ writes and atomics are typically unused on current CDNA accelerators, so in\
-      \ the majority of cases this can be interpreted as an sL1D\u2192L2 read bandwidth."
-    Req: The total number of requests, of any size or type, made to the sL1D per normalization
-      unit.
-    Hits: The total number of sL1D requests that hit on a previously loaded cache
-      line, per normalization unit.
-    Misses - Non Duplicated: 'The total number of sL1D requests that missed on a cache
-      line that was not already pending due to another request, per normalization
-      unit. '
-    Misses- Duplicated: The total number of sL1D requests that missed on a cache line
-      that was already pending due to another request, per normalization unit.
-    Read Req (Total): The total number of sL1D read requests of any size, per normalization
-      unit.
-    Atomic Req: The total number of atomic requests from sL1D to the L2, per normalization
-      unit. Typically unused on current CDNA accelerators.
-    Read Req (1 DWord): The total number of sL1D read requests made for a single dword
-      of data (4B), per normalization unit.
-    Read Req (2 DWord): The total number of sL1D read requests made for a two dwords
-      of data (8B), per normalization unit.
-    Read Req (4 DWord): The total number of sL1D read requests made for a four dwords
-      of data (16B), per normalization unit.
-    Read Req (8 DWord): The total number of sL1D read requests made for a eight dwords
-      of data (32B), per normalization unit.
-    Read Req (16 DWord): The total number of sL1D read requests made for a sixteen
-      dwords of data (64B), per normalization unit.
-    Read Req: The total number of read requests from sL1D to the L2 per normalization
-      unit.
-    Write Req: The total number of write requests from sL1D to the L2, per normalization
-      unit. Typically unused on current CDNA accelerators.
-    Stall Cycles: "The total number of cycles the sL1D\u2194L2 interface was stalled,\
-      \ per normalization unit."
   data source:
   - metric_table:
       id: 1401
@@ -84,22 +41,22 @@ Panel Config:
           avg: AVG((SQC_DCACHE_REQ / $denom))
           min: MIN((SQC_DCACHE_REQ / $denom))
           max: MAX((SQC_DCACHE_REQ / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Hits:
           avg: AVG((SQC_DCACHE_HITS / $denom))
           min: MIN((SQC_DCACHE_HITS / $denom))
           max: MAX((SQC_DCACHE_HITS / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Misses - Non Duplicated:
           avg: AVG((SQC_DCACHE_MISSES / $denom))
           min: MIN((SQC_DCACHE_MISSES / $denom))
           max: MAX((SQC_DCACHE_MISSES / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Misses- Duplicated:
           avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom))
           min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom))
           max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Cache Hit Rate:
           avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
             + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
@@ -118,37 +75,37 @@ Panel Config:
             + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
           max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
             + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Atomic Req:
           avg: AVG((SQC_DCACHE_ATOMIC / $denom))
           min: MIN((SQC_DCACHE_ATOMIC / $denom))
           max: MAX((SQC_DCACHE_ATOMIC / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req (1 DWord):
           avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom))
           min: MIN((SQC_DCACHE_REQ_READ_1 / $denom))
           max: MAX((SQC_DCACHE_REQ_READ_1 / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req (2 DWord):
           avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom))
           min: MIN((SQC_DCACHE_REQ_READ_2 / $denom))
           max: MAX((SQC_DCACHE_REQ_READ_2 / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req (4 DWord):
           avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom))
           min: MIN((SQC_DCACHE_REQ_READ_4 / $denom))
           max: MAX((SQC_DCACHE_REQ_READ_4 / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req (8 DWord):
           avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom))
           min: MIN((SQC_DCACHE_REQ_READ_8 / $denom))
           max: MAX((SQC_DCACHE_REQ_READ_8 / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req (16 DWord):
           avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom))
           min: MIN((SQC_DCACHE_REQ_READ_16 / $denom))
           max: MAX((SQC_DCACHE_REQ_READ_16 / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
   - metric_table:
       id: 1403
       title: Scalar L1D Cache - L2 Interface
@@ -171,19 +128,65 @@ Panel Config:
           avg: AVG((SQC_TC_DATA_READ_REQ / $denom))
           min: MIN((SQC_TC_DATA_READ_REQ / $denom))
           max: MAX((SQC_TC_DATA_READ_REQ / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Write Req:
           avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom))
           min: MIN((SQC_TC_DATA_WRITE_REQ / $denom))
           max: MAX((SQC_TC_DATA_WRITE_REQ / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Atomic Req:
           avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom))
           min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom))
           max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Stall Cycles:
           avg: AVG((SQC_TC_STALL / $denom))
           min: MIN((SQC_TC_STALL / $denom))
           max: MAX((SQC_TC_STALL / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
+  metrics_description:
+    Bandwidth Utilization: The number of bytes looked up in the sL1D cache, as a percent
+      of the peak theoretical bandwidth. Calculated as the ratio of sL1D requests
+      over the total sL1D cycles.
+    Cache Hit Rate: Indicates the percent of sL1D requests that hit on a previously
+      loaded line in the cache. The ratio of the number of sL1D requests that hit over
+      the number of all sL1D requests.
+    sL1D-L2 BW Utilization: The percentage of the peak theoretical sL1D - L2 interface
+      bandwidth achieved. Calculated as total number of bytes read from, written to,
+      or atomically updated across the sL1D - L2 interface.
+    sL1D-L2 BW: "The total number of bytes read from, written to, or
+      atomically updated across the sL1D\u2194L2 interface, divided by
+      total duration. Note that sL1D writes and atomics are typically
+      unused on current CDNA accelerators, so in the majority of cases
+      this can be interpreted as an sL1D\u2192L2 read
+      bandwidth."
+    Req: The total number of requests, of any size or type, made to the sL1D per normalization
+      unit.
+    Hits: The total number of sL1D requests that hit on a previously loaded cache
+      line, per normalization unit.
+    Misses - Non Duplicated: |-
+      The total number of sL1D requests that missed on a cache line that was
+      not already pending due to another request, per normalization unit.
+    Misses- Duplicated: The total number of sL1D requests that missed on a cache line
+      that was already pending due to another request, per normalization unit.
+    Read Req (Total): The total number of sL1D read requests of any size, per normalization
+      unit.
+    Atomic Req: The total number of atomic requests from sL1D to the L2, per normalization
+      unit. Typically unused on current CDNA accelerators.
+    Read Req (1 DWord): The total number of sL1D read requests made for a single dword
+      of data (4B), per normalization unit.
+    Read Req (2 DWord): The total number of sL1D read requests made for two dwords
+      of data (8B), per normalization unit.
+    Read Req (4 DWord): The total number of sL1D read requests made for four dwords
+      of data (16B), per normalization unit.
+    Read Req (8 DWord): The total number of sL1D read requests made for eight dwords
+      of data (32B), per normalization unit.
+    Read Req (16 DWord): The total number of sL1D read requests made for sixteen
+      dwords of data (64B), per normalization unit.
+    Read Req: The total number of read requests from sL1D to the L2 per normalization
+      unit.
+    Write Req: The total number of write requests from sL1D to the L2, per normalization
+      unit. Typically unused on current CDNA accelerators.
+    Stall Cycles: "The total number of cycles the sL1D\u2194L2
+      interface was stalled, per normalization
+      unit."
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1500_address_processing_unit_and_data_return_path_ta_td.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1500_address_processing_unit_and_data_return_path_ta_td.yaml
index cdbb5393aa..3fd1615719 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1500_address_processing_unit_and_data_return_path_ta_td.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1500_address_processing_unit_and_data_return_path_ta_td.yaml
@@ -2,70 +2,6 @@
 Panel Config:
   id: 1500
   title: Address Processing Unit and Data Return Path (TA/TD)
-  metrics_description:
-    Address Processing Unit Busy: Percent of the total CU cycles the address processor
-      was busy
-    Address Stall: Percent of the total CU cycles the address processor was stalled
-      from sending address requests further into the vL1D pipeline.
-    Data Stall: Percent of the total CU cycles the address processor was stalled from
-      sending write/atomic data further into the vL1D pipeline.
-    "Data-Processor \u2192 Address Stall": Percent of total CU cycles the address
-      processor was stalled waiting to send command data to the data processor.
-    Total Instructions: The total number of memory instructions executed by the address
-      processer over all compute units on the accelerator, per normalization unit.
-    Global/Generic Instructions: The total number of global & generic memory instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Global/Generic Read Instructions: The total number of global & generic memory
-      read instructions executed on all compute units on the accelerator, per normalization
-      unit.
-    Global/Generic Write Instructions: The total number of global & generic memory
-      write instructions executed on all compute units on the accelerator, per normalization
-      unit.
-    Global/Generic Atomic Instructions: The total number of global & generic memory
-      atomic (with and without return) instructions executed on all compute units
-      on the accelerator, per normalization unit.
-    Spill/Stack Instructions: The total number of spill/stack memory instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Read Instructions: The total number of spill/stack memory read instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Write Instructions: The total number of spill/stack memory write instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Atomic Instructions: The total number of spill/stack memory atomic
-      (with and without return) instructions executed on all compute units on the
-      accelerator, per normalization unit. Typically unused as these memory operations
-      are typically used to implement thread-local storage.
-    Spill/Stack Total Cycles: The number of cycles the address processing unit spent
-      working on spill/stack instructions, per normalization unit.
-    Spill/Stack Coalesced Read: The number of cycles the address processing unit spent
-      working on coalesced spill/stack read instructions, per normalization unit.
-    Spill/Stack Coalesced Write: The number of cycles the address processing unit
-      spent working on coalesced spill/stack write instructions, per normalization
-      unit.
-    Data-Return Busy: Percent of the total CU cycles the data-return unit was busy
-      processing or waiting on data to return to the CU.
-    "Cache RAM \u2192 Data-Return Stall": Percent of the total CU cycles the data-return
-      unit was stalled on data to be returned from the vL1D Cache RAM.
-    "Workgroup manager \u2192 Data-Return Stall": Percent of the total CU cycles the
-      data-return unit was stalled by the workgroup manager due to initialization
-      of registers as a part of launching new workgroups.
-    Coalescable Instructions: The number of instructions submitted to the data-return
-      unit by the address processor that were found to be coalescable, per normalization
-      unit.
-    Read Instructions: The number of read instructions submitted to the data-return
-      unit by the address processor summed over all compute units on the accelerator,
-      per normalization unit. This is expected to be the sum of global/generic and
-      spill/stack reads in the address processor.
-    Write Instructions: The number of store instructions submitted to the data-return
-      unit by the address processor summed over all compute units on the accelerator,
-      per normalization unit. This is expected to be the sum of global/generic and
-      spill/stack stores in the address processor.
-    Atomic Instructions: The number of atomic instructions submitted to the data-return
-      unit by the address processor summed over all compute units on the accelerator,
-      per normalization unit. This is expected to be the sum of global/generic and
-      spill/stack atomics in the address processor.
-    Write Ack Instructions: The total number of write acknowledgements submitted by
-      data-return unit to SQ, summed over all compute units on the accelerator, per
-      normalization unit.
   data source:
   - metric_table:
       id: 1501
@@ -135,47 +71,47 @@ Panel Config:
           avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom))
           min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom))
           max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Global/Generic Instructions:
           avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
           min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
           max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Global/Generic Read Instructions:
           avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
           min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
           max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Global/Generic Write Instructions:
           avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
           min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
           max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Global/Generic Atomic Instructions:
           avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
           min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
           max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Spill/Stack Instructions:
           avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
           min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
           max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Spill/Stack Read Instructions:
           avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
           min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
           max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Spill/Stack Write Instructions:
           avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
           min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
           max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Spill/Stack Atomic Instructions:
           avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
           min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
           max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
   - metric_table:
       id: 1503
       title: Spill and stack metrics
@@ -190,17 +126,17 @@ Panel Config:
           avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
           min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
           max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Spill/Stack Coalesced Read:
           avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
           min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
           max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Spill/Stack Coalesced Write:
           avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
           min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
           max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
   - metric_table:
       id: 1504
       title: Vector L1 data-return path or Texture Data (TD)
@@ -230,7 +166,7 @@ Panel Config:
           avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom))
           min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom))
           max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Read Instructions:
           avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
             / $denom))
@@ -238,14 +174,75 @@ Panel Config:
             / $denom))
           max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
             / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Write Instructions:
           avg: AVG((TD_STORE_WAVEFRONT_sum / $denom))
           min: MIN((TD_STORE_WAVEFRONT_sum / $denom))
           max: MAX((TD_STORE_WAVEFRONT_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Atomic Instructions:
           avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom))
           min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom))
           max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
+  metrics_description:
+    Address Processing Unit Busy: Percent of the total CU cycles the address processor
+      was busy.
+    Address Stall: Percent of the total CU cycles the address processor was stalled
+      from sending address requests further into the vL1D pipeline.
+    Data Stall: Percent of the total CU cycles the address processor was stalled from
+      sending write/atomic data further into the vL1D pipeline.
+    "Data-Processor \u2192 Address Stall": Percent of total CU cycles the address
+      processor was stalled waiting to send command data to the data processor.
+    Total Instructions: The total number of memory instructions executed by the address
+      processor over all compute units on the accelerator, per normalization unit.
+    Global/Generic Instructions: The total number of global & generic memory instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Global/Generic Read Instructions: The total number of global & generic memory
+      read instructions executed on all compute units on the accelerator, per normalization
+      unit.
+    Global/Generic Write Instructions: The total number of global & generic memory
+      write instructions executed on all compute units on the accelerator, per normalization
+      unit.
+    Global/Generic Atomic Instructions: The total number of global & generic memory
+      atomic (with and without return) instructions executed on all compute units
+      on the accelerator, per normalization unit.
+    Spill/Stack Instructions: The total number of spill/stack memory instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Read Instructions: The total number of spill/stack memory read instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Write Instructions: The total number of spill/stack memory write instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Atomic Instructions: The total number of spill/stack memory atomic
+      (with and without return) instructions executed on all compute units on the
+      accelerator, per normalization unit. Typically unused as these memory operations
+      are typically used to implement thread-local storage.
+    Spill/Stack Total Cycles: The number of cycles the address processing unit spent
+      working on spill/stack instructions, per normalization unit.
+    Spill/Stack Coalesced Read: The number of cycles the address processing unit spent
+      working on coalesced spill/stack read instructions, per normalization unit.
+    Spill/Stack Coalesced Write: The number of cycles the address processing unit
+      spent working on coalesced spill/stack write instructions, per normalization
+      unit.
+    Data-Return Busy: Percent of the total CU cycles the data-return unit was busy
+      processing or waiting on data to return to the CU.
+    "Cache RAM \u2192 Data-Return Stall": Percent of the total CU cycles the data-return
+      unit was stalled on data to be returned from the vL1D Cache RAM.
+    "Workgroup manager \u2192 Data-Return Stall": Percent of the total CU cycles the
+      data-return unit was stalled by the workgroup manager due to initialization
+      of registers as a part of launching new workgroups.
+    Coalescable Instructions: The number of instructions submitted to the data-return
+      unit by the address processor that were found to be coalescable, per normalization
+      unit.
+    Read Instructions: The number of read instructions submitted to the data-return
+      unit by the address processor summed over all compute units on the accelerator,
+      per normalization unit. This is expected to be the sum of global/generic and
+      spill/stack reads in the address processor.
+    Write Instructions: The number of store instructions submitted to the data-return
+      unit by the address processor summed over all compute units on the accelerator,
+      per normalization unit. This is expected to be the sum of global/generic and
+      spill/stack stores in the address processor.
+    Atomic Instructions: The number of atomic instructions submitted to the data-return
+      unit by the address processor summed over all compute units on the accelerator,
+      per normalization unit. This is expected to be the sum of global/generic and
+      spill/stack atomics in the address processor.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1600_vector_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1600_vector_l1_data_cache.yaml
index e5b5eb9e9c..3125397a30 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1600_vector_l1_data_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1600_vector_l1_data_cache.yaml
@@ -2,117 +2,6 @@
 Panel Config:
   id: 1600
   title: Vector L1 Data Cache
-  metrics_description:
-    Hit rate: The ratio of the number of vL1D cache line requests that hit in vL1D
-      cache over the total number of cache line requests to the vL1D Cache RAM.
-    Bandwidth Utilization: The number of bytes looked up in the vL1D cache as a result
-      of VMEM instructions, as a percent of the peak theoretical bandwidth achievable
-      on the specific accelerator. The number of bytes is calculated as the number
-      of cache lines requested multiplied by the cache line size. This value does
-      not consider partial requests, so for instance, if only a single value is requested
-      in a cache line, the data movement will still be counted as a full cache line.
-    Utilization: Indicates how busy the vL1D Cache RAM was during the kernel execution.
-      The number of cycles where the vL1D Cache RAM is actively processing any request
-      divided by the number of cycles where the vL1D is active.
-    Coalescing: Indicates how well memory instructions were coalesced by the address
-      processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated
-      as the average number of thread-requests generated per instruction divided by
-      the ideal number of thread-requests per instruction.
-    Stalled on L2 Data: The ratio of the number of cycles where the vL1D is stalled
-      waiting for requested data to return from the L2 cache divided by the number
-      of cycles where the vL1D is active.
-    Stalled on L2 Req: The ratio of the number of cycles where the vL1D is stalled
-      waiting to issue a request for data to the L2 cache divided by the number of
-      cycles where the vL1D is active.
-    Tag RAM Stall (Read): The ratio of the number of cycles where the vL1D is stalled
-      due to Read requests with conflicting tags being looked up concurrently, divided
-      by the number of cycles where the vL1D is active.
-    Tag RAM Stall (Write): The ratio of the number of cycles where the vL1D is stalled
-      due to Write requests with conflicting tags being looked up concurrently, divided
-      by the number of cycles where the vL1D is active.
-    Tag RAM Stall (Atomic): The ratio of the number of cycles where the vL1D is stalled
-      due to Atomic requests with conflicting tags being looked up concurrently, divided
-      by the number of cycles where the vL1D is active.
-    Total Req: The total number of incoming requests from the address processing unit
-      after coalescing.
-    Read Req: The total number of incoming read requests from the address processing
-      unit after coalescing per normalization unit.
-    Write Req: The total number of incoming write requests from the address processing
-      unit after coalescing per normalization unit.
-    Atomic Req: The total number of incoming atomic requests from the address processing
-      unit after coalescing per normalization unit.
-    Cache BW: The number of bytes looked up in the vL1D cache as a result of VMEM
-      instructions divided by total duration. The number of bytes is calculated as
-      the number of cache lines requested multiplied by the cache line size.  This
-      value does not consider partial requests, so for instance, if only a single
-      value is requested in a cache line, the data movement will still be counted
-      as a full cache line.
-    Cache Hit Rate: The ratio of the number of vL1D cache line requests that hit in
-      vL1D cache over the total number of cache line requests to the vL1D Cache RAM.
-    Cache Accesses: The total number of cache line lookups in the vL1D.
-    Cache Hits: The number of cache accesses minus the number of outgoing requests
-      to the L2 cache, that is, the number of cache line requests serviced by the
-      vL1D Cache RAM per normalization unit.
-    Invalidations: The number of times the vL1D was issued a write-back invalidate
-      command during the kernel's execution per normalization unit. This may be triggered
-      by, for instance, the buffer_wbinvl1 instruction.
-    L1-L2 BW: The number of bytes transferred across the vL1D-L2 interface as a result
-      of VMEM instructions, divided by total duration. The number of bytes is calculated
-      as the number of cache lines requested multiplied by the cache line size. This
-      value does not consider partial requests, so for instance, if only a single
-      value is requested in a cache line, the data movement will still be counted
-      as a full cache line.
-    L1-L2 Read: The number of read requests for a vL1D cache line that were not satisfied
-      by the vL1D and must be retrieved from the to the L2 Cache per normalization
-      unit.
-    L1-L2 Write: The number of write requests to a vL1D cache line that were sent
-      through the vL1D to the L2 cache, per normalization unit.
-    L1-L2 Atomic: The number of atomic requests that are sent through the vL1D to
-      the L2 cache, per normalization unit. This includes requests for atomics with,
-      and without return.
-    L1 Access Latency: Calculated as the average number of cycles that a vL1D cache
-      line request spent in the vL1D cache pipeline.
-    L1-L2 Read Latency: Calculated as the average number of cycles that the vL1D cache
-      took to issue and receive read requests from the L2 Cache. This number also
-      includes requests for atomics with return values.
-    L1-L2 Write Latency: Calculated as the average number of cycles that the vL1D
-      cache took to issue and receive acknowledgement of a write request to the L2
-      Cache. This number also includes requests for atomics without return values.
-    NC - Read: Total read requests with NC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    UC - Read: Total read requests with UC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    CC - Read: Total read requests with CC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    RW - Read: Total read requests with RW mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    RW - Write: Total write requests with RW mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    NC - Write: Total write requests with NC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    UC - Write: Total write requests with UC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    CC - Write: Total write requests with CC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    NC - Atomic: Total atomic requests with NC mtype from this TCP to all TCCs Sum
-      over TCP instances per normalization unit.
-    UC - Atomic: Total atomic requests with UC mtype from this TCP to all TCCs Sum
-      over TCP instances per normalization unit.
-    CC - Atomic: Total atomic requests with CC mtype from this TCP to all TCCs Sum
-      over TCP instances per normalization unit.
-    RW - Atomic: Total atomic requests with RW mtype from this TCP to all TCCs Sum
-      over TCP instances per normalization unit.
-    Req: The number of translation requests made to the UTCL1 per normalization unit.
-    Hit Ratio: The ratio of the number of translation requests that hit in the UTCL1
-      divided by the total number of translation requests made to the UTCL1.
-    Hits: The number of translation requests that hit in the UTCL1, and could be reused,
-      per normalization unit.
-    Translation Misses: The total number of translation requests that missed in the
-      UTCL1 due to  translation not being present in the cache, per normalization
-      unit.
-    Permission Misses: "The total number of translation requests that missed in the\
-      \ UTCL1 due to a permission error, per normalization unit. This is unused and\
-      \ expected to be zero in most configurations for modern CDNA\u2122 accelerators."
   data source:
   - metric_table:
       id: 1601
@@ -181,17 +70,17 @@ Panel Config:
           avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom))
           min: MIN((TCP_TOTAL_ACCESSES_sum / $denom))
           max: MAX((TCP_TOTAL_ACCESSES_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req:
           avg: AVG((TCP_TOTAL_READ_sum / $denom))
           min: MIN((TCP_TOTAL_READ_sum / $denom))
           max: MAX((TCP_TOTAL_READ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Write Req:
           avg: AVG((TCP_TOTAL_WRITE_sum / $denom))
           min: MIN((TCP_TOTAL_WRITE_sum / $denom))
           max: MAX((TCP_TOTAL_WRITE_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Atomic Req:
           avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
             / $denom))
@@ -199,7 +88,7 @@ Panel Config:
             / $denom))
           max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
             / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Cache BW:
           avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
           min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
@@ -223,7 +112,7 @@ Panel Config:
           avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
           min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
           max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Cache Hits:
           avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
             + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
@@ -234,7 +123,7 @@ Panel Config:
           max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
             + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
             / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Invalidations:
           avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
           min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
@@ -252,12 +141,12 @@ Panel Config:
           avg: AVG((TCP_TCC_READ_REQ_sum / $denom))
           min: MIN((TCP_TCC_READ_REQ_sum / $denom))
           max: MAX((TCP_TCC_READ_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         L1-L2 Write:
           avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom))
           min: MIN((TCP_TCC_WRITE_REQ_sum / $denom))
           max: MAX((TCP_TCC_WRITE_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         L1-L2 Atomic:
           avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
             / $denom))
@@ -265,7 +154,7 @@ Panel Config:
             / $denom))
           max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
             / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
   - metric_table:
       id: 1604
       title: L1D - L2 Transactions
@@ -284,84 +173,84 @@ Panel Config:
           avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom))
           min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom))
           max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         UC - Read:
           xfer: Read
           coherency: UC
           avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom))
           min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom))
           max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         CC - Read:
           xfer: Read
           coherency: CC
           avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom))
           min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom))
           max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         RW - Read:
           xfer: Read
           coherency: RW
           avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom))
           min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom))
           max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         RW - Write:
           xfer: Write
           coherency: RW
           avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom))
           min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom))
           max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         NC - Write:
           xfer: Write
           coherency: NC
           avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom))
           min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom))
           max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         UC - Write:
           xfer: Write
           coherency: UC
           avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom))
           min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom))
           max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         CC - Write:
           xfer: Write
           coherency: CC
           avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom))
           min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom))
           max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         NC - Atomic:
           xfer: Atomic
           coherency: NC
           avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
           min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
           max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         UC - Atomic:
           xfer: Atomic
           coherency: UC
           avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
           min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
           max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         CC - Atomic:
           xfer: Atomic
           coherency: CC
           avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
           min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
           max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         RW - Atomic:
           xfer: Atomic
           coherency: RW
           avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
           min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
           max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
   - metric_table:
       id: 1605
       title: L1 Unified Translation Cache (UTCL1)
@@ -410,3 +299,106 @@ Panel Config:
         max: Max
         units: Unit
       metric: {}
+  metrics_description:
+    Hit rate: The ratio of the number of vL1D cache line requests that hit in vL1D
+      cache over the total number of cache line requests to the vL1D Cache RAM.
+    Bandwidth Utilization: The number of bytes looked up in the vL1D cache as a result
+      of VMEM instructions, as a percent of the peak theoretical bandwidth achievable
+      on the specific accelerator. The number of bytes is calculated as the number
+      of cache lines requested multiplied by the cache line size. This value does
+      not consider partial requests, so for instance, if only a single value is requested
+      in a cache line, the data movement will still be counted as a full cache line.
+    Utilization: Indicates how busy the vL1D Cache RAM was during the kernel execution.
+      The number of cycles where the vL1D Cache RAM is actively processing any request
+      divided by the number of cycles where the vL1D is active.
+    Coalescing: Indicates how well memory instructions were coalesced by the address
+      processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated
+      as the average number of thread-requests generated per instruction divided by
+      the ideal number of thread-requests per instruction.
+    Stalled on L2 Data: The ratio of the number of cycles where the vL1D is stalled
+      waiting for requested data to return from the L2 cache divided by the number
+      of cycles where the vL1D is active.
+    Stalled on L2 Req: The ratio of the number of cycles where the vL1D is stalled
+      waiting to issue a request for data to the L2 cache divided by the number of
+      cycles where the vL1D is active.
+    Tag RAM Stall (Read): The ratio of the number of cycles where the vL1D is stalled
+      due to Read requests with conflicting tags being looked up concurrently, divided
+      by the number of cycles where the vL1D is active.
+    Tag RAM Stall (Write): The ratio of the number of cycles where the vL1D is stalled
+      due to Write requests with conflicting tags being looked up concurrently, divided
+      by the number of cycles where the vL1D is active.
+    Tag RAM Stall (Atomic): The ratio of the number of cycles where the vL1D is stalled
+      due to Atomic requests with conflicting tags being looked up concurrently, divided
+      by the number of cycles where the vL1D is active.
+    Total Req: The total number of incoming requests from the address processing unit
+      after coalescing.
+    Read Req: The total number of incoming read requests from the address processing
+      unit after coalescing per normalization unit.
+    Write Req: The total number of incoming write requests from the address processing
+      unit after coalescing per normalization unit.
+    Atomic Req: The total number of incoming atomic requests from the address processing
+      unit after coalescing per normalization unit.
+    Cache BW: The number of bytes looked up in the vL1D cache as a result of VMEM
+      instructions divided by total duration. The number of bytes is calculated as
+      the number of cache lines requested multiplied by the cache line size. This
+      value does not consider partial requests, so for instance, if only a single
+      value is requested in a cache line, the data movement will still be counted
+      as a full cache line.
+    Cache Hit Rate: The ratio of the number of vL1D cache line requests that hit in
+      vL1D cache over the total number of cache line requests to the vL1D Cache RAM.
+    Cache Accesses: The total number of cache line lookups in the vL1D.
+    Cache Hits: The number of cache accesses minus the number of outgoing requests
+      to the L2 cache, that is, the number of cache line requests serviced by the
+      vL1D Cache RAM per normalization unit.
+    Invalidations: The number of times the vL1D was issued a write-back invalidate
+      command during the kernel's execution per normalization unit. This may be triggered
+      by, for instance, the buffer_wbinvl1 instruction.
+    L1-L2 BW: The number of bytes transferred across the vL1D-L2 interface as a result
+      of VMEM instructions, divided by total duration. The number of bytes is calculated
+      as the number of cache lines requested multiplied by the cache line size. This
+      value does not consider partial requests, so for instance, if only a single
+      value is requested in a cache line, the data movement will still be counted
+      as a full cache line.
+    L1-L2 Read: The number of read requests for a vL1D cache line that were not satisfied
+      by the vL1D and must be retrieved from the L2 cache, per normalization
+      unit.
+    L1-L2 Write: The number of write requests to a vL1D cache line that were sent
+      through the vL1D to the L2 cache, per normalization unit.
+    L1-L2 Atomic: The number of atomic requests that are sent through the vL1D to
+      the L2 cache, per normalization unit. This includes requests for atomics with,
+      and without return.
+    NC - Read: Total read requests with NC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    UC - Read: Total read requests with UC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    CC - Read: Total read requests with CC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    RW - Read: Total read requests with RW mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    RW - Write: Total write requests with RW mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    NC - Write: Total write requests with NC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    UC - Write: Total write requests with UC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    CC - Write: Total write requests with CC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    NC - Atomic: Total atomic requests with NC mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    UC - Atomic: Total atomic requests with UC mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    CC - Atomic: Total atomic requests with CC mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    RW - Atomic: Total atomic requests with RW mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    Req: The number of translation requests made to the UTCL1 per normalization unit.
+    Hit Ratio: The ratio of the number of translation requests that hit in the UTCL1
+      divided by the total number of translation requests made to the UTCL1.
+    Hits: The number of translation requests that hit in the UTCL1, and could be reused,
+      per normalization unit.
+    Translation Misses: The total number of translation requests that missed in the
+      UTCL1 due to translation not being present in the cache, per normalization unit.
+    Permission Misses: "The total number of translation requests that missed in\
+      \ the UTCL1 due to a permission error, per normalization unit. This is\
+      \ unused and expected to be zero in most configurations for modern\
+      \ CDNA\u2122 accelerators."
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1700_l2_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1700_l2_cache.yaml
index 6390ed1eaf..23e277a9a2 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1700_l2_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1700_l2_cache.yaml
@@ -2,218 +2,6 @@
 Panel Config:
   id: 1700
   title: L2 Cache
-  metrics_description:
-    Utilization: The ratio of the number of cycles an L2 channel was active, summed
-      over all L2 channels on the accelerator over the total L2 cycles.
-    Peak Bandwidth: The number of bytes looked up in the L2 cache, as a percent of
-      the peak theoretical bandwidth achievable on the specific accelerator. The number
-      of bytes is calculated as the number of cache lines requested multiplied by
-      the cache line size. This value does not consider partial requests, so e.g.,
-      if only a single value is requested in a cache line, the data movement will
-      still be counted as a full cache line.
-    Hit Rate: The ratio of the number of L2 cache line requests that hit in the L2
-      cache over the total number of incoming cache line requests to the L2 cache.
-    L2-Fabric Read BW: The number of bytes read by the L2 over the Infinity Fabric
-      interface per unit time.
-    L2-Fabric Write and Atomic BW: The number of bytes sent by the L2 over the Infinity
-      Fabric interface by write and atomic operations per unit time.
-    HBM Bandwidth: Maximum theoretical bandwidth of the accelerator's local high-bandwidth
-      memory (HBM) per unit time. This value is calculated as the number of HBM channels
-      multiplied by the HBM channel width multiplied by the HBM clock frequency.
-    Read BW: The total number of bytes read by the L2 cache from Infinity Fabric divided
-      by total duration.
-    HBM Read Traffic: The percent of read requests generated by the L2 cache that
-      are routed to the accelerator's local high-bandwidth memory (HBM). This breakdown
-      does not consider the size of the request (meaning that 32B and 64B requests
-      are both counted as a single request), so this metric only approximates the
-      percent of the L2-Fabric Read bandwidth directed to the local HBM.
-    Remote Read Traffic: The percent of read requests generated by the L2 cache that
-      are routed to any memory location other than the accelerator's local high-bandwidth
-      memory (HBM) - for example, the CPU's DRAM or a remote accelerator's HBM. This
-      breakdown does not consider the size of the request (meaning that 32B and 64B
-      requests are both counted as a single request), so this metric only approximates
-      the percent of the L2-Fabric Read bandwidth directed to a remote location.
-    Uncached Read Traffic: The percent of read requests generated by the L2 cache
-      that are reading from an uncached memory allocation. Note, as described in the
-      request flow section, a single 64B read request is typically counted as two
-      uncached read requests. So, it is possible for the Uncached Read Traffic to
-      reach up to 200% of the total number of read requests. This breakdown does not
-      consider the size of the request (i.e., 32B and 64B requests are both counted
-      as a single request), so this metric only approximates the percent of the L2-Fabric
-      read bandwidth directed to an uncached memory location.
-    Write and Atomic BW: The total number of bytes written by the L2 over Infinity
-      Fabric by write and atomic operations divided by total duration. Note that on
-      current CDNA accelerators, such as the MI2XX, requests are only considered atomic
-      by Infinity Fabric if they are targeted at non-write-cacheable memory, for example,
-      fine-grained memory allocations or uncached memory allocations on the MI2XX.
-    HBM Write and Atomic Traffic: The percent of write and atomic requests generated
-      by the L2 cache that are routed to the accelerator's local high-bandwidth memory
-      (HBM). This breakdown does not consider the size of the request (meaning that
-      32B and 64B requests are both counted as a single request), so this metric only
-      approximates the percent of the L2-Fabric Write and Atomic bandwidth directed
-      to the local HBM. Note that on current CDNA accelerators, such as the MI2XX,
-      requests are only considered atomic by Infinity Fabric if they are targeted
-      at fine-grained memory allocations or uncached memory allocations.
-    Remote Write and Atomic Traffic: The percent of read requests generated by the
-      L2 cache that are routed to any memory location other than the accelerator's
-      local high-bandwidth memory (HBM) - for example, the CPU's DRAM or a remote
-      accelerator's HBM. This breakdown does not consider the size of the request
-      (meaning that 32B and 64B requests are both counted as a single request), so
-      this metric only approximates the percent of the L2-Fabric Read bandwidth directed
-      to a remote location. Note that on current CDNA accelerators, such as the MI2XX,
-      requests are only considered atomic by Infinity Fabric if they are targeted
-      at fine-grained memory allocations or uncached memory allocations.
-    Atomic Traffic: The percent of write requests generated by the L2 cache that are
-      atomic requests to any memory location. This breakdown does not consider the
-      size of the request (meaning that 32B and 64B requests are both counted as a
-      single request), so this metric only approximates the percent of the L2-Fabric
-      Read bandwidth directed to a remote location. Note that on current CDNA accelerators,
-      such as the MI2XX, requests are only considered atomic by Infinity Fabric if
-      they are targeted at fine-grained memory allocations or uncached memory allocations.
-    Uncached Write and Atomic Traffic: The percent of write and atomic requests generated
-      by the L2 cache that are targeting uncached memory allocations. This breakdown
-      does not consider the size of the request (meaning that 32B and 64B requests
-      are both counted as a single request), so this metric only approximates the
-      percent of the L2-Fabric read bandwidth directed to uncached memory allocations.
-    Read Latency: The time-averaged number of cycles read requests spent in Infinity
-      Fabric before data was returned to the L2.
-    Write and Atomic Latency: The time-averaged number of cycles write requests spent
-      in Infinity Fabric before a completion acknowledgement was returned to the L2.
-    Atomic Latency: The time-averaged number of cycles atomic requests spent in Infinity
-      Fabric before a completion acknowledgement (atomic without return value) or
-      data (atomic with return value) was returned to the L2.
-    Bandwidth: The number of bytes looked up in the L2 cache, divided by total duration.
-      The number of bytes is calculated as the number of cache lines requested multiplied
-      by the cache line size. This value does not consider partial requests, so for
-      example, if only a single value is requested in a cache line, the data movement
-      will still be counted as a full cache line.
-    Read Bandwidth: Total number of bytes looked up in the L2 cache for read requests,
-      divided by total duration.
-    Write Bandwidth: Total number of bytes looked up in the L2 cache for write requests,
-      divided by total duration.
-    Atomic Bandwidth: Total number of bytes looked up in the L2 cache for atomic requests,
-      divided by total duration.
-    Req: The total number of incoming requests to the L2 from all clients for all
-      request types, per normalization unit.
-    Read Req: The total number of read requests to the L2 from all clients.
-    Write Req: The total number of write requests to the L2 from all clients.
-    Atomic Req: The total number of atomic requests (with and without return) to the
-      L2 from all clients.
-    Streaming Req: The total number of incoming requests to the L2 that are marked
-      as streaming. The exact meaning of this may differ depending on the targeted
-      accelerator, however on an MI2XX this corresponds to non-temporal load or stores.
-      The L2 cache attempts to evict streaming requests before normal requests when
-      the L2 is at capacity.
-    Probe Req: The number of coherence probe requests made to the L2 cache from outside
-      the accelerator. On an MI2XX, probe requests may be generated by, for example,
-      writes to fine-grained device memory or by writes to coarse-grained device memory.
-    Cache Hit: The ratio of the number of L2 cache line requests that hit in the L2
-      cache over the total number of incoming cache line requests to the L2 cache.
-    Hits: The total number of requests to the L2 from all clients that hit in the
-      cache. As noted in the Speed-of-Light section, this includes hit-on-miss requests.
-    Misses: The total number of requests to the L2 from all clients that miss in the
-      cache. As noted in the Speed-of-Light section, these do not include hit-on-miss
-      requests.
-    Writeback: The total number of L2 cache lines written back to memory for any reason.
-      Write-backs may occur due to user code (such as HIP kernel calls to _threadfence_system
-      or atomic built-ins) by the command processor's memory acquire/release fences,
-      or for other internal hardware reasons.
-    Writeback (Internal): The total number of L2 cache lines written back to memory
-      for internal hardware reasons, per normalization unit.
-    Writeback (vL1D Req): The total number of L2 cache lines written back to memory
-      due to requests initiated by the vL1D cache, per normalization unit.
-    Evict (Internal): The total number of L2 cache lines evicted from the cache due
-      to capacity limits, per normalization unit.
-    Evict (vL1D Req): The total number of L2 cache lines evicted from the cache due
-      to invalidation requests initiated by the vL1D cache, per normalization unit.
-    NC Req: The total number of requests to the L2 to Not-hardware-Coherent (NC) memory
-      allocations, per normalization unit.
-    UC Req: The total number of requests to the L2 that go to Uncached (UC) memory
-      allocations.
-    CC Req: The total number of requests to the L2 that go to Coherently Cacheable
-      (CC) memory allocations.
-    RW Req: The total number of requests to the L2 that go to Read-Write coherent
-      memory (RW) allocations.
-    Write - Credit Starvation: The number of cycles the L2-Fabric interface was stalled
-      on write or atomic requests to any memory location because too many write/atomic
-      requests were currently in flight, as a percent of the total active L2 cycles.
-    Read (32B): The total number of L2 requests to Infinity Fabric to read 32B of
-      data from any memory location, per normalization unit.
-    Read (64B): The total number of L2 requests to Infinity Fabric to read 64B of
-      data from any memory location, per normalization unit.
-    Read (Uncached): The total number of L2 requests to Infinity Fabric to read uncached
-      data from any memory location, per normalization unit. 64B requests for uncached
-      data are counted as two 32B uncached data requests.
-    HBM Read: The total number of L2 requests to Infinity Fabric to read 32B or 64B
-      of data from the accelerator's local HBM, per normalization unit.
-    Remote Read: The total number of L2 requests to Infinity Fabric to read 32B or
-      64B of data from any source other than the accelerator's local HBM, per normalization
-      unit.
-    Read Bandwidth - PCIe: Total number of bytes due to L2 read requests due to PCIe
-      traffic, divided by total duration.
-    "Read Bandwidth - Infinity Fabric\u2122": Total number of bytes due to L2 read
-      requests due to Infinity Fabric traffic, divided by total duration.
-    Read Bandwidth - HBM: Total number of bytes due to L2 read requests due to HBM
-      traffic, divided by total duration.
-    Write and Atomic (32B): The total number of L2 requests to Infinity Fabric to
-      write or atomically update 32B of data to any memory location, per normalization
-      unit.
-    Write and Atomic (Uncached): The total number of L2 requests to Infinity Fabric
-      to write or atomically update 32B or 64B of uncached data, per normalization
-      unit.
-    Write and Atomic (64B): The total number of L2 requests to Infinity Fabric to
-      write or atomically update 64B of data in any memory location, per normalization
-      unit.
-    HBM Write and Atomic: The total number of L2 requests to Infinity Fabric to write
-      or atomically update 32B or 64B of data in the accelerator's local HBM, per
-      normalization unit.
-    Remote Write and Atomic: The total number of L2 requests to Infinity Fabric to
-      write or atomically update 32B or 64B of data in any memory location other than
-      the accelerator's local HBM, per normalization unit.
-    Write Bandwidth - PCIe: Total number of bytes due to L2 write requests due to
-      PCIe traffic, divided by total duration.
-    "Write Bandwidth - Infinity Fabric\u2122": Total number of bytes due to L2 write
-      requests due to Infinity Fabric traffic, divided by total duration.
-    Write Bandwidth - HBM: Total number of bytes due to L2 write requests due to HBM
-      traffic, divided by total duration.
-    Atomic Bandwidth - PCIe: Total number of bytes due to L2 atomic requests due to
-      PCIe traffic, divided by total duration.
-    "Atomic Bandwidth - Infinity Fabric\u2122": Total number of bytes due to L2 atomic
-      requests due to Infinity Fabric traffic, divided by total duration.
-    Atomic Bandwidth - HBM: Total number of bytes due to L2 atomic requests due to
-      HBM traffic, divided by total duration.
-    Atomic: The total number of L2 requests to Infinity Fabric to atomically update
-      32B or 64B of data in any memory location, per normalization unit. See Request
-      flow for more detail. Note that on current CDNA accelerators, such as the MI2XX,
-      requests are only considered atomic by Infinity Fabric if they are targeted
-      at non-write-cacheable memory, such as fine-grained memory allocations or uncached
-      memory allocations on the MI2XX.
-    Read Stall: "The ratio of the total number of cycles the L2-Fabric interface was\
-      \ stalled on a read request to any destination (local HBM, remote PCIe\xAE connected\
-      \ accelerator or CPU, or remote Infinity Fabric connected accelerator or CPU)\
-      \ over the total active L2 cycles."
-    Write Stall: The ratio of the total number of cycles the L2-Fabric interface was
-      stalled on a write or atomic request to any destination (local HBM, remote accelerator
-      or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected
-      accelerator or CPU) over the total active L2 cycles.
-    Read - PCIe Stall: The number of cycles the L2-Fabric interface was stalled on
-      read requests to remote PCIe connected accelerators or CPUs as a percent of
-      the total active L2 cycles.
-    Read - Infinity Fabric Stall: The number of cycles the L2-Fabric interface was
-      stalled on read requests to remote Infinity Fabric connected accelerators or
-      CPUs as a percent of the total active L2 cycles.
-    Read - HBM Stall: The number of cycles the L2-Fabric interface was stalled on
-      read requests to the accelerator's local HBM as a percent of the total active
-      L2 cycles.
-    Write - PCIe Stall: The number of cycles the L2-Fabric interface was stalled on
-      write or atomic requests to remote PCIe connected accelerators or CPUs as a
-      percent of the total active L2 cycles.
-    Write - Infinity Fabric Stall: The number of cycles the L2-Fabric interface was
-      stalled on write or atomic requests to remote Infinity Fabric connected accelerators
-      or CPUs as a percent of the total active L2 cycles.
-    Write - HBM Stall: The number of cycles the L2-Fabric interface was stalled on
-      write or atomic requests to accelerator's local HBM as a percent of the total
-      active L2 cycles.
   data source:
   - metric_table:
       id: 1701
@@ -370,32 +158,32 @@ Panel Config:
           avg: AVG((TCC_REQ_sum / $denom))
           min: MIN((TCC_REQ_sum / $denom))
           max: MAX((TCC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req:
           avg: AVG((TCC_READ_sum / $denom))
           min: MIN((TCC_READ_sum / $denom))
           max: MAX((TCC_READ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Write Req:
           avg: AVG((TCC_WRITE_sum / $denom))
           min: MIN((TCC_WRITE_sum / $denom))
           max: MAX((TCC_WRITE_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Atomic Req:
           avg: AVG((TCC_ATOMIC_sum / $denom))
           min: MIN((TCC_ATOMIC_sum / $denom))
           max: MAX((TCC_ATOMIC_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Streaming Req:
           avg: AVG((TCC_STREAMING_REQ_sum / $denom))
           min: MIN((TCC_STREAMING_REQ_sum / $denom))
           max: MAX((TCC_STREAMING_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Probe Req:
           avg: AVG((TCC_PROBE_sum / $denom))
           min: MIN((TCC_PROBE_sum / $denom))
           max: MAX((TCC_PROBE_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Cache Hit:
           avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
             + TCC_MISS_sum) != 0) else None))
@@ -408,17 +196,17 @@ Panel Config:
           avg: AVG((TCC_HIT_sum / $denom))
           min: MIN((TCC_HIT_sum / $denom))
           max: MAX((TCC_HIT_sum / $denom))
-          unit: (Hits  + $normUnit)
+          unit: (Hits + $normUnit)
         Misses:
           avg: AVG((TCC_MISS_sum / $denom))
           min: MIN((TCC_MISS_sum / $denom))
           max: MAX((TCC_MISS_sum / $denom))
-          unit: (Misses  + $normUnit)
+          unit: (Misses + $normUnit)
         Writeback:
           avg: AVG((TCC_WRITEBACK_sum / $denom))
           min: MIN((TCC_WRITEBACK_sum / $denom))
           max: MAX((TCC_WRITEBACK_sum / $denom))
-          unit: (Cachelines  + $normUnit)
+          unit: (Cachelines + $normUnit)
         Writeback (Internal):
           avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom))
           min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom))
@@ -443,22 +231,22 @@ Panel Config:
           avg: AVG((TCC_NC_REQ_sum / $denom))
           min: MIN((TCC_NC_REQ_sum / $denom))
           max: MAX((TCC_NC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         UC Req:
           avg: AVG((TCC_UC_REQ_sum / $denom))
           min: MIN((TCC_UC_REQ_sum / $denom))
           max: MAX((TCC_UC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         CC Req:
           avg: AVG((TCC_CC_REQ_sum / $denom))
           min: MIN((TCC_CC_REQ_sum / $denom))
           max: MAX((TCC_CC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         RW Req:
           avg: AVG((TCC_RW_REQ_sum / $denom))
           min: MIN((TCC_RW_REQ_sum / $denom))
           max: MAX((TCC_RW_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
   - metric_table:
       id: 1704
       title: L2 Cache Stalls
@@ -507,54 +295,216 @@ Panel Config:
           avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom))
           min: MIN((TCC_EA0_RDREQ_32B_sum / $denom))
           max: MAX((TCC_EA0_RDREQ_32B_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read (64B):
           avg: AVG(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
           min: MIN(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
           max: MAX(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read (Uncached):
           avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
           min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
           max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         HBM Read:
           avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom))
           min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom))
           max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Remote Read:
           avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
           min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
           max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Write and Atomic (32B):
           avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
           min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
           max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Write and Atomic (Uncached):
           avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
           min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
           max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Write and Atomic (64B):
           avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom))
           min: MIN((TCC_EA0_WRREQ_64B_sum / $denom))
           max: MAX((TCC_EA0_WRREQ_64B_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         HBM Write and Atomic:
           avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom))
           min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom))
           max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Remote Write and Atomic:
           avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
           min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
           max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Atomic:
           avg: AVG((TCC_EA0_ATOMIC_sum / $denom))
           min: MIN((TCC_EA0_ATOMIC_sum / $denom))
           max: MAX((TCC_EA0_ATOMIC_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
+  metrics_description:
+    Utilization: The ratio of the number of cycles an L2 channel was active, summed
+      over all L2 channels on the accelerator over the total L2 cycles.
+    Peak Bandwidth: The number of bytes looked up in the L2 cache, as a percent of
+      the peak theoretical bandwidth achievable on the specific accelerator. The number
+      of bytes is calculated as the number of cache lines requested multiplied by
+      the cache line size. This value does not consider partial requests, so e.g.,
+      if only a single value is requested in a cache line, the data movement will
+      still be counted as a full cache line.
+    Hit Rate: The ratio of the number of L2 cache line requests that hit in the L2
+      cache over the total number of incoming cache line requests to the L2 cache.
+    L2-Fabric Read BW: The number of bytes read by the L2 over the Infinity Fabric
+      interface per unit time.
+    L2-Fabric Write and Atomic BW: The number of bytes sent by the L2 over the Infinity
+      Fabric interface by write and atomic operations per unit time.
+    HBM Bandwidth: Maximum theoretical bandwidth of the accelerator's local high-bandwidth
+      memory (HBM) per unit time. This value is calculated as the number of HBM channels
+      multiplied by the HBM channel width multiplied by the HBM clock frequency.
+    Read BW: The total number of bytes read by the L2 cache from Infinity Fabric divided
+      by total duration.
+    HBM Read Traffic: The percent of read requests generated by the L2 cache that
+      are routed to the accelerator's local high-bandwidth memory (HBM). This breakdown
+      does not consider the size of the request (meaning that 32B and 64B requests
+      are both counted as a single request), so this metric only approximates the
+      percent of the L2-Fabric Read bandwidth directed to the local HBM.
+    Remote Read Traffic: The percent of read requests generated by the L2 cache that
+      are routed to any memory location other than the accelerator's local high-bandwidth
+      memory (HBM) - for example, the CPU's DRAM or a remote accelerator's HBM. This
+      breakdown does not consider the size of the request (meaning that 32B and 64B
+      requests are both counted as a single request), so this metric only approximates
+      the percent of the L2-Fabric Read bandwidth directed to a remote location.
+    Uncached Read Traffic: The percent of read requests generated by the L2 cache
+      that are reading from an uncached memory allocation. Note, as described in the
+      request flow section, a single 64B read request is typically counted as two
+      uncached read requests. So, it is possible for the Uncached Read Traffic to
+      reach up to 200% of the total number of read requests. This breakdown does not
+      consider the size of the request (i.e., 32B and 64B requests are both counted
+      as a single request), so this metric only approximates the percent of the L2-Fabric
+      read bandwidth directed to an uncached memory location.
+    Write and Atomic BW: The total number of bytes written by the L2 over Infinity
+      Fabric by write and atomic operations divided by total duration. Note that on
+      current CDNA accelerators, such as the MI2XX, requests are only considered atomic
+      by Infinity Fabric if they are targeted at non-write-cacheable memory, for example,
+      fine-grained memory allocations or uncached memory allocations on the MI2XX.
+    HBM Write and Atomic Traffic: The percent of write and atomic requests generated
+      by the L2 cache that are routed to the accelerator's local high-bandwidth memory
+      (HBM). This breakdown does not consider the size of the request (meaning that
+      32B and 64B requests are both counted as a single request), so this metric only
+      approximates the percent of the L2-Fabric Write and Atomic bandwidth directed
+      to the local HBM. Note that on current CDNA accelerators, such as the MI2XX,
+      requests are only considered atomic by Infinity Fabric if they are targeted
+      at fine-grained memory allocations or uncached memory allocations.
+    Remote Write and Atomic Traffic: The percent of write and atomic requests generated
+      by the L2 cache that are routed to any memory location other than the accelerator's
+      local high-bandwidth memory (HBM) - for example, the CPU's DRAM or a remote
+      accelerator's HBM. This breakdown does not consider the size of the request
+      (meaning that 32B and 64B requests are both counted as a single request), so
+      this metric only approximates the percent of the L2-Fabric Write and Atomic
+      bandwidth directed to a remote location. Note that on current CDNA accelerators,
+      such as the MI2XX, requests are only considered atomic by Infinity Fabric if they
+      are targeted at fine-grained memory allocations or uncached memory allocations.
+    Atomic Traffic: The percent of write requests generated by the L2 cache that are
+      atomic requests to any memory location. This breakdown does not consider the
+      size of the request (meaning that 32B and 64B requests are both counted as a
+      single request), so this metric only approximates the percent of the L2-Fabric
+      Write and Atomic bandwidth used by atomic requests. Note that on current CDNA accelerators,
+      such as the MI2XX, requests are only considered atomic by Infinity Fabric if
+      they are targeted at fine-grained memory allocations or uncached memory allocations.
+    Uncached Write and Atomic Traffic: The percent of write and atomic requests generated
+      by the L2 cache that are targeting uncached memory allocations. This breakdown
+      does not consider the size of the request (meaning that 32B and 64B requests
+      are both counted as a single request), so this metric only approximates the
+      percent of the L2-Fabric Write and Atomic bandwidth directed to uncached memory allocations.
+    Read Latency: The time-averaged number of cycles read requests spent in Infinity
+      Fabric before data was returned to the L2.
+    Write and Atomic Latency: The time-averaged number of cycles write requests spent
+      in Infinity Fabric before a completion acknowledgement was returned to the L2.
+    Atomic Latency: The time-averaged number of cycles atomic requests spent in Infinity
+      Fabric before a completion acknowledgement (atomic without return value) or
+      data (atomic with return value) was returned to the L2.
+    Bandwidth: The number of bytes looked up in the L2 cache, divided by total duration.
+      The number of bytes is calculated as the number of cache lines requested multiplied
+      by the cache line size. This value does not consider partial requests, so for
+      example, if only a single value is requested in a cache line, the data movement
+      will still be counted as a full cache line.
+    Req: The total number of incoming requests to the L2 from all clients for all
+      request types, per normalization unit.
+    Read Req: The total number of read requests to the L2 from all clients.
+    Write Req: The total number of write requests to the L2 from all clients.
+    Atomic Req: The total number of atomic requests (with and without return) to the
+      L2 from all clients.
+    Streaming Req: The total number of incoming requests to the L2 that are marked
+      as streaming. The exact meaning of this may differ depending on the targeted
+      accelerator, however on an MI2XX this corresponds to non-temporal loads or stores.
+      The L2 cache attempts to evict streaming requests before normal requests when
+      the L2 is at capacity.
+    Probe Req: The number of coherence probe requests made to the L2 cache from outside
+      the accelerator. On an MI2XX, probe requests may be generated by, for example,
+      writes to fine-grained device memory or by writes to coarse-grained device memory.
+    Cache Hit: The ratio of the number of L2 cache line requests that hit in the L2
+      cache over the total number of incoming cache line requests to the L2 cache.
+    Hits: The total number of requests to the L2 from all clients that hit in the
+      cache. As noted in the Speed-of-Light section, this includes hit-on-miss requests.
+    Misses: The total number of requests to the L2 from all clients that miss in the
+      cache. As noted in the Speed-of-Light section, these do not include hit-on-miss
+      requests.
+    Writeback: The total number of L2 cache lines written back to memory for any reason.
+      Write-backs may occur due to user code (such as HIP kernel calls to __threadfence_system
+      or atomic built-ins), by the command processor's memory acquire/release fences,
+      or for other internal hardware reasons.
+    Writeback (Internal): The total number of L2 cache lines written back to memory
+      for internal hardware reasons, per normalization unit.
+    Writeback (vL1D Req): The total number of L2 cache lines written back to memory
+      due to requests initiated by the vL1D cache, per normalization unit.
+    Evict (Internal): The total number of L2 cache lines evicted from the cache due
+      to capacity limits, per normalization unit.
+    Evict (vL1D Req): The total number of L2 cache lines evicted from the cache due
+      to invalidation requests initiated by the vL1D cache, per normalization unit.
+    NC Req: The total number of requests to the L2 to Not-hardware-Coherent (NC) memory
+      allocations, per normalization unit.
+    UC Req: The total number of requests to the L2 that go to Uncached (UC) memory
+      allocations.
+    CC Req: The total number of requests to the L2 that go to Coherently Cacheable
+      (CC) memory allocations.
+    RW Req: The total number of requests to the L2 that go to Read-Write coherent
+      memory (RW) allocations.
+    Write - Credit Starvation: The number of cycles the L2-Fabric interface was stalled
+      on write or atomic requests to any memory location because too many write/atomic
+      requests were currently in flight, as a percent of the total active L2 cycles.
+    Read (32B): The total number of L2 requests to Infinity Fabric to read 32B of
+      data from any memory location, per normalization unit.
+    Read (64B): The total number of L2 requests to Infinity Fabric to read 64B of
+      data from any memory location, per normalization unit.
+    Read (Uncached): The total number of L2 requests to Infinity Fabric to read uncached
+      data from any memory location, per normalization unit. 64B requests for uncached
+      data are counted as two 32B uncached data requests.
+    HBM Read: The total number of L2 requests to Infinity Fabric to read 32B or 64B
+      of data from the accelerator's local HBM, per normalization unit.
+    Remote Read: The total number of L2 requests to Infinity Fabric to read 32B or
+      64B of data from any source other than the accelerator's local HBM, per normalization
+      unit.
+    Write and Atomic (32B): The total number of L2 requests to Infinity Fabric to
+      write or atomically update 32B of data to any memory location, per normalization
+      unit.
+    Write and Atomic (Uncached): The total number of L2 requests to Infinity Fabric
+      to write or atomically update 32B or 64B of uncached data, per normalization
+      unit.
+    Write and Atomic (64B): The total number of L2 requests to Infinity Fabric to
+      write or atomically update 64B of data in any memory location, per normalization
+      unit.
+    HBM Write and Atomic: The total number of L2 requests to Infinity Fabric to write
+      or atomically update 32B or 64B of data in the accelerator's local HBM, per
+      normalization unit.
+    Remote Write and Atomic: The total number of L2 requests to Infinity Fabric to
+      write or atomically update 32B or 64B of data in any memory location other than
+      the accelerator's local HBM, per normalization unit.
+    Atomic: The total number of L2 requests to Infinity Fabric to atomically update
+      32B or 64B of data in any memory location, per normalization unit. See Request
+      flow for more detail. Note that on current CDNA accelerators, such as the MI2XX,
+      requests are only considered atomic by Infinity Fabric if they are targeted
+      at non-write-cacheable memory, such as fine-grained memory allocations or uncached
+      memory allocations on the MI2XX.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1800_l2_cache_per_channel.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1800_l2_cache_per_channel.yaml
index 849662871e..75ce281b57 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1800_l2_cache_per_channel.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1800_l2_cache_per_channel.yaml
@@ -2,10 +2,6 @@
 Panel Config:
   id: 1800
   title: L2 Cache (per Channel)
-  metrics_description:
-    L2 Cache Hit Rate: The percent of total number of requests to the L2 from all
-      clients that hit in the cache. As noted in the Speed-of-Light section, this
-      includes hit-on-miss requests.
   data source:
   - metric_table:
       id: 1801
@@ -249,3 +245,7 @@ Panel Config:
           ::_1: $total_l2_chan
       cli_style: simple_box
       tui_style: simple_box
+  metrics_description:
+    L2 Cache Hit Rate: The percent of total number of requests to the L2 from all
+      clients that hit in the cache. As noted in the Speed-of-Light section, this
+      includes hit-on-miss requests.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/2100_pc_sampling.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/2100_pc_sampling.yaml
index e94471d7dc..16e4d01e7e 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/2100_pc_sampling.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/2100_pc_sampling.yaml
@@ -2,10 +2,10 @@
 Panel Config:
   id: 2100
   title: PC Sampling
-  metrics_description: {}
   data source:
   - pc_sampling_table:
       id: 2101
       title: PC Sampling
       source: ps_file
       comparable: false
+  metrics_description: {}
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/config_delta/gfx950_diff.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/config_delta/gfx950_diff.yaml
new file mode 100644
index 0000000000..5d64c7a5e0
--- /dev/null
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/config_delta/gfx950_diff.yaml
@@ -0,0 +1,763 @@
+# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated by tools/config_management/generate_config_deltas.py
+Addition:
+  - Panel Config:
+      id: 200
+      title: System Speed-of-Light
+    metric_tables:
+      - metric_table:
+          id: 201
+          title: System Speed-of-Light
+          metrics:
+            - MFMA FLOPs (F6F4):
+                value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp)))
+                unit: GFLOP/s
+                peak: ((($max_sclk * $cu_per_gpu) * 16834) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 16834) / 1000))
+  - Panel Config:
+      id: 300
+      title: Memory Chart
+    metric_tables:
+      - metric_table:
+          id: 301
+          title: Memory Chart
+          metrics:
+            - L2 Rd Lat:
+                value: |
+                  ROUND(AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else None)), 0)
+            - L2 Wr Lat:
+                value: |
+                  ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else None)), 0)
+  - Panel Config:
+      id: 400
+      title: Roofline
+    metric_tables:
+      - metric_table:
+          id: 401
+          title: Roofline Performance Rates
+          metrics:
+            - MFMA FLOPs (F6F4):
+                value: |
+                  AVG((((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
+                unit: GFLOP/s
+                peak: $MFMA_FLOPs_F6F4_empirical_peak
+  - Panel Config:
+      id: 500
+      title: Command Processor (CPC/CPF)
+    metric_tables:
+      - metric_table:
+          id: 502
+          title: Command processor packet processor (CPC)
+          metrics:
+            - CPC SYNC FIFO Full Rate:
+                avg: |
+                  AVG((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY != 0) else None)
+                min: |
+                  MIN((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY != 0) else None)
+                max: |
+                  MAX((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY != 0) else None)
+                unit: pct
+            - CPC ADC Utilization:
+                avg: AVG((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None)
+                min: MIN((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None)
+                max: MAX((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None)
+                unit: pct
+            - CPC CANE Stall Rate:
+                avg: AVG((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None)
+                min: MIN((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None)
+                max: MAX((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None)
+                unit: pct
+  - Panel Config:
+      id: 600
+      title: Workgroup Manager (SPI)
+    metric_tables:
+      - metric_table:
+          id: 601
+          title: Workgroup manager utilizations
+          metrics:
+            - Schedule-Pipe Wave Occupancy:
+                avg: |
+                  AVG(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY + SPI_CSQ_P3_OCCUPANCY)
+                min: |
+                  MIN(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY + SPI_CSQ_P3_OCCUPANCY)
+                max: |
+                  MAX(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY + SPI_CSQ_P3_OCCUPANCY)
+                unit: Wave
+            - Scheduler-Pipe Wave Utilization:
+                avg: |
+                  AVG(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
+                min: |
+                  MIN(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
+                max: |
+                  MAX(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
+                unit: Pct
+      - metric_table:
+          id: 602
+          title: Workgroup Manager - Resource Allocation
+          metrics:
+            - Scheduler-Pipe FIFO Full Rate:
+                avg: |
+                  AVG((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + SPI_CS2_CRAWLER_STALL + SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
+                min: |
+                  MIN((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + SPI_CS2_CRAWLER_STALL + SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
+                max: |
+                  MAX((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + SPI_CS2_CRAWLER_STALL + SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
+                unit: Pct
+  - Panel Config:
+      id: 1000
+      title: Compute Units - Instruction Mix
+    metric_tables:
+      - metric_table:
+          id: 1003
+          title: VMEM Instruction Mix
+          metrics:
+            - Spill/Stack Coalesceable Instr:
+                avg: AVG((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom))
+                min: MIN((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom))
+                max: MAX((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom))
+                unit: (instr + $normUnit)
+      - metric_table:
+          id: 1004
+          title: MFMA Arithmetic Instruction Mix
+          metrics:
+            - MFMA-F6F4:
+                avg: AVG((SQ_INSTS_VALU_MFMA_F6F4 / $denom))
+                min: MIN((SQ_INSTS_VALU_MFMA_F6F4 / $denom))
+                max: MAX((SQ_INSTS_VALU_MFMA_F6F4 / $denom))
+                unit: (instr + $normUnit)
+  - Panel Config:
+      id: 1100
+      title: Compute Units - Compute Pipeline
+    metric_tables:
+      - metric_table:
+          id: 1101
+          title: Compute Speed-of-Light
+          metrics:
+            - MFMA FLOPs (F6F4):
+                value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp)))
+                unit: GFLOP
+                peak: ((($max_sclk * $cu_per_gpu) * 16834) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 16834) / 1000))
+      - metric_table:
+          id: 1102
+          title: Pipeline Statistics
+          metrics:
+            - VALU Co-Issue Efficiency:
+                avg: AVG((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2))
+                min: MIN((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2))
+                max: MAX((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2))
+                unit: pct
+      - metric_table:
+          id: 1103
+          title: Arithmetic Operations
+          metrics:
+            - F6F4 OPs:
+                avg: AVG((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom)
+                min: MIN((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom)
+                max: MAX((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom)
+                unit: (OPs + $normUnit)
+  - Panel Config:
+      id: 1200
+      title: Local Data Share (LDS)
+    metric_tables:
+      - metric_table:
+          id: 1202
+          title: LDS Statistics
+          metrics:
+            - LDS ATOMIC Bandwidth:
+                avg: AVG(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                min: MIN(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                max: MAX(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                units: Gbps
+            - LDS LOAD:
+                avg: AVG((SQ_INSTS_LDS_LOAD / $denom))
+                min: MIN((SQ_INSTS_LDS_LOAD / $denom))
+                max: MAX((SQ_INSTS_LDS_LOAD / $denom))
+                unit: (instr + $normUnit)
+            - LDS STORE:
+                avg: AVG((SQ_INSTS_LDS_STORE / $denom))
+                min: MIN((SQ_INSTS_LDS_STORE / $denom))
+                max: MAX((SQ_INSTS_LDS_STORE / $denom))
+                unit: (instr + $normUnit)
+            - LDS STORE Bandwidth:
+                avg: AVG(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                min: MIN(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                max: MAX(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                units: Gbps
+            - LDS LOAD Bandwidth:
+                avg: AVG(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                min: MIN(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                max: MAX(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                units: Gbps
+            - LDS Command FIFO Full Rate:
+                avg: AVG((SQ_LDS_CMD_FIFO_FULL / $denom))
+                min: MIN((SQ_LDS_CMD_FIFO_FULL / $denom))
+                max: MAX((SQ_LDS_CMD_FIFO_FULL / $denom))
+                unit: (Cycles + $normUnit)
+            - LDS ATOMIC:
+                avg: AVG((SQ_INSTS_LDS_ATOMIC / $denom))
+                min: MIN((SQ_INSTS_LDS_ATOMIC / $denom))
+                max: MAX((SQ_INSTS_LDS_ATOMIC / $denom))
+                unit: (instr + $normUnit)
+            - LDS Data FIFO Full Rate:
+                avg: AVG((SQ_LDS_DATA_FIFO_FULL / $denom))
+                min: MIN((SQ_LDS_DATA_FIFO_FULL / $denom))
+                max: MAX((SQ_LDS_DATA_FIFO_FULL / $denom))
+                unit: (Cycles + $normUnit)
+  - Panel Config:
+      id: 1500
+      title: Address Processing Unit and Data Return Path (TA/TD)
+    metric_tables:
+      - metric_table:
+          id: 1504
+          title: Vector L1 data-return path or Texture Data (TD)
+          metrics:
+            - Write Ack Instructions:
+                avg: AVG((TD_WRITE_ACKT_WAVEFRONT_sum / $denom))
+                min: MIN((TD_WRITE_ACKT_WAVEFRONT_sum / $denom))
+                max: MAX((TD_WRITE_ACKT_WAVEFRONT_sum / $denom))
+                unit: (Instructions + $normUnit)
+      - metric_table:
+          id: 1502
+          title: Instruction counts
+          metrics:
+            - Global/Generic Read Instructions for LDS:
+                avg: AVG((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom))
+                min: MIN((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom))
+                max: MAX((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom))
+                unit: (Instructions + $normUnit)
+            - Spill/Stack Read Instructions for LDS:
+                avg: AVG((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom))
+                min: MIN((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom))
+                max: MAX((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom))
+                unit: (Instructions + $normUnit)
+  - Panel Config:
+      id: 1600
+      title: Vector L1 Data Cache
+    metric_tables:
+      - metric_table:
+          id: 1602
+          title: vL1D cache stall metrics
+          metrics:
+            - Stalled on Request FIFO:
+                expr: |
+                  (((100 * TCP_RFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None)
+            - Stalled on Latency FIFO:
+                expr: |
+                  (((100 * TCP_LFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None)
+            - Stalled on Address:
+                expr: |
+                  (((100 * TCP_TCP_TA_ADDR_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None)
+            - Stalled on Read Return:
+                expr: |
+                  (((100 * TCP_TCR_RDRET_STALL_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None)
+            - Stalled on Data:
+                expr: |
+                  (((100 * TCP_TCP_TA_DATA_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None)
+      - metric_table:
+          id: 1603
+          title: vL1D cache access metrics
+          metrics:
+            - Tag RAM 2 Req:
+                avg: AVG((TCP_TAGRAM2_REQ_sum / $denom))
+                min: MIN((TCP_TAGRAM2_REQ_sum / $denom))
+                max: MAX((TCP_TAGRAM2_REQ_sum / $denom))
+                unit: (Req + $normUnit)
+            - Tag RAM 0 Req:
+                avg: AVG((TCP_TAGRAM0_REQ_sum / $denom))
+                min: MIN((TCP_TAGRAM0_REQ_sum / $denom))
+                max: MAX((TCP_TAGRAM0_REQ_sum / $denom))
+                unit: (Req + $normUnit)
+            - Tag RAM 3 Req:
+                avg: AVG((TCP_TAGRAM3_REQ_sum / $denom))
+                min: MIN((TCP_TAGRAM3_REQ_sum / $denom))
+                max: MAX((TCP_TAGRAM3_REQ_sum / $denom))
+                unit: (Req + $normUnit)
+            - Tag RAM 1 Req:
+                avg: AVG((TCP_TAGRAM1_REQ_sum / $denom))
+                min: MIN((TCP_TAGRAM1_REQ_sum / $denom))
+                max: MAX((TCP_TAGRAM1_REQ_sum / $denom))
+                unit: (Req + $normUnit)
+            - L1 Access Latency:
+                avg: AVG((TCP_TCP_LATENCY_sum / $denom))
+                min: MIN((TCP_TCP_LATENCY_sum / $denom))
+                max: MAX((TCP_TCP_LATENCY_sum / $denom))
+                unit: (Cycles + $normUnit)
+            - L1-L2 Read Latency:
+                avg: AVG((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
+                min: MIN((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
+                max: MAX((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
+                unit: (Cycles + $normUnit)
+            - L1-L2 Write Latency:
+                avg: AVG((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
+                min: MIN((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
+                max: MAX((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
+                unit: (Cycles + $normUnit)
+      - metric_table:
+          id: 1605
+          title: L1 Unified Translation Cache (UTCL1)
+          metrics:
+            - Misses under Translation Miss:
+                avg: AVG((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom))
+                min: MIN((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom))
+                max: MAX((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom))
+                units: (Req + $normUnit)
+            - Inflight Req:
+                avg: AVG((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom))
+                min: MIN((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom))
+                max: MAX((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom))
+                units: (Req + $normUnit)
+      - metric_table:
+          id: 1606
+          title: L1D Addr Translation Stalls
+          metrics:
+            - Serialization Stall:
+                avg: AVG((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom))
+                min: MIN((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom))
+                max: MAX((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom))
+                units: (Cycles + $normUnit)
+            - Cache Full Stall:
+                avg: AVG((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom))
+                min: MIN((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom))
+                max: MAX((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom))
+                units: (Cycles + $normUnit)
+            - Resident Page Full Stall:
+                avg: AVG((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom))
+                min: MIN((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom))
+                max: MAX((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom))
+                units: (Cycles + $normUnit)
+            - UTCL2 Stall:
+                avg: AVG((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom))
+                min: MIN((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom))
+                max: MAX((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom))
+                units: (Cycles + $normUnit)
+            - Latency FIFO Stall:
+                avg: AVG((TCP_UTCL1_LFIFO_FULL_sum / $denom))
+                min: MIN((TCP_UTCL1_LFIFO_FULL_sum / $denom))
+                max: MAX((TCP_UTCL1_LFIFO_FULL_sum / $denom))
+                units: (Cycles + $normUnit)
+            - Thrashing Stall:
+                avg: AVG((TCP_UTCL1_THRASHING_STALL_sum / $denom))
+                min: MIN((TCP_UTCL1_THRASHING_STALL_sum / $denom))
+                max: MAX((TCP_UTCL1_THRASHING_STALL_sum / $denom))
+                units: (Cycles + $normUnit)
+            - Cache Miss Stall:
+                avg: AVG((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom))
+                min: MIN((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom))
+                max: MAX((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom))
+                units: (Cycles + $normUnit)
+  - Panel Config:
+      id: 1700
+      title: L2 Cache
+    metric_tables:
+      - metric_table:
+          id: 1702
+          title: L2-Fabric interface metrics
+          metrics:
+            - Read Stall:
+                avg: |
+                  AVG((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+            - Write Stall:
+                avg: |
+                  AVG(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+      - metric_table:
+          id: 1703
+          title: L2 Cache Accesses
+          metrics:
+            - Atomic Bandwidth:
+                avg: AVG(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Input Buffer Req:
+                avg: AVG((TCC_IB_REQ_sum / $denom))
+                min: MIN((TCC_IB_REQ_sum / $denom))
+                max: MAX((TCC_IB_REQ_sum / $denom))
+                unit: (Req + $normUnit)
+            - Write Bandwidth:
+                avg: AVG(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Read Bandwidth:
+                avg: AVG(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Bypasss Req:
+                avg: AVG((TCC_BYPASS_REQ_sum / $denom))
+                min: MIN((TCC_BYPASS_REQ_sum / $denom))
+                max: MAX((TCC_BYPASS_REQ_sum / $denom))
+                unit: (Req + $normUnit)
+      - metric_table:
+          id: 1704
+          title: L2 Cache Stalls
+          metrics:
+            - Input Buffer Stalled on L2:
+                avg: AVG(TCC_IB_STALL_sum / $denom)
+                min: MIN(TCC_IB_STALL_sum / $denom)
+                max: MAX(TCC_IB_STALL_sum / $denom)
+                unit: (Cycles + $normUnit)
+            - Stalled on Latency FIFO:
+                avg: AVG(TCC_LATENCY_FIFO_FULL_sum / $denom)
+                min: MIN(TCC_LATENCY_FIFO_FULL_sum / $denom)
+                max: MAX(TCC_LATENCY_FIFO_FULL_sum / $denom)
+                unit: (Cycles + $normUnit)
+            - Stalled on Write Data FIFO:
+                avg: AVG(TCC_SRC_FIFO_FULL_sum / $denom)
+                min: MIN(TCC_SRC_FIFO_FULL_sum / $denom)
+                max: MAX(TCC_SRC_FIFO_FULL_sum / $denom)
+                unit: (Cycles + $normUnit)
+      - metric_table:
+          id: 1705
+          title: L2 - Fabric Interface stalls
+          metrics:
+            - Read - HBM Stall:
+                type: HBM Stall
+                transaction: Read
+                avg: |
+                  AVG(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+            - Read - Infinity Fabric Stall:
+                type: Infinity Fabric™ Stall
+                transaction: Read
+                avg: |
+                  AVG(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+            - Write - PCIe Stall:
+                type: PCIe Stall
+                transaction: Write
+                avg: |
+                  AVG(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+            - Read - PCIe Stall:
+                type: PCIe Stall
+                transaction: Read
+                avg: |
+                  AVG(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+            - Write - Infinity Fabric Stall:
+                type: Infinity Fabric™ Stall
+                transaction: Write
+                avg: |
+                  AVG(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+            - Write - HBM Stall:
+                type: HBM Stall
+                transaction: Write
+                avg: |
+                  AVG(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+      - metric_table:
+          id: 1706
+          title: L2 - Fabric interface detailed metrics
+          metrics:
+            - Write Bandwidth - HBM:
+                avg: AVG(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Read (128B):
+                avg: AVG((TCC_EA0_RDREQ_128B_sum / $denom))
+                min: MIN((TCC_EA0_RDREQ_128B_sum / $denom))
+                max: MAX((TCC_EA0_RDREQ_128B_sum / $denom))
+                unit: (Req + $normUnit)
+            - Atomic - HBM:
+                avg: AVG((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom))
+                min: MIN((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom))
+                max: MAX((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom))
+                unit: (Req + $normUnit)
+            - Read Bandwidth - PCIe:
+                avg: AVG(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Atomic Bandwidth - HBM:
+                avg: AVG(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Read Bandwidth - Infinity Fabric™:
+                avg: AVG(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Write Bandwidth - PCIe:
+                avg: AVG(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Atomic Bandwidth - PCIe:
+                avg: AVG(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Write Bandwidth - Infinity Fabric™:
+                avg: AVG(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Atomic Bandwidth - Infinity Fabric™:
+                avg: AVG(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Read Bandwidth - HBM:
+                avg: AVG(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+
+Deletion:
+  []
+
+Modification:
+  - Panel Config:
+      id: 200
+      title: System Speed-of-Light
+    metric_tables:
+      - metric_table:
+          id: 201
+          title: System Speed-of-Light
+          metrics:
+            - MFMA IOPs (Int8):
+                peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
+            - MFMA FLOPs (F16):
+                peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
+            - MFMA FLOPs (F8):
+                peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
+                unit: GFLOP/s
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
+            - MFMA FLOPs (F64):
+                peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000))
+            - MFMA FLOPs (BF16):
+                peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
+  - Panel Config:
+      id: 300
+      title: Memory Chart
+    metric_tables:
+      - metric_table:
+          id: 301
+          title: Memory Chart
+          metrics:
+            - Workgroups:
+                value: |
+                  ROUND(AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS), 0)
+            - Wavefronts:
+                value: ROUND(AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE), 0)
+  - Panel Config:
+      id: 400
+      title: Roofline
+    metric_tables:
+      - metric_table:
+          id: 402
+          title: Roofline Plot Points
+          metrics:
+            - AI L2:
+                value: |
+                  ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64 ) )
+            - AI HBM:
+                value: |
+                  ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) )
+            - AI L1:
+                value: |
+                  ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64) )
+            - Performance (GFLOPs):
+                value: |
+                  ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / (SUM(End_Timestamp - Start_Timestamp) / 1e9) ) / 1e9
+  - Panel Config:
+      id: 600
+      title: Workgroup Manager (SPI)
+    metric_tables:
+      - metric_table:
+          id: 601
+          title: Workgroup manager utilizations
+          metrics:
+            - Dispatched Workgroups:
+                max: |
+                  MAX(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
+                avg: |
+                  AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
+                min: |
+                  MIN(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
+            - VGPR Writes:
+                max: |
+                  MAX((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+                avg: |
+                  AVG((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+                min: |
+                  MIN((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+            - Scheduler-Pipe Utilization:
+                max: |
+                  MAX(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
+                avg: |
+                  AVG(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
+                min: |
+                  MIN(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
+            - SGPR Writes:
+                max: |
+                  MAX((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+                avg: |
+                  AVG((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+                min: |
+                  MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+            - Dispatched Wavefronts:
+                max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
+                avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
+                min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
+  - Panel Config:
+      id: 700
+      title: Wavefront
+    metric_tables:
+      - metric_table:
+          id: 701
+          title: Wavefront Launch Stats
+          metrics:
+            - Total Wavefronts:
+                max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
+                avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
+                min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
+  - Panel Config:
+      id: 1100
+      title: Compute Units - Compute Pipeline
+    metric_tables:
+      - metric_table:
+          id: 1101
+          title: Compute Speed-of-Light
+          metrics:
+            - MFMA FLOPs (F8):
+                peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
+            - MFMA FLOPs (F64):
+                peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000))
+            - MFMA FLOPs (BF16):
+                peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
+            - MFMA IOPs (INT8):
+                peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
+            - MFMA FLOPs (F16):
+                peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
+      - metric_table:
+          id: 1103
+          title: Arithmetic Operations
+          metrics:
+            - FLOPs (Total):
+                max: |
+                  MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom))
+                avg: |
+                  AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom))
+                min: |
+                  MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom))
+  - Panel Config:
+      id: 1700
+      title: L2 Cache
+    metric_tables:
+      - metric_table:
+          id: 1701
+          title: L2 Speed-of-Light
+          metrics:
+            - L2-Fabric Read BW:
+                value: |
+                  AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
+      - metric_table:
+          id: 1702
+          title: L2-Fabric interface metrics
+          metrics:
+            - Read BW:
+                max: |
+                  MAX((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
+                avg: |
+                  AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
+                min: |
+                  MIN((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
+            - Remote Read Traffic:
+                max: |
+                  MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
+                avg: |
+                  AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
+                min: |
+                  MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
+      - metric_table:
+          id: 1706
+          title: L2 - Fabric interface detailed metrics
+          metrics:
+            - HBM Write and Atomic:
+                max: MAX((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
+                avg: AVG((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
+                min: MIN((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
+            - Read (64B):
+                max: MAX((TCC_EA0_RDREQ_64B_sum / $denom))
+                avg: AVG((TCC_EA0_RDREQ_64B_sum / $denom))
+                min: MIN((TCC_EA0_RDREQ_64B_sum / $denom))
+  - Panel Config:
+      id: 1800
+      title: L2 Cache (per Channel)
+    metric_tables:
+      - metric_table:
+          id: 1809
+          title: L2-Fabric Read Stall (Cycles per normUnit)
+          metrics:
+            - ::_1:
+                ea read stall - pcie: AVG((TO_INT(TCC_EA0_RDREQ_IO_CREDIT_STALL[::_1]) / $denom))
+                ea read stall - hbm: AVG((TO_INT(TCC_EA0_RDREQ_DRAM_CREDIT_STALL[::_1]) / $denom))
+                ea read stall - if: AVG((TO_INT(TCC_EA0_RDREQ_GMI_CREDIT_STALL[::_1]) / $denom))
+      - metric_table:
+          id: 1810
+          title: L2-Fabric Write and Atomic Stall (Cycles per normUnit)
+          metrics:
+            - ::_1:
+                ea write stall - pcie: AVG((TO_INT(TCC_EA0_WRREQ_IO_CREDIT_STALL[::_1]) / $denom))
+                ea write stall - if: AVG((TO_INT(TCC_EA0_WRREQ_GMI_CREDIT_STALL[::_1]) / $denom))
+                ea write stall - hbm: AVG((TO_INT(TCC_EA0_WRREQ_DRAM_CREDIT_STALL[::_1]) / $denom))
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0000_top_stats.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0000_top_stats.yaml
index 55c6f6bb24..5ce5aeeb28 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0000_top_stats.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0000_top_stats.yaml
@@ -2,7 +2,6 @@
 Panel Config:
   id: 0
   title: Top Stats
-  metrics_description: {}
   data source:
   - raw_csv_table:
       id: 1
@@ -12,3 +11,4 @@ Panel Config:
       id: 2
       title: Dispatch List
       source: pmc_dispatch_info.csv
+  metrics_description: {}
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0100_system_info.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0100_system_info.yaml
index 23d024fde3..8b48c2253b 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0100_system_info.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0100_system_info.yaml
@@ -2,10 +2,10 @@
 Panel Config:
   id: 100
   title: System Info
-  metrics_description: {}
   data source:
   - raw_csv_table:
       id: 101
       title: System Info
       source: sysinfo.csv
       columnwise: true
+  metrics_description: {}
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0200_system_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0200_system_speed_of_light.yaml
index 722866f6e0..8aa72cb25d 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0200_system_speed_of_light.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0200_system_speed_of_light.yaml
@@ -2,124 +2,6 @@
 Panel Config:
   id: 200
   title: System Speed-of-Light
-  metrics_description:
-    VALU FLOPs: 'The total floating-point operations executed per second on the VALU.
-      This is also presented as a percent of the peak theoretical FLOPs achievable
-      on the specific accelerator. Note: this does not include any floating-point
-      operations from MFMA instructions.'
-    VALU IOPs: 'The total integer operations executed per second on the VALU. This
-      is also presented as a percent of the peak theoretical IOPs achievable on the
-      specific accelerator. Note: this does not include any integer operations from
-      MFMA instructions.'
-    MFMA FLOPs (F8): The total number of 8-bit brain floating point MFMA operations
-      executed per second. This does not include any 16-bit brain floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F8 MFMA operations achievable on the specific accelerator. It is supported on
-      AMD Instinct MI300 series and later only.
-    MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations
-      executed per second. Note: this does not include any 16-bit brain floating point
-      operations from VALU instructions. This is also presented as a percent of the
-      peak theoretical BF16 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed
-      per second. Note: this does not include any 16-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F16 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed
-      per second. Note: this does not include any 32-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F32 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed
-      per second. Note: this does not include any 64-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F64 MFMA operations achievable on the specific accelerator.'
-    MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed
-      per second. Note: this does not include any 8-bit integer operations from VALU
-      instructions. This is also presented as a percent of the peak theoretical INT8
-      MFMA operations achievable on the specific accelerator.'
-    Active CUs: Total number of active compute units (CUs) on the accelerator during
-      the kernel execution.
-    SALU Utilization: Indicates what percent of the kernel's duration the SALU was
-      busy executing instructions. Computed as the ratio of the total number of cycles
-      spent by the scheduler issuing SALU or SMEM instructions over the total CU cycles.
-    VALU Utilization: Indicates what percent of the kernel's duration the VALU was
-      busy executing instructions. Does not include VMEM operations. Computed as the
-      ratio of the total number of cycles spent by the scheduler issuing VALU instructions
-      over the total CU cycles.
-    MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit
-      was busy executing instructions. Computed as the ratio of the total number of
-      cycles the MFMA was busy over the total CU cycles.
-    VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit
-      was busy executing instructions, including both global/generic and spill/scratch
-      operations (see the VMEM instruction count metrics) for more detail). Does not
-      include VALU operations. Computed as the ratio of the total number of cycles
-      spent by the scheduler issuing VMEM instructions over the total CU cycles.
-    Branch Utilization: Indicates what percent of the kernel's duration the branch
-      unit was busy executing instructions. Computed as the ratio of the total number
-      of cycles spent by the scheduler issuing branch instructions over the total
-      CU cycles
-    VALU Active Threads: Indicates the average level of divergence within a wavefront
-      over the lifetime of the kernel. The number of work-items that were active in
-      a wavefront during execution of each VALU instruction, time-averaged over all
-      VALU instructions run on all wavefronts in the kernel.
-    IPC: The ratio of the total number of instructions executed on the CU over the
-      total active CU cycles. This is also presented as a percent of the peak theoretical
-      bandwidth achievable on the specific accelerator.
-    Wavefront Occupancy: 'The time-averaged number of wavefronts resident on the accelerator
-      over the lifetime of the kernel. Note: this metric may be inaccurate for short-running
-      kernels (less than 1ms). This is also presented as a percent of the peak theoretical
-      occupancy achievable on the specific accelerator.'
-    Theoretical LDS Bandwidth: Indicates the maximum amount of bytes that could have
-      been loaded from, stored to, or atomically updated in the LDS per unit time
-      (see LDS Bandwidth example for more detail). This is also presented as a percent
-      of the peak theoretical F64 MFMA operations achievable on the specific accelerator.
-    LDS Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS
-      scheduler due to bank conflicts (as determined by the conflict resolution hardware)
-      to the base number of cycles that would be spent in the LDS scheduler in a completely
-      uncontended case. This is also presented in normalized form (i.e., the Bank
-      Conflict Rate).
-    vL1D Cache Hit Rate: The ratio of the number of vL1D cache line requests that
-      hit in vL1D cache over the total number of cache line requests to the vL1D cache
-      RAM.
-    vL1D Cache BW: The number of bytes looked up in the vL1D cache as a result of
-      VMEM instructions per unit time. The number of bytes is calculated as the number
-      of cache lines requested multiplied by the cache line size. This value does
-      not consider partial requests, so e.g., if only a single value is requested
-      in a cache line, the data movement will still be counted as a full cache line.
-      This is also presented as a percent of the peak theoretical bandwidth achievable
-      on the specific accelerator.
-    L2 Cache Hit Rate: The ratio of the number of L2 cache line requests that hit
-      in the L2 cache over the total number of incoming cache line requests to the
-      L2 cache.
-    L2 Cache BW: The number of bytes looked up in the L2 cache per unit time. The
-      number of bytes is calculated as the number of cache lines requested multiplied
-      by the cache line size. This value does not consider partial requests, so e.g.,
-      if only a single value is requested in a cache line, the data movement will
-      still be counted as a full cache line. This is also presented as a percent of
-      the peak theoretical bandwidth achievable on the specific accelerator.
-    L2-Fabric Read BW: "The number of bytes read by the L2 over the Infinity Fabric\u2122\
-      \ interface per unit time. This is also presented as a percent of the peak theoretical\
-      \ bandwidth achievable on the specific accelerator."
-    L2-Fabric Write BW: The number of bytes sent by the L2 over the Infinity Fabric
-      interface by write and atomic operations per unit time. This is also presented
-      as a percent of the peak theoretical bandwidth achievable on the specific accelerator.
-    L2-Fabric Read Latency: The time-averaged number of cycles read requests spent
-      in Infinity Fabric before data was returned to the L2.
-    L2-Fabric Write Latency: The time-averaged number of cycles write requests spent
-      in Infinity Fabric before a completion acknowledgement was returned to the L2.
-    sL1D Cache Hit Rate: The percent of sL1D requests that hit on a previously loaded
-      line the cache. Calculated as the ratio of the number of sL1D requests that
-      hit over the number of all sL1D requests.
-    sL1D Cache BW: The number of bytes looked up in the sL1D cache per unit time.
-      This is also presented as a percent of the peak theoretical bandwidth achievable
-      on the specific accelerator.
-    L1I Hit Rate: The number of bytes looked up in the L1I cache per unit time. This
-      is also presented as a percent of the peak theoretical bandwidth achievable
-      on the specific accelerator.
-    L1I BW: The percent of L1I requests that hit on a previously loaded line the cache.
-      Calculated as the ratio of the number of L1I requests that hit over the number
-      of all L1I requests.
-    L1I Fetch Latency: The average number of cycles spent to fetch instructions to
-      a CU.
   data source:
   - metric_table:
       id: 201
@@ -344,3 +226,130 @@ Panel Config:
           peak: None
           pop: None
           coll_level: SQ_IFETCH_LEVEL
+  metrics_description:
+    VALU FLOPs: |-
+      The total floating-point operations executed per second on the VALU.
+      This is also presented as a percent of the peak theoretical FLOPs achievable
+      on the specific accelerator. Note: this does not include any floating-point
+      operations from MFMA instructions.
+    VALU IOPs: |-
+      The total integer operations executed per second on the VALU. This is
+      also presented as a percent of the peak theoretical IOPs achievable on the
+      specific accelerator. Note: this does not include any integer operations from
+      MFMA instructions.
+    MFMA FLOPs (F8): The total number of 8-bit floating point MFMA operations
+      executed per second. This does not include any 8-bit floating point operations
+      from VALU instructions. This is also presented as a percent of the peak theoretical
+      F8 MFMA operations achievable on the specific accelerator. It is supported on
+      AMD Instinct MI300 series and later only.
+    MFMA FLOPs (BF16): |-
+      The total number of 16-bit brain floating point MFMA operations executed
+      per second. Note: this does not include any 16-bit brain floating point operations
+      from VALU instructions. This is also presented as a percent of the peak theoretical
+      BF16 MFMA operations achievable on the specific accelerator.
+    MFMA FLOPs (F16): |-
+      The total number of 16-bit floating point MFMA operations executed per
+      second. Note: this does not include any 16-bit floating point operations from
+      VALU instructions. This is also presented as a percent of the peak theoretical
+      F16 MFMA operations achievable on the specific accelerator.
+    MFMA FLOPs (F32): |-
+      The total number of 32-bit floating point MFMA operations executed per
+      second. Note: this does not include any 32-bit floating point operations from
+      VALU instructions. This is also presented as a percent of the peak theoretical
+      F32 MFMA operations achievable on the specific accelerator.
+    MFMA FLOPs (F64): |-
+      The total number of 64-bit floating point MFMA operations executed per
+      second. Note: this does not include any 64-bit floating point operations from
+      VALU instructions. This is also presented as a percent of the peak theoretical
+      F64 MFMA operations achievable on the specific accelerator.
+    MFMA IOPs (Int8): |-
+      The total number of 8-bit integer MFMA operations executed per second.
+      Note: this does not include any 8-bit integer operations from VALU instructions.
+      This is also presented as a percent of the peak theoretical INT8 MFMA operations
+      achievable on the specific accelerator.
+    Active CUs: Total number of active compute units (CUs) on the accelerator during
+      the kernel execution.
+    SALU Utilization: Indicates what percent of the kernel's duration the SALU was
+      busy executing instructions. Computed as the ratio of the total number of cycles
+      spent by the scheduler issuing SALU or SMEM instructions over the total CU cycles.
+    VALU Utilization: Indicates what percent of the kernel's duration the VALU was
+      busy executing instructions. Does not include VMEM operations. Computed as the
+      ratio of the total number of cycles spent by the scheduler issuing VALU instructions
+      over the total CU cycles.
+    MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit
+      was busy executing instructions. Computed as the ratio of the total number of
+      cycles the MFMA was busy over the total CU cycles.
+    VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit
+      was busy executing instructions, including both global/generic and spill/scratch
+      operations (see the VMEM instruction count metrics for more detail). Does not
+      include VALU operations. Computed as the ratio of the total number of cycles
+      spent by the scheduler issuing VMEM instructions over the total CU cycles.
+    Branch Utilization: Indicates what percent of the kernel's duration the branch
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles spent by the scheduler issuing branch instructions over the total
+      CU cycles.
+    VALU Active Threads: Indicates the average level of divergence within a wavefront
+      over the lifetime of the kernel. The number of work-items that were active in
+      a wavefront during execution of each VALU instruction, time-averaged over all
+      VALU instructions run on all wavefronts in the kernel.
+    IPC: The ratio of the total number of instructions executed on the CU over the
+      total active CU cycles. This is also presented as a percent of the peak theoretical
+      bandwidth achievable on the specific accelerator.
+    Wavefront Occupancy: |-
+      The time-averaged number of wavefronts resident on the accelerator over
+      the lifetime of the kernel. Note: this metric may be inaccurate for short-running
+      kernels (less than 1ms). This is also presented as a percent of the peak theoretical
+      occupancy achievable on the specific accelerator.
+    Theoretical LDS Bandwidth: Indicates the maximum amount of bytes that could have
+      been loaded from, stored to, or atomically updated in the LDS per unit time
+      (see LDS Bandwidth example for more detail). This is also presented as a percent
+      of the peak theoretical bandwidth achievable on the specific accelerator.
+    LDS Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS
+      scheduler due to bank conflicts (as determined by the conflict resolution hardware)
+      to the base number of cycles that would be spent in the LDS scheduler in a completely
+      uncontended case. This is also presented in normalized form (i.e., the Bank
+      Conflict Rate).
+    vL1D Cache Hit Rate: The ratio of the number of vL1D cache line requests that
+      hit in vL1D cache over the total number of cache line requests to the vL1D cache
+      RAM.
+    vL1D Cache BW: The number of bytes looked up in the vL1D cache as a result of
+      VMEM instructions per unit time. The number of bytes is calculated as the number
+      of cache lines requested multiplied by the cache line size. This value does
+      not consider partial requests, so e.g., if only a single value is requested
+      in a cache line, the data movement will still be counted as a full cache line.
+      This is also presented as a percent of the peak theoretical bandwidth achievable
+      on the specific accelerator.
+    L2 Cache Hit Rate: The ratio of the number of L2 cache line requests that hit
+      in the L2 cache over the total number of incoming cache line requests to the
+      L2 cache.
+    L2 Cache BW: The number of bytes looked up in the L2 cache per unit time. The
+      number of bytes is calculated as the number of cache lines requested multiplied
+      by the cache line size. This value does not consider partial requests, so e.g.,
+      if only a single value is requested in a cache line, the data movement will
+      still be counted as a full cache line. This is also presented as a percent of
+      the peak theoretical bandwidth achievable on the specific accelerator.
+    L2-Fabric Read BW: |-
+      The number of bytes read by the L2 over the Infinity Fabric™ interface
+      per unit time. This is also presented as a percent of the peak theoretical
+      bandwidth achievable on the specific accelerator.
+    L2-Fabric Write BW: The number of bytes sent by the L2 over the Infinity Fabric
+      interface by write and atomic operations per unit time. This is also presented
+      as a percent of the peak theoretical bandwidth achievable on the specific accelerator.
+    L2-Fabric Read Latency: The time-averaged number of cycles read requests spent
+      in Infinity Fabric before data was returned to the L2.
+    L2-Fabric Write Latency: The time-averaged number of cycles write requests spent
+      in Infinity Fabric before a completion acknowledgement was returned to the L2.
+    sL1D Cache Hit Rate: The percent of sL1D requests that hit on a previously loaded
+      line in the cache. Calculated as the ratio of the number of sL1D requests that
+      hit over the number of all sL1D requests.
+    sL1D Cache BW: The number of bytes looked up in the sL1D cache per unit time.
+      This is also presented as a percent of the peak theoretical bandwidth achievable
+      on the specific accelerator.
+    L1I Hit Rate: The percent of L1I requests that hit on a previously loaded line
+      in the cache. Calculated as the ratio of the number of L1I requests that hit
+      over the number of all L1I requests.
+    L1I BW: The number of bytes looked up in the L1I cache per unit time. This is
+      also presented as a percent of the peak theoretical bandwidth achievable on
+      the specific accelerator.
+    L1I Fetch Latency: The average number of cycles spent to fetch instructions to
+      a CU.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0300_memory_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0300_memory_chart.yaml
index 03b5606ad7..b13053c1f7 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0300_memory_chart.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0300_memory_chart.yaml
@@ -2,122 +2,6 @@
 Panel Config:
   id: 300
   title: Memory Chart
-  metrics_description:
-    Wavefront Occupancy: Wavefronts per active CU.
-    Wave Life: Average number of cycles executing a wave.
-    SALU: Total Number of SALU (Scalar ALU) instructions issued per normalization
-      unit.
-    SMEM: Total number of SMEM (Scalar Memory Read) instructions issued normalization
-      unit.
-    VALU: The number of VALU (Vector ALU) instructions issued per normalization unit.
-    MFMA: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued per
-      normalization unit.
-    VMEM: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch
-      memory) per normalization unit.
-    LDS: The total number of LDS instructions (including, but not limited to, read/write/atomics
-      and HIP's __shfl instructions) executed per normalization unit.
-    GWS: Total number of GDS (global data sync) instructions issued per normalization
-      unit.
-    BR: Total number of BRANCH instructions issued per normalization unit.
-    Active CUs: Total number of active compute units (CUs) on the accelerator during
-      the kernel execution.
-    Num CUs: Total number of compute units (CUs) on the accelerator.
-    VGPR: 'The number of architected vector general-purpose registers allocated for
-      the kernel, see VALU. Note: this may not exactly match the number of VGPRs requested
-      by the compiler due to allocation granularity.'
-    SGPR: 'The number of scalar general-purpose registers allocated for the kernel,
-      see SALU. Note: this may not exactly match the number of SGPRs requested by
-      the compiler due to allocation granularity.'
-    LDS Allocation: 'The number of bytes of LDS memory (or, shared memory) allocated
-      for this kernel. Note: This may also be larger than what was requested at compile
-      time due to both allocation granularity and dynamic per-dispatch LDS allocations.'
-    Scratch Allocation: The number of bytes of scratch memory requested per work-item
-      for this kernel. Scratch memory is used for stack memory on the accelerator,
-      as well as for register spills and restores.
-    Wavefronts: The total number of wavefronts, summed over all workgroups, forming
-      this kernel launch.
-    Workgroups: The total number of workgroups forming this kernel launch.
-    LDS Req: The total number of LDS instructions (including, but not limited to,
-      read/write/atomics and HIP's __shfl instructions) executed per normalization
-      unit.
-    LDS Util: Indicates what percent of the kernel's duration the LDS was actively
-      executing instructions (including, but not limited to, load, store, atomic and
-      HIP's __shfl operations). Calculated as the ratio of the total number of cycles
-      LDS was active over the total CU cycles.
-    LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return
-      / acknowledgment) required for an LDS instruction to complete.
-    VL1 Rd: The total number of incoming read requests from the address processing
-      unit after coalescing per normalization unit
-    VL1 Wr: The total number of incoming write requests from the address processing
-      unit after coalescing per normalization unit
-    VL1 Atomic: The total number of incoming atomic requests from the address processing
-      unit after coalescing per normalization unit
-    VL1 Hit: The ratio of the number of vL1D cache line requests that hit in vL1D
-      cache over the total number of cache line requests to the vL1D Cache RAM.
-    VL1 Lat: Calculated as the average number of cycles that a vL1D cache line request
-      spent in the vL1D cache pipeline.
-    VL1 Coalesce: Indicates how well memory instructions were coalesced by the address
-      processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated
-      as the average number of thread-requests generated per instruction divided by
-      the ideal number of thread-requests per instruction.
-    VL1 Stall: The ratio of the number of cycles where the vL1D is stalled waiting
-      to issue a request for data to the L2 cache divided by the number of cycles
-      where the vL1D is active.
-    VL1_L2 Rd: The number of read requests for a vL1D cache line that were not satisfied
-      by the vL1D and must be retrieved from the to the L2 Cache per normalization
-      unit.
-    VL1_L2 Wr: The number of write requests to a vL1D cache line that were sent through
-      the vL1D to the L2 cache, per normalization unit.
-    VL1_L2 Atomic: The number of atomic requests that are sent through the vL1D to
-      the L2 cache, per normalization unit. This includes requests for atomics with,
-      and without return.
-    sL1D Rd: The total number of requests, of any size or type, made to the sL1D per
-      normalization unit.
-    sL1D Hit: The total number of sL1D requests that hit on a previously loaded cache
-      line, per normalization unit.
-    sL1D_L2 Rd: The total number of read requests from sL1D to the L2, per normalization
-      unit.
-    sL1D_L2 Wr: The total number of write requests from sL1D to the L2, per normalization
-      unit. Typically unused on current CDNA accelerators.
-    sL1D_L2 Atomic: The total number of atomic requests from sL1D to the L2, per normalization
-      unit. Typically unused on current CDNA accelerators.
-    IL1 Fetch: The total number of requests made to the L1I per normalization-unit.
-    IL1 Hit: The percent of L1I requests that hit on a previously loaded line the
-      cache. Calculated as the ratio of the number of L1I requests that hit over the
-      number of all L1I requests.
-    IL1 Lat: The average number of cycles spent to fetch instructions to a CU.
-    IL1_L2 Rd: The total number of requests across the L1I - L2 interface per normalization-unit.
-    L2 Rd: The total number of read requests to the L2 from all clients.
-    L2 Wr: The total number of write requests to the L2 from all clients.
-    L2 Atomic: The total number of atomic requests (with and without return) to the
-      L2 from all clients.
-    L2 Hit: The ratio of the number of L2 cache line requests that hit in the L2 cache
-      over the total number of incoming cache line requests to the L2 cache.
-    L2 Rd Lat: Calculated as the average number of cycles that the vL1D cache took
-      to issue and receive read requests from the L2 Cache. This number also includes
-      requests for atomics with return values.
-    L2 Wr Lat: Calculated as the average number of cycles that the vL1D cache took
-      to issue and receive acknowledgement of a write request to the L2 Cache. This
-      number also includes requests for atomics without return values.
-    Fabric_L2 Rd: Number of L2 cache - Infinity Fabric read requests (either 32-byte
-      or 64-byte) summed over TCC instances per normalization unit.
-    Fabric_L2 Wr: Number of L2 cache - Infinity Fabric write requests (either 32-byte
-      or 64-byte) summed over TCC instances per normalization unit.
-    Fabric_L2 Atomic: Number of L2 cache - Infinity Fabric write requests (either
-      32-byte or 64-byte) that are actually atomic requests summed over TCC instances
-      per normalization unit.
-    Fabric Rd Lat: The time-averaged number of cycles read requests spent in Infinity
-      Fabric before data was returned to the L2.
-    Fabric Wr Lat: The time-averaged number of cycles write requests spent in Infinity
-      Fabric before a completion acknowledgement was returned to the L2.
-    Fabric Atomic Lat: The time-averaged number of cycles atomic requests spent in
-      Infinity Fabric before a completion acknowledgement (atomic without return value)
-      or data (atomic with return value) was returned to the L2.
-    HBM Rd: The total number of L2 requests to Infinity Fabric to read 32B or 64B
-      of data from the accelerator's local HBM, per normalization unit.
-    HBM Wr: 'The total number of L2 requests to Infinity Fabric to write or atomically
-      update 32B or 64B of data in the accelerator''s local HBM, per normalization
-      unit. '
   data source:
   - metric_table:
       id: 301
@@ -244,13 +128,13 @@ Panel Config:
           value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0)
         Fabric Rd Lat:
           value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-            != 0) else  0)), 0)
+            != 0) else 0)), 0)
         Fabric Wr Lat:
           value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-            != 0) else  0)), 0)
+            != 0) else 0)), 0)
         Fabric Atomic Lat:
           value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
-            != 0) else  0)), 0)
+            != 0) else 0)), 0)
         HBM Rd:
           value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0)
         HBM Wr:
@@ -258,3 +142,117 @@ Panel Config:
       comparable: false
       cli_style: mem_chart
       tui_style: mem_chart
+  metrics_description:
+    Wavefront Occupancy: Wavefronts per active CU.
+    Wave Life: Average number of cycles executing a wave.
+    SALU: Total Number of SALU (Scalar ALU) instructions issued per normalization
+      unit.
+    SMEM: Total number of SMEM (Scalar Memory Read) instructions issued per normalization
+      unit.
+    VALU: The number of VALU (Vector ALU) instructions issued per normalization unit.
+    MFMA: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued per
+      normalization unit.
+    VMEM: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch
+      memory) per normalization unit.
+    LDS: The total number of LDS instructions (including, but not limited to, read/write/atomics
+      and HIP's __shfl instructions) executed per normalization unit.
+    GWS: Total number of GDS (global data sync) instructions issued per normalization
+      unit.
+    BR: Total number of BRANCH instructions issued per normalization unit.
+    Active CUs: Total number of active compute units (CUs) on the accelerator during
+      the kernel execution.
+    Num CUs: Total number of compute units (CUs) on the accelerator.
+    VGPR: |-
+      The number of architected vector general-purpose registers allocated
+      for the kernel, see VALU. Note: this may not exactly match the number of VGPRs
+      requested by the compiler due to allocation granularity.
+    SGPR: |-
+      The number of scalar general-purpose registers allocated for the kernel,
+      see SALU. Note: this may not exactly match the number of SGPRs requested by
+      the compiler due to allocation granularity.
+    LDS Allocation: |-
+      The number of bytes of LDS memory (or, shared memory) allocated for
+      this kernel. Note: This may also be larger than what was requested at compile
+      time due to both allocation granularity and dynamic per-dispatch LDS allocations.
+    Scratch Allocation: The number of bytes of scratch memory requested per work-item
+      for this kernel. Scratch memory is used for stack memory on the accelerator,
+      as well as for register spills and restores.
+    Wavefronts: The total number of wavefronts, summed over all workgroups, forming
+      this kernel launch.
+    Workgroups: The total number of workgroups forming this kernel launch.
+    LDS Req: The total number of LDS instructions (including, but not limited to,
+      read/write/atomics and HIP's __shfl instructions) executed per normalization
+      unit.
+    LDS Util: Indicates what percent of the kernel's duration the LDS was actively
+      executing instructions (including, but not limited to, load, store, atomic and
+      HIP's __shfl operations). Calculated as the ratio of the total number of cycles
+      LDS was active over the total CU cycles.
+    LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return
+      / acknowledgment) required for an LDS instruction to complete.
+    VL1 Rd: The total number of incoming read requests from the address processing
+      unit after coalescing per normalization unit.
+    VL1 Wr: The total number of incoming write requests from the address processing
+      unit after coalescing per normalization unit.
+    VL1 Atomic: The total number of incoming atomic requests from the address processing
+      unit after coalescing per normalization unit.
+    VL1 Hit: The ratio of the number of vL1D cache line requests that hit in vL1D
+      cache over the total number of cache line requests to the vL1D Cache RAM.
+    VL1 Lat: Calculated as the average number of cycles that a vL1D cache line request
+      spent in the vL1D cache pipeline.
+    VL1 Coalesce: Indicates how well memory instructions were coalesced by the address
+      processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated
+      as the average number of thread-requests generated per instruction divided by
+      the ideal number of thread-requests per instruction.
+    VL1 Stall: The ratio of the number of cycles where the vL1D is stalled waiting
+      to issue a request for data to the L2 cache divided by the number of cycles
+      where the vL1D is active.
+    VL1_L2 Rd: The number of read requests for a vL1D cache line that were not satisfied
+      by the vL1D and must be retrieved from the L2 cache, per normalization
+      unit.
+    VL1_L2 Wr: The number of write requests to a vL1D cache line that were sent through
+      the vL1D to the L2 cache, per normalization unit.
+    VL1_L2 Atomic: The number of atomic requests that are sent through the vL1D to
+      the L2 cache, per normalization unit. This includes requests for atomics with,
+      and without return.
+    sL1D Rd: The total number of requests, of any size or type, made to the sL1D per
+      normalization unit.
+    sL1D Hit: The total number of sL1D requests that hit on a previously loaded cache
+      line, per normalization unit.
+    sL1D_L2 Rd: The total number of read requests from sL1D to the L2, per normalization
+      unit.
+    sL1D_L2 Wr: The total number of write requests from sL1D to the L2, per normalization
+      unit. Typically unused on current CDNA accelerators.
+    sL1D_L2 Atomic: The total number of atomic requests from sL1D to the L2, per normalization
+      unit. Typically unused on current CDNA accelerators.
+    IL1 Fetch: The total number of requests made to the L1I per normalization-unit.
+    IL1 Hit: The percent of L1I requests that hit on a previously loaded line in the
+      cache. Calculated as the ratio of the number of L1I requests that hit over the
+      number of all L1I requests.
+    IL1 Lat: The average number of cycles spent to fetch instructions to a CU.
+    IL1_L2 Rd: The total number of requests across the L1I - L2 interface per normalization-unit.
+    L2 Rd: The total number of read requests to the L2 from all clients.
+    L2 Wr: The total number of write requests to the L2 from all clients.
+    L2 Atomic: The total number of atomic requests (with and without return) to the
+      L2 from all clients.
+    L2 Hit: The ratio of the number of L2 cache line requests that hit in the L2 cache
+      over the total number of incoming cache line requests to the L2 cache.
+    Fabric_L2 Rd: Number of L2 cache - Infinity Fabric read requests (either 32-byte
+      or 64-byte) summed over TCC instances per normalization unit.
+    Fabric_L2 Wr: Number of L2 cache - Infinity Fabric write requests (either 32-byte
+      or 64-byte) summed over TCC instances per normalization unit.
+    Fabric_L2 Atomic: Number of L2 cache - Infinity Fabric write requests (either
+      32-byte or 64-byte) that are actually atomic requests summed over TCC instances
+      per normalization unit.
+    Fabric Rd Lat: The time-averaged number of cycles read requests spent in Infinity
+      Fabric before data was returned to the L2.
+    Fabric Wr Lat: The time-averaged number of cycles write requests spent in Infinity
+      Fabric before a completion acknowledgement was returned to the L2.
+    Fabric Atomic Lat: The time-averaged number of cycles atomic requests spent in
+      Infinity Fabric before a completion acknowledgement (atomic without return value)
+      or data (atomic with return value) was returned to the L2.
+    HBM Rd: The total number of L2 requests to Infinity Fabric to read 32B or 64B
+      of data from the accelerator's local HBM, per normalization unit.
+    HBM Wr: |-
+      The total number of L2 requests to Infinity Fabric to write or atomically
+      update 32B or 64B of data in the accelerator's local HBM, per normalization
+      unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline.yaml
index c951110895..fe6389ef3b 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline.yaml
@@ -2,85 +2,6 @@
 Panel Config:
   id: 400
   title: Roofline
-  metrics_description:
-    VALU FLOPs (F16): 'The total 16-bit floating-point operations executed per second
-      on the VALU. This is presented with the value of the peak empirical F16 FLOPs
-      achievable on the specific accelerator. Note: this does not include any F16
-      operations from MFMA instructions.'
-    VALU FLOPs (F32): 'The total 32-bit floating-point operations executed per second
-      on the VALU. This is presented with the value of the peak empirical F32 FLOPs
-      achievable on the specific accelerator. Note: this does not include any F32
-      operations from MFMA instructions.'
-    VALU FLOPs (F64): 'The total 64-bit floating-point operations executed per second
-      on the VALU. This is presented with the value of the peak empirical F64 FLOPs
-      achievable on the specific accelerator. Note: this does not include any F64
-      operations from MFMA instructions.'
-    MFMA FLOPs (F8): The total number of 8-bit brain floating point MFMA operations
-      executed per second. This does not include any 16-bit brain floating point operations
-      from VALU instructions. The peak empirically measured F8 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison. It is supported
-      on AMD Instinct MI300 series and later only.
-    MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations
-      executed per second. Note: this does not include any 16-bit brain floating point
-      operations from VALU instructions. The peak empirically measured BF16 MFMA operations
-      achievable on the specific accelerator is displayed alongside for comparison.'
-    MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed
-      per second. Note: this does not include any 16-bit floating point operations
-      from VALU instructions. The peak empirically measured F16 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison.'
-    MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed
-      per second. Note: this does not include any 32-bit floating point operations
-      from VALU instructions. The peak empirically measured F32 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison.'
-    MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed
-      per second. Note: this does not include any 64-bit floating point operations
-      from VALU instructions. The peak empirically measured F64 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison.'
-    MFMA FLOPs (F6F4): 'The total number of 4-bit and 6-bit floating point MFMA operations
-      executed per second. Note: this does not include any floating point operations
-      from VALU instructions. The peak empirically measured F6F4 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison. It is supported
-      on AMD Instinct MI350 series (gfx950) and later only.'
-    MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed
-      per second. Note: this does not include any 8-bit integer operations from VALU
-      instructions. The peak empirically measured INT8 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison.'
-    HBM Bandwidth: The total number of bytes read from and written to High-Bandwidth
-      Memory (HBM) per second. The peak empirically measured bandwidth achievable
-      on the specific accelerator is displayed alongside for comparison.
-    L2 Cache Bandwidth: The number of bytes looked up in the L2 cache per unit time.
-      The number of bytes is calculated as the number of cache lines requested multiplied
-      by the cache line size. This value does not consider partial requests, so e.g.,
-      if only a single value is requested in a cache line, the data movement will
-      still be counted as a full cache line. The peak empirically measured bandwidth
-      achievable on the specific accelerator is displayed alongside for comparison.
-    L1 Cache Bandwidth: The number of bytes looked up in the vL1D cache as a result
-      of VMEM instructions per unit time. The number of bytes is calculated as the
-      number of cache lines requested multiplied by the cache line size. This value
-      does not consider partial requests, so e.g., if only a single value is requested
-      in a cache line, the data movement will still be counted as a full cache line.
-      The peak empirically measured bandwidth achievable on the specific accelerator
-      is displayed alongside for comparison.
-    LDS Bandwidth: Indicates the maximum amount of bytes that could have been loaded
-      from, stored to, or atomically updated in the LDS per unit time (see LDS Bandwidth
-      example for more detail). The peak empirically measured LDS bandwidth achievable
-      on the specific accelerator is displayed alongside for comparison.
-    AI L1: The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
-      of total floating-point operations (FLOPs) to total bytes transferred between
-      the L1 cache and the processing units. This value is used as the x-coordinate
-      for the L1 roofline.
-    AI L2: The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
-      of total floating-point operations (FLOPs) to total bytes transferred between
-      the L2 cache and the L1 cache. This value is used as the x-coordinate for the
-      L2 roofline.
-    AI HBM: The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
-      It is the ratio of total floating-point operations (FLOPs) to total bytes transferred
-      between HBM and the L2 cache. This value is used as the x-coordinate for the
-      HBM roofline.
-    Performance (GFLOPs): The overall achieved performance, measured in GigaFLOPs
-      per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
-      operations divided by the total execution time. This value is used as the y-coordinate
-      for the kernel's point on the Roofline plot.
   data source:
   - metric_table:
       id: 401
@@ -218,3 +139,91 @@ Panel Config:
             512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8
             * 512) ) / (SUM(End_Timestamp - Start_Timestamp) / 1e9) ) / 1e9
           unit: GFLOP/s
+  metrics_description:
+    VALU FLOPs (F16): |-
+      The total 16-bit floating-point operations executed per second on the VALU.
+      This is presented with the value of the peak empirical F16 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F16 operations
+      from MFMA instructions.
+    VALU FLOPs (F32): |-
+      The total 32-bit floating-point operations executed per second on the VALU.
+      This is presented with the value of the peak empirical F32 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F32 operations
+      from MFMA instructions.
+    VALU FLOPs (F64): |-
+      The total 64-bit floating-point operations executed per second on the VALU.
+      This is presented with the value of the peak empirical F64 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F64 operations
+      from MFMA instructions.
+    MFMA FLOPs (F8): The total number of 8-bit floating point MFMA operations
+      executed per second. This does not include any 8-bit floating point operations
+      from VALU instructions. The peak empirically measured F8 MFMA operations achievable
+      on the specific accelerator is displayed alongside for comparison. It is supported
+      on AMD Instinct MI300 series and later only.
+    MFMA FLOPs (BF16): |-
+      The total number of 16-bit brain floating point MFMA operations executed
+      per second. Note: this does not include any 16-bit brain floating point
+      operations from VALU instructions. The peak empirically measured BF16 MFMA
+      operations achievable on the specific accelerator is displayed alongside
+      for comparison.
+    MFMA FLOPs (F16): |-
+      The total number of 16-bit floating point MFMA operations executed per
+      second. Note: this does not include any 16-bit floating point operations from
+      VALU instructions. The peak empirically measured F16 MFMA operations
+      achievable on the specific accelerator is displayed alongside for comparison.
+    MFMA FLOPs (F32): |-
+      The total number of 32-bit floating point MFMA operations executed per
+      second. Note: this does not include any 32-bit floating point operations from
+      VALU instructions. The peak empirically measured F32 MFMA operations
+      achievable on the specific accelerator is displayed alongside for comparison.
+    MFMA FLOPs (F64): |-
+      The total number of 64-bit floating point MFMA operations executed per
+      second. Note: this does not include any 64-bit floating point operations from
+      VALU instructions. The peak empirically measured F64 MFMA operations
+      achievable on the specific accelerator is displayed alongside for comparison.
+    MFMA IOPs (Int8): |-
+      The total number of 8-bit integer MFMA operations executed per second.
+      Note: this does not include any 8-bit integer operations from VALU instructions.
+      The peak empirically measured INT8 MFMA operations achievable on the specific
+      accelerator is displayed alongside for comparison.
+    HBM Bandwidth: |-
+      The total number of bytes read from and written to High-Bandwidth
+      Memory (HBM) per second. The peak empirically measured bandwidth achievable
+      on the specific accelerator is displayed alongside for comparison.
+    L2 Cache Bandwidth: The number of bytes looked up in the L2 cache per unit time.
+      The number of bytes is calculated as the number of cache lines requested multiplied
+      by the cache line size. This value does not consider partial requests, so e.g.,
+      if only a single value is requested in a cache line, the data movement will
+      still be counted as a full cache line. The peak empirically measured bandwidth
+      achievable on the specific accelerator is displayed alongside for comparison.
+    L1 Cache Bandwidth: The number of bytes looked up in the vL1D cache as a result
+      of VMEM instructions per unit time. The number of bytes is calculated as the
+      number of cache lines requested multiplied by the cache line size. This value
+      does not consider partial requests, so e.g., if only a single value is requested
+      in a cache line, the data movement will still be counted as a full cache line.
+      The peak empirically measured bandwidth achievable on the specific accelerator
+      is displayed alongside for comparison.
+    LDS Bandwidth: Indicates the maximum amount of bytes that could have been loaded
+      from, stored to, or atomically updated in the LDS per unit time (see LDS Bandwidth
+      example for more detail). The peak empirically measured LDS bandwidth achievable
+      on the specific accelerator is displayed alongside for comparison.
+    AI L1: |-
+      The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
+      of total floating-point operations (FLOPs) to total bytes transferred between
+      the L1 cache and the processing units. This value is used as the x-coordinate
+      for the L1 roofline.
+    AI L2: |-
+      The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
+      of total floating-point operations (FLOPs) to total bytes transferred between
+      the L2 cache and the L1 cache. This value is used as the x-coordinate for
+      the L2 roofline.
+    AI HBM: |-
+      The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
+      It is the ratio of total floating-point operations (FLOPs) to total bytes
+      transferred between HBM and the L2 cache. This value is used as the x-coordinate
+      for the HBM roofline.
+    Performance (GFLOPs): |-
+      The overall achieved performance, measured in GigaFLOPs
+      per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
+      operations divided by the total execution time. This value is used as the y-coordinate
+      for the kernel's point on the Roofline plot.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0500_command_processor_cpc_cpf.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0500_command_processor_cpc_cpf.yaml
index c4d2cabf52..118ce18331 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0500_command_processor_cpc_cpf.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0500_command_processor_cpc_cpf.yaml
@@ -2,30 +2,6 @@
 Panel Config:
   id: 500
   title: Command Processor (CPC/CPF)
-  metrics_description:
-    CPF Utilization: Percent of total cycles where the CPF was busy actively doing
-      any work. The ratio of CPF busy cycles over total cycles counted by the CPF.
-    CPF Stall: Percent of CPF busy cycles where the CPF was stalled for any reason.
-    CPF-L2 Utilization: Percent of total cycles counted by the CPF-L2 interface where
-      the CPF-L2 interface was active doing any work. The ratio of CPF-L2 busy cycles
-      over total cycles counted by the CPF-L2.
-    CPF-L2 Stall: Percent of CPF-L2 L2 busy cycles where the CPF-L2 interface was
-      stalled for any reason.
-    CPF-UTCL1 Stall: Percent of CPF busy cycles where the CPF was stalled by address
-      translation.
-    CPC Utilization: Percent of total cycles where the CPC was busy actively doing
-      any work. The ratio of CPC busy cycles over total cycles counted by the CPC.
-    CPC Stall Rate: Percent of CPC busy cycles where the CPC was stalled for any reason.
-    CPC Packet Decoding Utilization: Percent of CPC busy cycles spent decoding commands
-      for processing.
-    CPC-Workgroup Manager Utilization: Percent of CPC busy cycles spent dispatching
-      workgroups to the workgroup manager.
-    CPC-L2 Utilization: Percent of total cycles counted by the CPC-L2 interface where
-      the CPC-L2 interface was active doing any work.
-    CPC-UTCL1 Stall: Percent of CPC busy cycles where the CPC was stalled by address
-      translation
-    CPC-UTCL2 Utilization: 'Percent of total cycles counted by the CPC''s L2 address
-      translation interface where the CPC was busy doing address translation work.  '
   data source:
   - metric_table:
       id: 501
@@ -143,3 +119,28 @@ Panel Config:
           max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
             if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
           unit: pct
+  metrics_description:
+    CPF Utilization: Percent of total cycles where the CPF was busy actively doing
+      any work. The ratio of CPF busy cycles over total cycles counted by the CPF.
+    CPF Stall: Percent of CPF busy cycles where the CPF was stalled for any reason.
+    CPF-L2 Utilization: Percent of total cycles counted by the CPF-L2 interface where
+      the CPF-L2 interface was active doing any work. The ratio of CPF-L2 busy cycles
+      over total cycles counted by the CPF-L2.
+    CPF-L2 Stall: Percent of CPF-L2 busy cycles where the CPF-L2 interface was
+      stalled for any reason.
+    CPF-UTCL1 Stall: Percent of CPF busy cycles where the CPF was stalled by address
+      translation.
+    CPC Utilization: Percent of total cycles where the CPC was busy actively doing
+      any work. The ratio of CPC busy cycles over total cycles counted by the CPC.
+    CPC Stall Rate: Percent of CPC busy cycles where the CPC was stalled for any reason.
+    CPC Packet Decoding Utilization: Percent of CPC busy cycles spent decoding commands
+      for processing.
+    CPC-Workgroup Manager Utilization: Percent of CPC busy cycles spent dispatching
+      workgroups to the workgroup manager.
+    CPC-L2 Utilization: Percent of total cycles counted by the CPC-L2 interface where
+      the CPC-L2 interface was active doing any work.
+    CPC-UTCL1 Stall: Percent of CPC busy cycles where the CPC was stalled by address
+      translation.
+    CPC-UTCL2 Utilization: |-
+      Percent of total cycles counted by the CPC's L2 address translation
+      interface where the CPC was busy doing address translation work.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0600_workgroup_manager_spi.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0600_workgroup_manager_spi.yaml
index f6bf13d8b8..eb9845aa82 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0600_workgroup_manager_spi.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0600_workgroup_manager_spi.yaml
@@ -2,61 +2,6 @@
 Panel Config:
   id: 600
   title: Workgroup Manager (SPI)
-  metrics_description:
-    Accelerator Utilization: The percent of cycles in the kernel where the accelerator
-      was actively doing any work.
-    Scheduler-Pipe Utilization: The percent of total scheduler-pipe cycles in the
-      kernel where the scheduler-pipes were actively doing any work.
-    Workgroup Manager Utilization: The percent of cycles in the kernel where the workgroup
-      manager was actively doing any work.
-    Shader Engine Utilization: The percent of total shader engine cycles in the kernel
-      where any CU in a shader-engine was actively doing any work, normalized over
-      all shader-engines. Low values (e.g., << 100%) indicate that the accelerator
-      was not fully saturated by the kernel, or a potential load-imbalance issue.
-    SIMD Utilization: The percent of total SIMD cycles in the kernel where any SIMD
-      on a CU was actively doing any work, summed over all CUs. Low values (less than
-      100%) indicate that the accelerator was not fully saturated by the kernel, or
-      a potential load-imbalance issue.
-    Dispatched Workgroups: The total number of workgroups forming this kernel launch.
-    Dispatched Wavefronts: The total number of wavefronts, summed over all workgroups,
-      forming this kernel launch.
-    VGPR Writes: The average number of cycles spent initializing VGPRs at wave creation.
-    SGPR Writes: The average number of cycles spent initializing SGPRs at wave creation.
-    Not-scheduled Rate (Workgroup Manager): The percent of total scheduler-pipe cycles
-      in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck
-      within the workgroup manager rather than a lack of a CU or SIMD with sufficient
-      resources.
-    Not-scheduled Rate (Scheduler-Pipe): 'The percent of total scheduler-pipe cycles
-      in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck
-      within the scheduler-pipes rather than a lack of a CU or SIMD with sufficient
-      resources. '
-    Scheduler-Pipe Stall Rate: The percent of total scheduler-pipe cycles in the kernel
-      where a workgroup could not be scheduled to a CU due to occupancy limitations
-      (like a lack of a CU or SIMD with sufficient resources).
-    Scratch Stall Rate: The percent of total shader-engine cycles in the kernel where
-      a workgroup could not be scheduled to a CU due to lack of private (a.k.a., scratch)
-      memory slots. While this can reach up to 100%, note that the actual occupancy
-      limitations on a kernel using private memory are typically quite small (for
-      example, less than 1% of the total number of waves that can be scheduled to
-      an accelerator).
-    Insufficient SIMD Waveslots: The percent of total SIMD cycles in the kernel where
-      a workgroup could not be scheduled to a SIMD due to lack of available waveslots.
-    Insufficient SIMD VGPRs: The percent of total SIMD cycles in the kernel where
-      a workgroup could not be scheduled to a SIMD due to lack of available VGPRs.
-    Insufficient SIMD SGPRs: The percent of total SIMD cycles in the kernel where
-      a workgroup could not be scheduled to a SIMD due to lack of available SGPRs.
-    Insufficient CU LDS: The percent of total CU cycles in the kernel where a workgroup
-      could not be scheduled to a CU due to lack of available LDS.
-    Insufficient CU Barriers: The percent of total CU cycles in the kernel where a
-      workgroup could not be scheduled to a CU due to lack of available barriers.
-    Reached CU Workgroup Limit: The percent of total CU cycles in the kernel where
-      a workgroup could not be scheduled to a CU due to limits within the workgroup
-      manager. This is expected to be always be zero on CDNA2 or newer accelerators
-      (and small for previous accelerators).
-    Reached CU Wavefront Limit: The percent of total CU cycles in the kernel where
-      a wavefront could not be scheduled to a CU due to limits within the workgroup
-      manager. This is expected to be always be zero on CDNA2 or newer accelerators
-      (and small for previous accelerators).
   data source:
   - metric_table:
       id: 601
@@ -199,3 +144,58 @@ Panel Config:
           min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
           max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
           unit: Pct
+  metrics_description:
+    Accelerator Utilization: The percent of cycles in the kernel where the accelerator
+      was actively doing any work.
+    Scheduler-Pipe Utilization: The percent of total scheduler-pipe cycles in the
+      kernel where the scheduler-pipes were actively doing any work.
+    Workgroup Manager Utilization: The percent of cycles in the kernel where the workgroup
+      manager was actively doing any work.
+    Shader Engine Utilization: The percent of total shader engine cycles in the kernel
+      where any CU in a shader-engine was actively doing any work, normalized over
+      all shader-engines. Low values (e.g., << 100%) indicate that the accelerator
+      was not fully saturated by the kernel, or a potential load-imbalance issue.
+    SIMD Utilization: The percent of total SIMD cycles in the kernel where any SIMD
+      on a CU was actively doing any work, summed over all CUs. Low values (less than
+      100%) indicate that the accelerator was not fully saturated by the kernel, or
+      a potential load-imbalance issue.
+    Dispatched Workgroups: The total number of workgroups forming this kernel launch.
+    Dispatched Wavefronts: The total number of wavefronts, summed over all workgroups,
+      forming this kernel launch.
+    VGPR Writes: The average number of cycles spent initializing VGPRs at wave creation.
+    SGPR Writes: The average number of cycles spent initializing SGPRs at wave creation.
+    Not-scheduled Rate (Workgroup Manager): The percent of total scheduler-pipe cycles
+      in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck
+      within the workgroup manager rather than a lack of a CU or SIMD with sufficient
+      resources.
+    Not-scheduled Rate (Scheduler-Pipe): |-
+      The percent of total scheduler-pipe cycles in the kernel where a workgroup
+      could not be scheduled to a CU due to a bottleneck within the scheduler-pipes
+      rather than a lack of a CU or SIMD with sufficient resources.
+    Scheduler-Pipe Stall Rate: The percent of total scheduler-pipe cycles in the kernel
+      where a workgroup could not be scheduled to a CU due to occupancy limitations
+      (like a lack of a CU or SIMD with sufficient resources).
+    Scratch Stall Rate: The percent of total shader-engine cycles in the kernel where
+      a workgroup could not be scheduled to a CU due to lack of private (a.k.a., scratch)
+      memory slots. While this can reach up to 100%, note that the actual occupancy
+      limitations on a kernel using private memory are typically quite small (for
+      example, less than 1% of the total number of waves that can be scheduled to
+      an accelerator).
+    Insufficient SIMD Waveslots: The percent of total SIMD cycles in the kernel where
+      a workgroup could not be scheduled to a SIMD due to lack of available waveslots.
+    Insufficient SIMD VGPRs: The percent of total SIMD cycles in the kernel where
+      a workgroup could not be scheduled to a SIMD due to lack of available VGPRs.
+    Insufficient SIMD SGPRs: The percent of total SIMD cycles in the kernel where
+      a workgroup could not be scheduled to a SIMD due to lack of available SGPRs.
+    Insufficient CU LDS: The percent of total CU cycles in the kernel where a workgroup
+      could not be scheduled to a CU due to lack of available LDS.
+    Insufficient CU Barriers: The percent of total CU cycles in the kernel where a
+      workgroup could not be scheduled to a CU due to lack of available barriers.
+    Reached CU Workgroup Limit: The percent of total CU cycles in the kernel where
+      a workgroup could not be scheduled to a CU due to limits within the workgroup
+      manager. This is expected to always be zero on CDNA2 or newer accelerators
+      (and small for previous accelerators).
+    Reached CU Wavefront Limit: The percent of total CU cycles in the kernel where
+      a wavefront could not be scheduled to a CU due to limits within the workgroup
+      manager. This is expected to always be zero on CDNA2 or newer accelerators
+      (and small for previous accelerators).
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0700_wavefront.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0700_wavefront.yaml
index 5e332c0b8f..e9e9407cfc 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0700_wavefront.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0700_wavefront.yaml
@@ -2,63 +2,6 @@
 Panel Config:
   id: 700
   title: Wavefront
-  metrics_description:
-    Grid Size: The total number of work-items (or, threads) launched as a part of
-      the kernel dispatch. In HIP, this is equivalent to the total grid size multiplied
-      by the total workgroup (or, block) size.
-    Workgroup Size: The total number of work-items (or, threads) in each workgroup
-      (or, block) launched as part of the kernel dispatch. In HIP, this is equivalent
-      to the total block size.
-    Total Wavefronts: "The total number of wavefronts launched as part of the kernel\
-      \ dispatch. On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs,\
-      \ the wavefront size is always 64 work-items. Thus, the total number of wavefronts\
-      \ should be equivalent to the ceiling of grid size divided by 64."
-    Saved Wavefronts: The total number of wavefronts saved at a context-save.
-    Restored Wavefronts: The total number of wavefronts restored from a context-save.
-    VGPRs: 'The number of architected vector general-purpose registers allocated for
-      the kernel, see VALU. Note: this may not exactly match the number of VGPRs requested
-      by the compiler due to allocation granularity.'
-    AGPRs: 'The number of accumulation vector general-purpose registers allocated
-      for the kernel, see AGPRs. Note: this may not exactly match the number of AGPRs
-      requested by the compiler due to allocation granularity.'
-    SGPRs: 'The number of scalar general-purpose registers allocated for the kernel,
-      see SALU. Note: this may not exactly match the number of SGPRs requested by
-      the compiler due to allocation granularity.'
-    LDS Allocation: 'The number of bytes of LDS memory (or, shared memory) allocated
-      for this kernel. Note: This may also be larger than what was requested at compile
-      time due to both allocation granularity and dynamic per-dispatch LDS allocations.'
-    Scratch Allocation: The number of bytes of scratch memory requested per work-item
-      for this kernel. Scratch memory is used for stack memory on the accelerator,
-      as well as for register spills and restores.
-    Kernel Time: The total duration of the executed kernel.
-    Kernel Time (Cycles): The total duration of the executed kernel in cycles.
-    Instructions per wavefront: The average number of instructions (of all types)
-      executed per wavefront. This is averaged over all wavefronts in a kernel dispatch.
-    Wave Cycles: The number of cycles a wavefront in the kernel dispatch spent resident
-      on a compute unit per normalization unit. This is averaged over all wavefronts
-      in a kernel dispatch.
-    Dependency Wait Cycles: The number of cycles a wavefront in the kernel dispatch
-      spent resident on a compute unit per normalization unit. This is averaged over
-      all wavefronts in a kernel dispatch.
-    Issue Wait Cycles: The number of cycles a wavefront in the kernel dispatch was
-      unable to issue an instruction for any reason (e.g., execution pipe back-pressure,
-      arbitration loss, etc.) per normalization unit. This counter is incremented
-      at every cycle by all wavefronts on a CU unable to issue an instruction. As
-      such, it is most useful to get a sense of how waves were spending their time,
-      rather than identification of a precise limiter because another wave could be
-      actively executing while a wave is issue stalled. The sum of this metric, Dependency
-      Wait Cycles and Active Cycles should be equal to the total Wave Cycles metric.
-    Active Cycles: The average number of cycles a wavefront in the kernel dispatch
-      was actively executing instructions per normalization unit. This measurement
-      is made on a per-wavefront basis, and may include cycles that another wavefront
-      spent actively executing (on another execution unit, for example) or was stalled.
-      As such, it is most useful to get a sense of how waves were spending their time,
-      rather than identification of a precise limiter. The sum of this metric, Issue
-      Wait Cycles and Active Wait Cycles should be equal to the total Wave Cycles
-      metric.
-    Wavefront Occupancy: 'The time-averaged number of wavefronts resident on the accelerator
-      over the lifetime of the kernel. Note: this metric may be inaccurate for short-running
-      kernels (less than 1ms).'
   data source:
   - metric_table:
       id: 701
@@ -171,3 +114,66 @@ Panel Config:
           max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
           unit: Wavefronts
           coll_level: SQ_LEVEL_WAVES
+  metrics_description:
+    Grid Size: The total number of work-items (or, threads) launched as a part of
+      the kernel dispatch. In HIP, this is equivalent to the total grid size multiplied
+      by the total workgroup (or, block) size.
+    Workgroup Size: The total number of work-items (or, threads) in each workgroup
+      (or, block) launched as part of the kernel dispatch. In HIP, this is equivalent
+      to the total block size.
+    Total Wavefronts: |-
+      The total number of wavefronts launched as part of the kernel dispatch.
+      On AMD Instinct™ CDNA™ accelerators and GCN™ GPUs, the wavefront
+      size is always 64 work-items. Thus, the total number of wavefronts should
+      be equivalent to the ceiling of grid size divided by 64.
+    Saved Wavefronts: The total number of wavefronts saved at a context-save.
+    Restored Wavefronts: The total number of wavefronts restored from a context-save.
+    VGPRs: |-
+      The number of architected vector general-purpose registers allocated
+      for the kernel, see VALU. Note: this may not exactly match the number of VGPRs
+      requested by the compiler due to allocation granularity.
+    AGPRs: |-
+      The number of accumulation vector general-purpose registers allocated
+      for the kernel, see AGPRs. Note: this may not exactly match the number of
+      AGPRs requested by the compiler due to allocation granularity.
+    SGPRs: |-
+      The number of scalar general-purpose registers allocated for the kernel,
+      see SALU. Note: this may not exactly match the number of SGPRs requested by
+      the compiler due to allocation granularity.
+    LDS Allocation: |-
+      The number of bytes of LDS memory (or, shared memory) allocated for
+      this kernel. Note: This may also be larger than what was requested at compile
+      time due to both allocation granularity and dynamic per-dispatch LDS allocations.
+    Scratch Allocation: The number of bytes of scratch memory requested per work-item
+      for this kernel. Scratch memory is used for stack memory on the accelerator,
+      as well as for register spills and restores.
+    Kernel Time: The total duration of the executed kernel.
+    Kernel Time (Cycles): The total duration of the executed kernel in cycles.
+    Instructions per wavefront: The average number of instructions (of all types)
+      executed per wavefront. This is averaged over all wavefronts in a kernel dispatch.
+    Wave Cycles: The number of cycles a wavefront in the kernel dispatch spent resident
+      on a compute unit per normalization unit. This is averaged over all wavefronts
+      in a kernel dispatch.
+    Dependency Wait Cycles: The number of cycles a wavefront in the kernel dispatch
+      stalled waiting on memory of any kind (e.g., instruction fetch, vector or scalar
+      memory, etc.) per normalization unit.
+    Issue Wait Cycles: The number of cycles a wavefront in the kernel dispatch was
+      unable to issue an instruction for any reason (e.g., execution pipe back-pressure,
+      arbitration loss, etc.) per normalization unit. This counter is incremented
+      at every cycle by all wavefronts on a CU unable to issue an instruction. As
+      such, it is most useful to get a sense of how waves were spending their time,
+      rather than identification of a precise limiter because another wave could be
+      actively executing while a wave is issue stalled. The sum of this metric, Dependency
+      Wait Cycles and Active Cycles should be equal to the total Wave Cycles metric.
+    Active Cycles: The average number of cycles a wavefront in the kernel dispatch
+      was actively executing instructions per normalization unit. This measurement
+      is made on a per-wavefront basis, and may include cycles that another wavefront
+      spent actively executing (on another execution unit, for example) or was stalled.
+      As such, it is most useful to get a sense of how waves were spending their time,
+      rather than identification of a precise limiter. The sum of this metric, Issue
+      Wait Cycles and Dependency Wait Cycles should be equal to the total Wave Cycles
+      metric.
+    Wavefront Occupancy: |-
+      The time-averaged number of wavefronts resident on the accelerator over
+      the lifetime of the kernel. Note: this metric may be inaccurate for short-running
+      kernels (less than 1ms).
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1000_compute_units_instruction_mix.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1000_compute_units_instruction_mix.yaml
index 9c923d7bb7..768fe6548b 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1000_compute_units_instruction_mix.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1000_compute_units_instruction_mix.yaml
@@ -2,90 +2,6 @@
 Panel Config:
   id: 1000
   title: Compute Units - Instruction Mix
-  metrics_description:
-    VALU: The total number of vector arithmetic logic unit (VALU) operations issued.
-      These are the workhorses of the compute unit, and are used to execute a wide
-      range of instruction types including floating point operations, non-uniform
-      address calculations, transcendental operations, integer operations, shifts,
-      conditional evaluation, etc.
-    VMEM: The total number of vector memory operations issued. These include most
-      loads, stores and atomic operations and all accesses to generic, global, private
-      and texture memory.
-    LDS: The total number of LDS (also known as shared memory) operations issued.
-      These include loads, stores, atomics, and HIP's __shfl operations.
-    MFMA: The total number of matrix fused multiply-add instructions issued.
-    SALU: The total number of scalar arithmetic logic unit (SALU) operations issued.
-      Typically these are used for address calculations, literal constants, and other
-      operations that are provably uniform across a wavefront. Although scalar memory
-      (SMEM) operations are issued by the SALU, they are counted separately in this
-      section.
-    SMEM: The total number of scalar memory (SMEM) operations issued. These are typically
-      used for loading kernel arguments, base-pointers and loads from HIP's __constant__
-      memory.
-    Branch: The total number of branch operations issued. These typically consist
-      of jump or branch operations and are used to implement control flow.
-    INT32: The total number of instructions operating on 32-bit integer operands issued
-      to the VALU per normalization unit.
-    INT64: The total number of instructions operating on 64-bit integer operands issued
-      to the VALU per normalization unit.
-    F16-ADD: The total number of addition instructions operating on 16-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F16-MUL: The total number of multiplication instructions operating on 16-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F16-FMA: The total number of fused multiply-add instructions operating on 16-bit
-      floating-point operands issued to the VALU per normalization unit.
-    F16-Trans: The total number of transcendental instructions (e.g., sqrt) operating
-      on 16-bit floating-point operands issued to the VALU per normalization unit.
-    F32-ADD: The total number of addition instructions operating on 32-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F32-MUL: The total number of multiplication instructions operating on 32-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F32-FMA: The total number of fused multiply-add instructions operating on 32-bit
-      floating-point operands issued to the VALU per normalization unit.
-    F32-Trans: The total number of transcendental instructions (such as sqrt) operating
-      on 32-bit floating-point operands issued to the VALU per normalization unit.
-    F64-ADD: The total number of addition instructions operating on 64-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F64-MUL: The total number of multiplication instructions operating on 64-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F64-FMA: The total number of fused multiply-add instructions operating on 64-bit
-      floating-point operands issued to the VALU per normalization unit.
-    F64-Trans: The total number of transcendental instructions (such as sqrt) operating
-      on 64-bit floating-point operands issued to the VALU per normalization unit.
-    Conversion: "The total number of type conversion instructions (such as converting\
-      \ data to or from F32\u2194F64) issued to the VALU per normalization unit."
-    Global/Generic Instr: The total number of global & generic memory instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Global/Generic Read: The total number of global & generic memory read instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Global/Generic Write: The total number of global & generic memory write instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Global/Generic Atomic: The total number of global & generic memory atomic (with
-      and without return) instructions executed on all compute units on the accelerator,
-      per normalization unit.
-    Spill/Stack Instr: The total number of spill/stack memory instructions executed
-      on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Read: The total number of spill/stack memory read instructions executed
-      on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Write: The total number of spill/stack memory write instructions executed
-      on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Atomic: The total number of spill/stack memory atomic (with and without
-      return) instructions executed on all compute units on the accelerator, per normalization
-      unit. Typically unused as these memory operations are typically used to implement
-      thread-local storage.
-    MFMA-I8: The total number of 8-bit integer MFMA instructions issued per normalization
-      unit.
-    MFMA-F8: The total number of 8-bit floating point MFMA instructions issued per
-      normalization unit. This is supported in AMD Instinct MI300 series and later
-      only.
-    MFMA-F16: The total number of 16-bit floating point MFMA instructions issued per
-      normalization unit.
-    MFMA-BF16: The total number of 16-bit brain floating point MFMA instructions issued
-      per normalization unit.
-    MFMA-F32: The total number of 32-bit floating-point MFMA instructions issued per
-      normalization unit.
-    MFMA-F64: The total number of 64-bit floating-point MFMA instructions issued per
-      normalization unit.
   data source:
   - metric_table:
       id: 1001
@@ -307,3 +223,88 @@ Panel Config:
           min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom))
           max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom))
           unit: (instr + $normUnit)
+  metrics_description:
+    VALU: The total number of vector arithmetic logic unit (VALU) operations issued.
+      These are the workhorses of the compute unit, and are used to execute a wide
+      range of instruction types including floating point operations, non-uniform
+      address calculations, transcendental operations, integer operations, shifts,
+      conditional evaluation, etc.
+    VMEM: The total number of vector memory operations issued. These include most
+      loads, stores and atomic operations and all accesses to generic, global, private
+      and texture memory.
+    LDS: The total number of LDS (also known as shared memory) operations issued.
+      These include loads, stores, atomics, and HIP's __shfl operations.
+    MFMA: The total number of matrix fused multiply-add instructions issued.
+    SALU: The total number of scalar arithmetic logic unit (SALU) operations issued.
+      Typically these are used for address calculations, literal constants, and other
+      operations that are provably uniform across a wavefront. Although scalar memory
+      (SMEM) operations are issued by the SALU, they are counted separately in this
+      section.
+    SMEM: The total number of scalar memory (SMEM) operations issued. These are typically
+      used for loading kernel arguments, base-pointers and loads from HIP's __constant__
+      memory.
+    Branch: The total number of branch operations issued. These typically consist
+      of jump or branch operations and are used to implement control flow.
+    INT32: The total number of instructions operating on 32-bit integer operands issued
+      to the VALU per normalization unit.
+    INT64: The total number of instructions operating on 64-bit integer operands issued
+      to the VALU per normalization unit.
+    F16-ADD: The total number of addition instructions operating on 16-bit floating-point
+      operands issued to the VALU per normalization unit.
+    F16-MUL: The total number of multiplication instructions operating on 16-bit floating-point
+      operands issued to the VALU per normalization unit.
+    F16-FMA: The total number of fused multiply-add instructions operating on 16-bit
+      floating-point operands issued to the VALU per normalization unit.
+    F16-Trans: The total number of transcendental instructions (e.g., sqrt) operating
+      on 16-bit floating-point operands issued to the VALU per normalization unit.
+    F32-ADD: The total number of addition instructions operating on 32-bit floating-point
+      operands issued to the VALU per normalization unit.
+    F32-MUL: The total number of multiplication instructions operating on 32-bit floating-point
+      operands issued to the VALU per normalization unit.
+    F32-FMA: The total number of fused multiply-add instructions operating on 32-bit
+      floating-point operands issued to the VALU per normalization unit.
+    F32-Trans: The total number of transcendental instructions (such as sqrt) operating
+      on 32-bit floating-point operands issued to the VALU per normalization unit.
+    F64-ADD: The total number of addition instructions operating on 64-bit floating-point
+      operands issued to the VALU per normalization unit.
+    F64-MUL: The total number of multiplication instructions operating on 64-bit floating-point
+      operands issued to the VALU per normalization unit.
+    F64-FMA: The total number of fused multiply-add instructions operating on 64-bit
+      floating-point operands issued to the VALU per normalization unit.
+    F64-Trans: The total number of transcendental instructions (such as sqrt) operating
+      on 64-bit floating-point operands issued to the VALU per normalization unit.
+    Conversion: |-
+      The total number of type conversion instructions (such as converting
+      data to or from F32↔F64) issued to the VALU per normalization unit.
+    Global/Generic Instr: The total number of global & generic memory instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Global/Generic Read: The total number of global & generic memory read instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Global/Generic Write: The total number of global & generic memory write instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Global/Generic Atomic: The total number of global & generic memory atomic (with
+      and without return) instructions executed on all compute units on the accelerator,
+      per normalization unit.
+    Spill/Stack Instr: The total number of spill/stack memory instructions executed
+      on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Read: The total number of spill/stack memory read instructions executed
+      on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Write: The total number of spill/stack memory write instructions executed
+      on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Atomic: The total number of spill/stack memory atomic (with and without
+      return) instructions executed on all compute units on the accelerator, per normalization
+      unit. Typically unused, as these memory operations are mainly used to implement
+      thread-local storage.
+    MFMA-I8: The total number of 8-bit integer MFMA instructions issued per normalization
+      unit.
+    MFMA-F8: The total number of 8-bit floating point MFMA instructions issued per
+      normalization unit. This is supported in AMD Instinct MI300 series and later
+      only.
+    MFMA-F16: The total number of 16-bit floating point MFMA instructions issued per
+      normalization unit.
+    MFMA-BF16: The total number of 16-bit brain floating point MFMA instructions issued
+      per normalization unit.
+    MFMA-F32: The total number of 32-bit floating-point MFMA instructions issued per
+      normalization unit.
+    MFMA-F64: The total number of 64-bit floating-point MFMA instructions issued per
+      normalization unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1100_compute_units_compute_pipeline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1100_compute_units_compute_pipeline.yaml
index 5285c6b279..5e6ceb654f 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1100_compute_units_compute_pipeline.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1100_compute_units_compute_pipeline.yaml
@@ -2,84 +2,6 @@
 Panel Config:
   id: 1100
   title: Compute Units - Compute Pipeline
-  metrics_description:
-    VALU FLOPs: 'The total floating-point operations executed per second on the VALU.
-      This is also presented as a percent of the peak theoretical FLOPs achievable
-      on the specific accelerator. Note: this does not include any floating-point
-      operations from MFMA instructions.'
-    VALU IOPs: 'The total integer operations executed per second on the VALU. This
-      is also presented as a percent of the peak theoretical IOPs achievable on the
-      specific accelerator. Note: this does not include any integer operations from
-      MFMA instructions.'
-    MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations
-      executed per second. Note: this does not include any 16-bit brain floating point
-      operations from VALU instructions. This is also presented as a percent of the
-      peak theoretical BF16 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed
-      per second. Note: this does not include any 16-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F16 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed
-      per second. Note: this does not include any 32-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F32 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed
-      per second. Note: this does not include any 64-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F64 MFMA operations achievable on the specific accelerator.'
-    MFMA IOPs (INT8): 'The total number of 8-bit integer MFMA operations executed
-      per second. Note: this does not include any 8-bit integer operations from VALU
-      instructions. This is also presented as a percent of the peak theoretical INT8
-      MFMA operations achievable on the specific accelerator.'
-    IPC: The ratio of the total number of instructions executed on the CU over the
-      total active CU cycles.
-    IPC (Issued): The ratio of the total number of (non-internal) instructions issued
-      over the number of cycles where the scheduler was actively working on issuing
-      instructions.
-    SALU Utilization: Indicates what percent of the kernel's duration the SALU was
-      busy executing instructions. Computed as the ratio of the total number of cycles
-      spent by the scheduler issuing SALU / SMEM instructions over the total CU cycles.
-    VALU Utilization: Indicates what percent of the kernel's duration the VALU was
-      busy executing instructions. Does not include VMEM operations. Computed as the
-      ratio of the total number of cycles spent by the scheduler issuing VALU instructions
-      over the total CU cycles.
-    VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit
-      was busy executing instructions, including both global/generic and spill/scratch
-      operations (see the VMEM instruction count metrics for more detail). Does not
-      include VALU operations. Computed as the ratio of the total number of cycles
-      spent by the scheduler issuing VMEM instructions over the total CU cycles.
-    Branch Utilization: Indicates what percent of the kernel's duration the branch
-      unit was busy executing instructions. Computed as the ratio of the total number
-      of cycles spent by the scheduler issuing branch instructions over the total
-      CU cycles.
-    VALU Active Threads: Indicates the average level of divergence within a wavefront
-      over the lifetime of the kernel. The number of work-items that were active in
-      a wavefront during execution of each VALU instruction, time-averaged over all
-      VALU instructions run on all wavefronts in the kernel
-    MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit
-      was busy executing instructions. Computed as the ratio of the total number of
-      cycles spent by the MFMA was busy over the total CU cycles.
-    MFMA Instruction Cycles: The average duration of MFMA instructions in this kernel
-      in cycles. Computed as the ratio of the total number of cycles the MFMA unit
-      was busy over the total number of MFMA instructions.
-    VMEM Latency: The average number of round-trip cycles (that is, from issue to
-      data return / acknowledgment) required for a VMEM instruction to complete.
-    SMEM Latency: The average number of round-trip cycles (that is, from issue to
-      data return / acknowledgment) required for a SMEM instruction to complete.
-    FLOPs (Total): The total number of floating-point operations executed on either
-      the VALU or MFMA units, per normalization unit.
-    IOPs (Total): The total number of integer operations executed on either the VALU
-      or MFMA units, per normalization unit.
-    F16 OPs: The total number of 16-bit floating-point operations executed on either
-      the VALU or MFMA units, per normalization unit.
-    BF16 OPs: The total number of 16-bit brain floating-point operations executed
-      on either the VALU or MFMA units, per normalization unit.
-    F32 OPs: The total number of 32-bit floating-point operations executed on either
-      the VALU or MFMA units, per normalization unit.
-    F64 OPs: The total number of 64-bit floating-point operations executed on either
-      the VALU or MFMA units, per normalization unit.
-    INT8 OPs: The total number of 8-bit integer operations executed on either the
-      VALU or MFMA units, per normalization unit.
   data source:
   - metric_table:
       id: 1101
@@ -165,13 +87,13 @@ Panel Config:
           unit: Instr/cycle
         IPC (Issued):
           avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-            + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED  + SQ_INSTS_LDS)
+            + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
             / SQ_ACTIVE_INST_ANY))
           min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
             + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
             / SQ_ACTIVE_INST_ANY))
           max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-            + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED  + SQ_INSTS_LDS)
+            + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
             / SQ_ACTIVE_INST_ANY))
           unit: Instr/cycle
         SALU Utilization:
@@ -271,7 +193,7 @@ Panel Config:
             + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
             + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
             / $denom))
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
         IOPs (Total):
           avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
             * 512)) / $denom)
@@ -279,12 +201,12 @@ Panel Config:
             * 512)) / $denom)
           max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
             * 512)) / $denom)
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
         F8 OPs:
           avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
           min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
           max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
         F16 OPs:
           avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
             + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512
@@ -295,12 +217,12 @@ Panel Config:
           max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
             + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512
             * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
         BF16 OPs:
           avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
           min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
           max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
         F32 OPs:
           avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
             + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
@@ -311,7 +233,7 @@ Panel Config:
           max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
             + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
             / $denom))
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
         F64 OPs:
           avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
             + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
@@ -322,9 +244,94 @@ Panel Config:
           max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
             + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
             / $denom))
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
         INT8 OPs:
           avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
           min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
           max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
+  metrics_description:
+    VALU FLOPs: |-
+      The total floating-point operations executed per second on the VALU.
+      This is also presented as a percent of the peak theoretical FLOPs achievable
+      on the specific accelerator. Note: this does not include any floating-point
+      operations from MFMA instructions.
+    VALU IOPs: |-
+      The total integer operations executed per second on the VALU. This is
+      also presented as a percent of the peak theoretical IOPs achievable on the
+      specific accelerator. Note: this does not include any integer operations from
+      MFMA instructions.
+    MFMA FLOPs (BF16): |-
+      The total number of 16-bit brain floating point MFMA operations executed
+      per second. Note: this does not include any 16-bit brain floating point operations
+      from VALU instructions. This is also presented as a percent of the peak theoretical
+      BF16 MFMA operations achievable on the specific accelerator.
+    MFMA FLOPs (F16): |-
+      The total number of 16-bit floating point MFMA operations executed per
+      second. Note: this does not include any 16-bit floating point operations from
+      VALU instructions. This is also presented as a percent of the peak theoretical
+      F16 MFMA operations achievable on the specific accelerator.
+    MFMA FLOPs (F32): |-
+      The total number of 32-bit floating point MFMA operations executed per
+      second. Note: this does not include any 32-bit floating point operations from
+      VALU instructions. This is also presented as a percent of the peak theoretical
+      F32 MFMA operations achievable on the specific accelerator.
+    MFMA FLOPs (F64): |-
+      The total number of 64-bit floating point MFMA operations executed per
+      second. Note: this does not include any 64-bit floating point operations from
+      VALU instructions. This is also presented as a percent of the peak theoretical
+      F64 MFMA operations achievable on the specific accelerator.
+    MFMA IOPs (INT8): |-
+      The total number of 8-bit integer MFMA operations executed per second.
+      Note: this does not include any 8-bit integer operations from VALU instructions.
+      This is also presented as a percent of the peak theoretical INT8 MFMA operations
+      achievable on the specific accelerator.
+    IPC: The ratio of the total number of instructions executed on the CU over the
+      total active CU cycles.
+    IPC (Issued): The ratio of the total number of (non-internal) instructions issued
+      over the number of cycles where the scheduler was actively working on issuing
+      instructions.
+    SALU Utilization: Indicates what percent of the kernel's duration the SALU was
+      busy executing instructions. Computed as the ratio of the total number of cycles
+      spent by the scheduler issuing SALU / SMEM instructions over the total CU cycles.
+    VALU Utilization: Indicates what percent of the kernel's duration the VALU was
+      busy executing instructions. Does not include VMEM operations. Computed as the
+      ratio of the total number of cycles spent by the scheduler issuing VALU instructions
+      over the total CU cycles.
+    VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit
+      was busy executing instructions, including both global/generic and spill/scratch
+      operations (see the VMEM instruction count metrics for more detail). Does not
+      include VALU operations. Computed as the ratio of the total number of cycles
+      spent by the scheduler issuing VMEM instructions over the total CU cycles.
+    Branch Utilization: Indicates what percent of the kernel's duration the branch
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles spent by the scheduler issuing branch instructions over the total
+      CU cycles.
+    VALU Active Threads: Indicates the average level of divergence within a wavefront
+      over the lifetime of the kernel. The number of work-items that were active in
+      a wavefront during execution of each VALU instruction, time-averaged over all
+      VALU instructions run on all wavefronts in the kernel.
+    MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit
+      was busy executing instructions. Computed as the ratio of the total number of
+      cycles the MFMA unit was busy over the total CU cycles.
+    MFMA Instruction Cycles: The average duration of MFMA instructions in this kernel
+      in cycles. Computed as the ratio of the total number of cycles the MFMA unit
+      was busy over the total number of MFMA instructions.
+    VMEM Latency: The average number of round-trip cycles (that is, from issue to
+      data return / acknowledgment) required for a VMEM instruction to complete.
+    SMEM Latency: The average number of round-trip cycles (that is, from issue to
+      data return / acknowledgment) required for a SMEM instruction to complete.
+    FLOPs (Total): The total number of floating-point operations executed on either
+      the VALU or MFMA units, per normalization unit.
+    IOPs (Total): The total number of integer operations executed on either the VALU
+      or MFMA units, per normalization unit.
+    F16 OPs: The total number of 16-bit floating-point operations executed on either
+      the VALU or MFMA units, per normalization unit.
+    BF16 OPs: The total number of 16-bit brain floating-point operations executed
+      on either the VALU or MFMA units, per normalization unit.
+    F32 OPs: The total number of 32-bit floating-point operations executed on either
+      the VALU or MFMA units, per normalization unit.
+    F64 OPs: The total number of 64-bit floating-point operations executed on either
+      the VALU or MFMA units, per normalization unit.
+    INT8 OPs: The total number of 8-bit integer operations executed on either the
+      VALU or MFMA units, per normalization unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1200_local_data_share_lds.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1200_local_data_share_lds.yaml
index 2718654ad4..b7767fea16 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1200_local_data_share_lds.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1200_local_data_share_lds.yaml
@@ -2,51 +2,6 @@
 Panel Config:
   id: 1200
   title: Local Data Share (LDS)
-  metrics_description:
-    Utilization: Indicates what percent of the kernel's duration the LDS was actively
-      executing instructions (including, but not limited to, load, store, atomic and
-      HIP's __shfl operations). Calculated as the ratio of the total number of cycles
-      LDS was active over the total CU cycles.
-    Access Rate: Indicates the percentage of SIMDs in the VALU actively issuing LDS
-      instructions, averaged over the lifetime of the kernel. Calculated as the ratio
-      of the total number of cycles spent by the scheduler issuing LDS instructions
-      over the total CU cycles.
-    Theoretical Bandwidth Utilization: Indicates the maximum amount of bytes that
-      could have been loaded from, stored to, or atomically updated in the LDS divided
-      as percentage of theoretical peak. Does not take into account the execution
-      mask of the wavefront when the instruction was executed.
-    Theoretical Bandwidth: Indicates the maximum amount of bytes that could have been
-      loaded from, stored to, or atomically updated in the LDS divided by total duration.
-      Does not take into account the execution mask of the wavefront when the instruction
-      was executed.
-    Bank Conflict Rate: Indicates the percentage of active LDS cycles that were spent
-      servicing bank conflicts. Calculated as the ratio of LDS cycles spent servicing
-      bank conflicts over the number of LDS cycles that would have been required to
-      move the same amount of data in an uncontended access.
-    LDS Instructions: The total number of LDS instructions (including, but not limited
-      to, read/write/atomics and HIP's __shfl instructions) executed per normalization
-      unit.
-    LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return
-      / acknowledgment) required for an LDS instruction to complete.
-    Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS scheduler
-      due to bank conflicts (as determined by the conflict resolution hardware) to
-      the base number of cycles that would be spent in the LDS scheduler in a completely
-      uncontended case. This is the unnormalized form of the Bank Conflict Rate.
-    Index Accesses: The total number of cycles spent in the LDS scheduler over all
-      operations per normalization unit.
-    Atomic Return Cycles: The total number of cycles spent on LDS atomics with return
-      per normalization unit.
-    Bank Conflict: The total number of cycles spent in the LDS scheduler due to bank
-      conflicts (as determined by the conflict resolution hardware) per normalization
-      unit.
-    Addr Conflict: The total number of cycles spent in the LDS scheduler due to address
-      conflicts (as determined by the conflict resolution hardware) per normalization
-      unit.
-    Unaligned Stall: The total number of cycles spent in the LDS scheduler due to
-      stalls from non-dword aligned addresses per normalization unit.
-    Mem Violations: "The total number of out-of-bounds accesses made to the LDS, per\
-      \ normalization unit. This is unused and expected to be zero in most configurations\
-      \ for modern CDNA\u2122 accelerators."
   data source:
   - metric_table:
       id: 1201
@@ -87,7 +42,7 @@ Panel Config:
           avg: AVG((SQ_INSTS_LDS / $denom))
           min: MIN((SQ_INSTS_LDS / $denom))
           max: MAX((SQ_INSTS_LDS / $denom))
-          unit: (Instr  + $normUnit)
+          unit: (Instr + $normUnit)
         Theoretical Bandwidth:
           avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
             / (End_Timestamp - Start_Timestamp)))
@@ -117,29 +72,75 @@ Panel Config:
           avg: AVG((SQ_LDS_IDX_ACTIVE / $denom))
           min: MIN((SQ_LDS_IDX_ACTIVE / $denom))
           max: MAX((SQ_LDS_IDX_ACTIVE / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Atomic Return Cycles:
           avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom))
           min: MIN((SQ_LDS_ATOMIC_RETURN / $denom))
           max: MAX((SQ_LDS_ATOMIC_RETURN / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Bank Conflict:
           avg: AVG((SQ_LDS_BANK_CONFLICT / $denom))
           min: MIN((SQ_LDS_BANK_CONFLICT / $denom))
           max: MAX((SQ_LDS_BANK_CONFLICT / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Addr Conflict:
           avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom))
           min: MIN((SQ_LDS_ADDR_CONFLICT / $denom))
           max: MAX((SQ_LDS_ADDR_CONFLICT / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Unaligned Stall:
           avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom))
           min: MIN((SQ_LDS_UNALIGNED_STALL / $denom))
           max: MAX((SQ_LDS_UNALIGNED_STALL / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Mem Violations:
           avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom))
           min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom))
           max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom))
           unit: (Accesses + $normUnit)
+  metrics_description:
+    Utilization: Indicates what percent of the kernel's duration the LDS was actively
+      executing instructions (including, but not limited to, load, store, atomic and
+      HIP's __shfl operations). Calculated as the ratio of the total number of cycles
+      LDS was active over the total CU cycles.
+    Access Rate: Indicates the percentage of SIMDs in the VALU actively issuing LDS
+      instructions, averaged over the lifetime of the kernel. Calculated as the ratio
+      of the total number of cycles spent by the scheduler issuing LDS instructions
+      over the total CU cycles.
+    Theoretical Bandwidth Utilization: Indicates the maximum amount of bytes that
+      could have been loaded from, stored to, or atomically updated in the LDS, expressed
+      as a percentage of the theoretical peak. Does not take into account the execution
+      mask of the wavefront when the instruction was executed.
+    Theoretical Bandwidth: Indicates the maximum amount of bytes that could have been
+      loaded from, stored to, or atomically updated in the LDS divided by total duration.
+      Does not take into account the execution mask of the wavefront when the instruction
+      was executed.
+    Bank Conflict Rate: Indicates the percentage of active LDS cycles that were spent
+      servicing bank conflicts. Calculated as the ratio of LDS cycles spent servicing
+      bank conflicts over the number of LDS cycles that would have been required to
+      move the same amount of data in an uncontended access.
+    LDS Instructions: The total number of LDS instructions (including, but not limited
+      to, read/write/atomics and HIP's __shfl instructions) executed per normalization
+      unit.
+    LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return
+      / acknowledgment) required for an LDS instruction to complete.
+    Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS scheduler
+      due to bank conflicts (as determined by the conflict resolution hardware) to
+      the base number of cycles that would be spent in the LDS scheduler in a completely
+      uncontended case. This is the unnormalized form of the Bank Conflict Rate.
+    Index Accesses: The total number of cycles spent in the LDS scheduler over all
+      operations per normalization unit.
+    Atomic Return Cycles: The total number of cycles spent on LDS atomics with return
+      per normalization unit.
+    Bank Conflict: The total number of cycles spent in the LDS scheduler due to bank
+      conflicts (as determined by the conflict resolution hardware) per normalization
+      unit.
+    Addr Conflict: The total number of cycles spent in the LDS scheduler due to address
+      conflicts (as determined by the conflict resolution hardware) per normalization
+      unit.
+    Unaligned Stall: The total number of cycles spent in the LDS scheduler due to
+      stalls from non-dword aligned addresses per normalization unit.
+    Mem Violations: |-
+      The total number of out-of-bounds accesses made to the LDS, per normalization
+      unit. This is unused and expected to be zero in most configurations for
+      modern CDNA™ accelerators.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1300_instruction_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1300_instruction_cache.yaml
index aeda9bc6c7..35808d9d96 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1300_instruction_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1300_instruction_cache.yaml
@@ -2,28 +2,6 @@
 Panel Config:
   id: 1300
   title: Instruction Cache
-  metrics_description:
-    Bandwidth Utilization: The number of bytes looked up in the L1I cache, as a percent
-      of the peak theoretical bandwidth. Calculated as the ratio of L1I requests over
-      the total L1I cycles.
-    Cache Hit Rate: The percent of L1I requests that hit [#l1i-cache]_ on a previously
-      loaded line the cache. Calculated as the ratio of the number of L1I requests
-      that hit over the number of all L1I requests.
-    L1I-L2 Bandwidth Utilization: "The percent of the peak theoretical L1I \u2192\
-      \ L2 cache request bandwidth achieved. Calculated as the ratio of the total\
-      \ number of requests from the L1I to the L2 cache over the total L1I-L2 interface\
-      \ cycles."
-    L1I-L2 Bandwidth: Total number of bytes transferred across L1I - L2 interface
-      divided by total duration.
-    Req: The total number of requests made to the L1I per normalization-unit
-    Hits: The total number of L1I requests that hit on a previously loaded cache line,
-      per normalization-unit.
-    Misses - Non Duplicated: The total number of L1I requests that missed on a cache
-      line that were not already pending due to another request, per normalization-unit.
-    Misses - Duplicated: The total number of L1I requests that missed on a cache line
-      that were already pending due to another request, per normalization-unit.
-    Instruction Fetch Latency: The average number of cycles spent to fetch instructions
-      to a CU.
   data source:
   - metric_table:
       id: 1301
@@ -62,22 +40,22 @@ Panel Config:
           avg: AVG((SQC_ICACHE_REQ / $denom))
           min: MIN((SQC_ICACHE_REQ / $denom))
           max: MAX((SQC_ICACHE_REQ / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Hits:
           avg: AVG((SQC_ICACHE_HITS / $denom))
           min: MIN((SQC_ICACHE_HITS / $denom))
           max: MAX((SQC_ICACHE_HITS / $denom))
-          unit: (Hits  + $normUnit)
+          unit: (Hits + $normUnit)
         Misses - Non Duplicated:
           avg: AVG((SQC_ICACHE_MISSES / $denom))
           min: MIN((SQC_ICACHE_MISSES / $denom))
           max: MAX((SQC_ICACHE_MISSES / $denom))
-          unit: (Misses  + $normUnit)
+          unit: (Misses + $normUnit)
         Misses - Duplicated:
           avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom))
           min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom))
           max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom))
-          unit: (Misses  + $normUnit)
+          unit: (Misses + $normUnit)
         Cache Hit Rate:
           avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
             + SQC_ICACHE_MISSES_DUPLICATE)))
@@ -107,3 +85,25 @@ Panel Config:
           min: MIN(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
           max: MAX(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
           unit: Gbps
+  metrics_description:
+    Bandwidth Utilization: The number of bytes looked up in the L1I cache, as a percent
+      of the peak theoretical bandwidth. Calculated as the ratio of L1I requests over
+      the total L1I cycles.
+    Cache Hit Rate: The percent of L1I requests that hit [#l1i-cache]_ on a previously
+      loaded line in the cache. Calculated as the ratio of the number of L1I requests
+      that hit over the number of all L1I requests.
+    L1I-L2 Bandwidth Utilization: |-
+      The percent of the peak theoretical L1I → L2 cache request bandwidth
+      achieved. Calculated as the ratio of the total number of requests from the
+      L1I to the L2 cache over the total L1I-L2 interface cycles.
+    L1I-L2 Bandwidth: Total number of bytes transferred across L1I - L2 interface
+      divided by total duration.
+    Req: The total number of requests made to the L1I per normalization-unit.
+    Hits: The total number of L1I requests that hit on a previously loaded cache line,
+      per normalization-unit.
+    Misses - Non Duplicated: The total number of L1I requests that missed on a cache
+      line that were not already pending due to another request, per normalization-unit.
+    Misses - Duplicated: The total number of L1I requests that missed on a cache line
+      that were already pending due to another request, per normalization-unit.
+    Instruction Fetch Latency: The average number of cycles spent to fetch instructions
+      to a CU.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1400_scalar_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1400_scalar_l1_data_cache.yaml
index 282b97ad1f..6b73164848 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1400_scalar_l1_data_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1400_scalar_l1_data_cache.yaml
@@ -2,49 +2,6 @@
 Panel Config:
   id: 1400
   title: Scalar L1 Data Cache
-  metrics_description:
-    Bandwidth Utilization: The number of bytes looked up in the sL1D cache, as a percent
-      of the peak theoretical bandwidth. Calculated as the ratio of sL1D requests
-      over the total sL1D cycles.
-    Cache Hit Rate: Indicates the percent of sL1D requests that hit on a previously
-      loaded line the cache. The ratio of the number of sL1D requests that hit over
-      the number of all sL1D requests.
-    sL1D-L2 BW Utilization: The percentage of the peak theoretical sL1D - L2 interface
-      bandwidth acheived.\ \ Caclulated as total number of bytes read from, written
-      to, or atomically updated\ \ across the sL1D - L2 interface.
-    sL1D-L2 BW: "The total number of bytes read from, written to, or atomically updated\
-      \ across the sL1D\u2194L2 interface, divided by total duration. Note that sL1D\
-      \ writes and atomics are typically unused on current CDNA accelerators, so in\
-      \ the majority of cases this can be interpreted as an sL1D\u2192L2 read bandwidth."
-    Req: The total number of requests, of any size or type, made to the sL1D per normalization
-      unit.
-    Hits: The total number of sL1D requests that hit on a previously loaded cache
-      line, per normalization unit.
-    Misses - Non Duplicated: 'The total number of sL1D requests that missed on a cache
-      line that was not already pending due to another request, per normalization
-      unit. '
-    Misses- Duplicated: The total number of sL1D requests that missed on a cache line
-      that was already pending due to another request, per normalization unit.
-    Read Req (Total): The total number of sL1D read requests of any size, per normalization
-      unit.
-    Atomic Req: The total number of atomic requests from sL1D to the L2, per normalization
-      unit. Typically unused on current CDNA accelerators.
-    Read Req (1 DWord): The total number of sL1D read requests made for a single dword
-      of data (4B), per normalization unit.
-    Read Req (2 DWord): The total number of sL1D read requests made for a two dwords
-      of data (8B), per normalization unit.
-    Read Req (4 DWord): The total number of sL1D read requests made for a four dwords
-      of data (16B), per normalization unit.
-    Read Req (8 DWord): The total number of sL1D read requests made for a eight dwords
-      of data (32B), per normalization unit.
-    Read Req (16 DWord): The total number of sL1D read requests made for a sixteen
-      dwords of data (64B), per normalization unit.
-    Read Req: The total number of read requests from sL1D to the L2 per normalization
-      unit.
-    Write Req: The total number of write requests from sL1D to the L2, per normalization
-      unit. Typically unused on current CDNA accelerators.
-    Stall Cycles: "The total number of cycles the sL1D\u2194L2 interface was stalled,\
-      \ per normalization unit."
   data source:
   - metric_table:
       id: 1401
@@ -84,22 +41,22 @@ Panel Config:
           avg: AVG((SQC_DCACHE_REQ / $denom))
           min: MIN((SQC_DCACHE_REQ / $denom))
           max: MAX((SQC_DCACHE_REQ / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Hits:
           avg: AVG((SQC_DCACHE_HITS / $denom))
           min: MIN((SQC_DCACHE_HITS / $denom))
           max: MAX((SQC_DCACHE_HITS / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Misses - Non Duplicated:
           avg: AVG((SQC_DCACHE_MISSES / $denom))
           min: MIN((SQC_DCACHE_MISSES / $denom))
           max: MAX((SQC_DCACHE_MISSES / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Misses- Duplicated:
           avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom))
           min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom))
           max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Cache Hit Rate:
           avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
             + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
@@ -118,37 +75,37 @@ Panel Config:
             + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
           max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
             + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Atomic Req:
           avg: AVG((SQC_DCACHE_ATOMIC / $denom))
           min: MIN((SQC_DCACHE_ATOMIC / $denom))
           max: MAX((SQC_DCACHE_ATOMIC / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req (1 DWord):
           avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom))
           min: MIN((SQC_DCACHE_REQ_READ_1 / $denom))
           max: MAX((SQC_DCACHE_REQ_READ_1 / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req (2 DWord):
           avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom))
           min: MIN((SQC_DCACHE_REQ_READ_2 / $denom))
           max: MAX((SQC_DCACHE_REQ_READ_2 / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req (4 DWord):
           avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom))
           min: MIN((SQC_DCACHE_REQ_READ_4 / $denom))
           max: MAX((SQC_DCACHE_REQ_READ_4 / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req (8 DWord):
           avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom))
           min: MIN((SQC_DCACHE_REQ_READ_8 / $denom))
           max: MAX((SQC_DCACHE_REQ_READ_8 / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req (16 DWord):
           avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom))
           min: MIN((SQC_DCACHE_REQ_READ_16 / $denom))
           max: MAX((SQC_DCACHE_REQ_READ_16 / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
   - metric_table:
       id: 1403
       title: Scalar L1D Cache - L2 Interface
@@ -171,19 +128,65 @@ Panel Config:
           avg: AVG((SQC_TC_DATA_READ_REQ / $denom))
           min: MIN((SQC_TC_DATA_READ_REQ / $denom))
           max: MAX((SQC_TC_DATA_READ_REQ / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Write Req:
           avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom))
           min: MIN((SQC_TC_DATA_WRITE_REQ / $denom))
           max: MAX((SQC_TC_DATA_WRITE_REQ / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Atomic Req:
           avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom))
           min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom))
           max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Stall Cycles:
           avg: AVG((SQC_TC_STALL / $denom))
           min: MIN((SQC_TC_STALL / $denom))
           max: MAX((SQC_TC_STALL / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
+  metrics_description:
+    Bandwidth Utilization: The number of bytes looked up in the sL1D cache, as a percent
+      of the peak theoretical bandwidth. Calculated as the ratio of sL1D requests
+      over the total sL1D cycles.
+    Cache Hit Rate: Indicates the percent of sL1D requests that hit on a previously
+      loaded line in the cache. The ratio of the number of sL1D requests that hit over
+      the number of all sL1D requests.
+    sL1D-L2 BW Utilization: The percentage of the peak theoretical sL1D - L2 interface
+      bandwidth achieved. Calculated as total number of bytes read from, written to,
+      or atomically updated across the sL1D - L2 interface.
+    sL1D-L2 BW: |-
+      The total number of bytes read from, written to, or atomically updated
+      across the sL1D↔L2 interface, divided by total duration. Note that sL1D
+      writes and atomics are typically unused on current CDNA accelerators, so
+      in the majority of cases this can be interpreted as an sL1D→L2 read
+      bandwidth.
+    Req: The total number of requests, of any size or type, made to the sL1D per normalization
+      unit.
+    Hits: The total number of sL1D requests that hit on a previously loaded cache
+      line, per normalization unit.
+    Misses - Non Duplicated: |-
+      The total number of sL1D requests that missed on a cache line that was
+      not already pending due to another request, per normalization unit.
+    Misses- Duplicated: The total number of sL1D requests that missed on a cache line
+      that was already pending due to another request, per normalization unit.
+    Read Req (Total): The total number of sL1D read requests of any size, per normalization
+      unit.
+    Atomic Req: The total number of atomic requests from sL1D to the L2, per normalization
+      unit. Typically unused on current CDNA accelerators.
+    Read Req (1 DWord): The total number of sL1D read requests made for a single dword
+      of data (4B), per normalization unit.
+    Read Req (2 DWord): The total number of sL1D read requests made for two dwords
+      of data (8B), per normalization unit.
+    Read Req (4 DWord): The total number of sL1D read requests made for four dwords
+      of data (16B), per normalization unit.
+    Read Req (8 DWord): The total number of sL1D read requests made for eight dwords
+      of data (32B), per normalization unit.
+    Read Req (16 DWord): The total number of sL1D read requests made for sixteen
+      dwords of data (64B), per normalization unit.
+    Read Req: The total number of read requests from sL1D to the L2 per normalization
+      unit.
+    Write Req: The total number of write requests from sL1D to the L2, per normalization
+      unit. Typically unused on current CDNA accelerators.
+    Stall Cycles: |-
+      The total number of cycles the sL1D↔L2 interface was stalled, per
+      normalization unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1500_address_processing_unit_and_data_return_path_ta_td.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1500_address_processing_unit_and_data_return_path_ta_td.yaml
index cdbb5393aa..3fd1615719 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1500_address_processing_unit_and_data_return_path_ta_td.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1500_address_processing_unit_and_data_return_path_ta_td.yaml
@@ -2,70 +2,6 @@
 Panel Config:
   id: 1500
   title: Address Processing Unit and Data Return Path (TA/TD)
-  metrics_description:
-    Address Processing Unit Busy: Percent of the total CU cycles the address processor
-      was busy
-    Address Stall: Percent of the total CU cycles the address processor was stalled
-      from sending address requests further into the vL1D pipeline.
-    Data Stall: Percent of the total CU cycles the address processor was stalled from
-      sending write/atomic data further into the vL1D pipeline.
-    "Data-Processor \u2192 Address Stall": Percent of total CU cycles the address
-      processor was stalled waiting to send command data to the data processor.
-    Total Instructions: The total number of memory instructions executed by the address
-      processer over all compute units on the accelerator, per normalization unit.
-    Global/Generic Instructions: The total number of global & generic memory instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Global/Generic Read Instructions: The total number of global & generic memory
-      read instructions executed on all compute units on the accelerator, per normalization
-      unit.
-    Global/Generic Write Instructions: The total number of global & generic memory
-      write instructions executed on all compute units on the accelerator, per normalization
-      unit.
-    Global/Generic Atomic Instructions: The total number of global & generic memory
-      atomic (with and without return) instructions executed on all compute units
-      on the accelerator, per normalization unit.
-    Spill/Stack Instructions: The total number of spill/stack memory instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Read Instructions: The total number of spill/stack memory read instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Write Instructions: The total number of spill/stack memory write instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Atomic Instructions: The total number of spill/stack memory atomic
-      (with and without return) instructions executed on all compute units on the
-      accelerator, per normalization unit. Typically unused as these memory operations
-      are typically used to implement thread-local storage.
-    Spill/Stack Total Cycles: The number of cycles the address processing unit spent
-      working on spill/stack instructions, per normalization unit.
-    Spill/Stack Coalesced Read: The number of cycles the address processing unit spent
-      working on coalesced spill/stack read instructions, per normalization unit.
-    Spill/Stack Coalesced Write: The number of cycles the address processing unit
-      spent working on coalesced spill/stack write instructions, per normalization
-      unit.
-    Data-Return Busy: Percent of the total CU cycles the data-return unit was busy
-      processing or waiting on data to return to the CU.
-    "Cache RAM \u2192 Data-Return Stall": Percent of the total CU cycles the data-return
-      unit was stalled on data to be returned from the vL1D Cache RAM.
-    "Workgroup manager \u2192 Data-Return Stall": Percent of the total CU cycles the
-      data-return unit was stalled by the workgroup manager due to initialization
-      of registers as a part of launching new workgroups.
-    Coalescable Instructions: The number of instructions submitted to the data-return
-      unit by the address processor that were found to be coalescable, per normalization
-      unit.
-    Read Instructions: The number of read instructions submitted to the data-return
-      unit by the address processor summed over all compute units on the accelerator,
-      per normalization unit. This is expected to be the sum of global/generic and
-      spill/stack reads in the address processor.
-    Write Instructions: The number of store instructions submitted to the data-return
-      unit by the address processor summed over all compute units on the accelerator,
-      per normalization unit. This is expected to be the sum of global/generic and
-      spill/stack stores in the address processor.
-    Atomic Instructions: The number of atomic instructions submitted to the data-return
-      unit by the address processor summed over all compute units on the accelerator,
-      per normalization unit. This is expected to be the sum of global/generic and
-      spill/stack atomics in the address processor.
-    Write Ack Instructions: The total number of write acknowledgements submitted by
-      data-return unit to SQ, summed over all compute units on the accelerator, per
-      normalization unit.
   data source:
   - metric_table:
       id: 1501
@@ -135,47 +71,47 @@ Panel Config:
           avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom))
           min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom))
           max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Global/Generic Instructions:
           avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
           min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
           max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Global/Generic Read Instructions:
           avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
           min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
           max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Global/Generic Write Instructions:
           avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
           min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
           max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Global/Generic Atomic Instructions:
           avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
           min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
           max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Spill/Stack Instructions:
           avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
           min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
           max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Spill/Stack Read Instructions:
           avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
           min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
           max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Spill/Stack Write Instructions:
           avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
           min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
           max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Spill/Stack Atomic Instructions:
           avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
           min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
           max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
   - metric_table:
       id: 1503
       title: Spill and stack metrics
@@ -190,17 +126,17 @@ Panel Config:
           avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
           min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
           max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Spill/Stack Coalesced Read:
           avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
           min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
           max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Spill/Stack Coalesced Write:
           avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
           min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
           max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
   - metric_table:
       id: 1504
       title: Vector L1 data-return path or Texture Data (TD)
@@ -230,7 +166,7 @@ Panel Config:
           avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom))
           min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom))
           max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Read Instructions:
           avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
             / $denom))
@@ -238,14 +174,75 @@ Panel Config:
             / $denom))
           max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
             / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Write Instructions:
           avg: AVG((TD_STORE_WAVEFRONT_sum / $denom))
           min: MIN((TD_STORE_WAVEFRONT_sum / $denom))
           max: MAX((TD_STORE_WAVEFRONT_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Atomic Instructions:
           avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom))
           min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom))
           max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
+  metrics_description:
+    Address Processing Unit Busy: Percent of the total CU cycles the address processor
+      was busy.
+    Address Stall: Percent of the total CU cycles the address processor was stalled
+      from sending address requests further into the vL1D pipeline.
+    Data Stall: Percent of the total CU cycles the address processor was stalled from
+      sending write/atomic data further into the vL1D pipeline.
+    "Data-Processor \u2192 Address Stall": Percent of total CU cycles the address
+      processor was stalled waiting to send command data to the data processor.
+    Total Instructions: The total number of memory instructions executed by the address
+      processor over all compute units on the accelerator, per normalization unit.
+    Global/Generic Instructions: The total number of global & generic memory instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Global/Generic Read Instructions: The total number of global & generic memory
+      read instructions executed on all compute units on the accelerator, per normalization
+      unit.
+    Global/Generic Write Instructions: The total number of global & generic memory
+      write instructions executed on all compute units on the accelerator, per normalization
+      unit.
+    Global/Generic Atomic Instructions: The total number of global & generic memory
+      atomic (with and without return) instructions executed on all compute units
+      on the accelerator, per normalization unit.
+    Spill/Stack Instructions: The total number of spill/stack memory instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Read Instructions: The total number of spill/stack memory read instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Write Instructions: The total number of spill/stack memory write instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Atomic Instructions: The total number of spill/stack memory atomic
+      (with and without return) instructions executed on all compute units on the
+      accelerator, per normalization unit. Typically unused as these memory operations
+      are typically used to implement thread-local storage.
+    Spill/Stack Total Cycles: The number of cycles the address processing unit spent
+      working on spill/stack instructions, per normalization unit.
+    Spill/Stack Coalesced Read: The number of cycles the address processing unit spent
+      working on coalesced spill/stack read instructions, per normalization unit.
+    Spill/Stack Coalesced Write: The number of cycles the address processing unit
+      spent working on coalesced spill/stack write instructions, per normalization
+      unit.
+    Data-Return Busy: Percent of the total CU cycles the data-return unit was busy
+      processing or waiting on data to return to the CU.
+    "Cache RAM \u2192 Data-Return Stall": Percent of the total CU cycles the data-return
+      unit was stalled on data to be returned from the vL1D Cache RAM.
+    "Workgroup manager \u2192 Data-Return Stall": Percent of the total CU cycles the
+      data-return unit was stalled by the workgroup manager due to initialization
+      of registers as a part of launching new workgroups.
+    Coalescable Instructions: The number of instructions submitted to the data-return
+      unit by the address processor that were found to be coalescable, per normalization
+      unit.
+    Read Instructions: The number of read instructions submitted to the data-return
+      unit by the address processor summed over all compute units on the accelerator,
+      per normalization unit. This is expected to be the sum of global/generic and
+      spill/stack reads in the address processor.
+    Write Instructions: The number of store instructions submitted to the data-return
+      unit by the address processor summed over all compute units on the accelerator,
+      per normalization unit. This is expected to be the sum of global/generic and
+      spill/stack stores in the address processor.
+    Atomic Instructions: The number of atomic instructions submitted to the data-return
+      unit by the address processor summed over all compute units on the accelerator,
+      per normalization unit. This is expected to be the sum of global/generic and
+      spill/stack atomics in the address processor.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1600_vector_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1600_vector_l1_data_cache.yaml
index e5b5eb9e9c..3125397a30 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1600_vector_l1_data_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1600_vector_l1_data_cache.yaml
@@ -2,117 +2,6 @@
 Panel Config:
   id: 1600
   title: Vector L1 Data Cache
-  metrics_description:
-    Hit rate: The ratio of the number of vL1D cache line requests that hit in vL1D
-      cache over the total number of cache line requests to the vL1D Cache RAM.
-    Bandwidth Utilization: The number of bytes looked up in the vL1D cache as a result
-      of VMEM instructions, as a percent of the peak theoretical bandwidth achievable
-      on the specific accelerator. The number of bytes is calculated as the number
-      of cache lines requested multiplied by the cache line size. This value does
-      not consider partial requests, so for instance, if only a single value is requested
-      in a cache line, the data movement will still be counted as a full cache line.
-    Utilization: Indicates how busy the vL1D Cache RAM was during the kernel execution.
-      The number of cycles where the vL1D Cache RAM is actively processing any request
-      divided by the number of cycles where the vL1D is active.
-    Coalescing: Indicates how well memory instructions were coalesced by the address
-      processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated
-      as the average number of thread-requests generated per instruction divided by
-      the ideal number of thread-requests per instruction.
-    Stalled on L2 Data: The ratio of the number of cycles where the vL1D is stalled
-      waiting for requested data to return from the L2 cache divided by the number
-      of cycles where the vL1D is active.
-    Stalled on L2 Req: The ratio of the number of cycles where the vL1D is stalled
-      waiting to issue a request for data to the L2 cache divided by the number of
-      cycles where the vL1D is active.
-    Tag RAM Stall (Read): The ratio of the number of cycles where the vL1D is stalled
-      due to Read requests with conflicting tags being looked up concurrently, divided
-      by the number of cycles where the vL1D is active.
-    Tag RAM Stall (Write): The ratio of the number of cycles where the vL1D is stalled
-      due to Write requests with conflicting tags being looked up concurrently, divided
-      by the number of cycles where the vL1D is active.
-    Tag RAM Stall (Atomic): The ratio of the number of cycles where the vL1D is stalled
-      due to Atomic requests with conflicting tags being looked up concurrently, divided
-      by the number of cycles where the vL1D is active.
-    Total Req: The total number of incoming requests from the address processing unit
-      after coalescing.
-    Read Req: The total number of incoming read requests from the address processing
-      unit after coalescing per normalization unit.
-    Write Req: The total number of incoming write requests from the address processing
-      unit after coalescing per normalization unit.
-    Atomic Req: The total number of incoming atomic requests from the address processing
-      unit after coalescing per normalization unit.
-    Cache BW: The number of bytes looked up in the vL1D cache as a result of VMEM
-      instructions divided by total duration. The number of bytes is calculated as
-      the number of cache lines requested multiplied by the cache line size.  This
-      value does not consider partial requests, so for instance, if only a single
-      value is requested in a cache line, the data movement will still be counted
-      as a full cache line.
-    Cache Hit Rate: The ratio of the number of vL1D cache line requests that hit in
-      vL1D cache over the total number of cache line requests to the vL1D Cache RAM.
-    Cache Accesses: The total number of cache line lookups in the vL1D.
-    Cache Hits: The number of cache accesses minus the number of outgoing requests
-      to the L2 cache, that is, the number of cache line requests serviced by the
-      vL1D Cache RAM per normalization unit.
-    Invalidations: The number of times the vL1D was issued a write-back invalidate
-      command during the kernel's execution per normalization unit. This may be triggered
-      by, for instance, the buffer_wbinvl1 instruction.
-    L1-L2 BW: The number of bytes transferred across the vL1D-L2 interface as a result
-      of VMEM instructions, divided by total duration. The number of bytes is calculated
-      as the number of cache lines requested multiplied by the cache line size. This
-      value does not consider partial requests, so for instance, if only a single
-      value is requested in a cache line, the data movement will still be counted
-      as a full cache line.
-    L1-L2 Read: The number of read requests for a vL1D cache line that were not satisfied
-      by the vL1D and must be retrieved from the to the L2 Cache per normalization
-      unit.
-    L1-L2 Write: The number of write requests to a vL1D cache line that were sent
-      through the vL1D to the L2 cache, per normalization unit.
-    L1-L2 Atomic: The number of atomic requests that are sent through the vL1D to
-      the L2 cache, per normalization unit. This includes requests for atomics with,
-      and without return.
-    L1 Access Latency: Calculated as the average number of cycles that a vL1D cache
-      line request spent in the vL1D cache pipeline.
-    L1-L2 Read Latency: Calculated as the average number of cycles that the vL1D cache
-      took to issue and receive read requests from the L2 Cache. This number also
-      includes requests for atomics with return values.
-    L1-L2 Write Latency: Calculated as the average number of cycles that the vL1D
-      cache took to issue and receive acknowledgement of a write request to the L2
-      Cache. This number also includes requests for atomics without return values.
-    NC - Read: Total read requests with NC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    UC - Read: Total read requests with UC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    CC - Read: Total read requests with CC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    RW - Read: Total read requests with RW mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    RW - Write: Total write requests with RW mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    NC - Write: Total write requests with NC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    UC - Write: Total write requests with UC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    CC - Write: Total write requests with CC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    NC - Atomic: Total atomic requests with NC mtype from this TCP to all TCCs Sum
-      over TCP instances per normalization unit.
-    UC - Atomic: Total atomic requests with UC mtype from this TCP to all TCCs Sum
-      over TCP instances per normalization unit.
-    CC - Atomic: Total atomic requests with CC mtype from this TCP to all TCCs Sum
-      over TCP instances per normalization unit.
-    RW - Atomic: Total atomic requests with RW mtype from this TCP to all TCCs Sum
-      over TCP instances per normalization unit.
-    Req: The number of translation requests made to the UTCL1 per normalization unit.
-    Hit Ratio: The ratio of the number of translation requests that hit in the UTCL1
-      divided by the total number of translation requests made to the UTCL1.
-    Hits: The number of translation requests that hit in the UTCL1, and could be reused,
-      per normalization unit.
-    Translation Misses: The total number of translation requests that missed in the
-      UTCL1 due to  translation not being present in the cache, per normalization
-      unit.
-    Permission Misses: "The total number of translation requests that missed in the\
-      \ UTCL1 due to a permission error, per normalization unit. This is unused and\
-      \ expected to be zero in most configurations for modern CDNA\u2122 accelerators."
   data source:
   - metric_table:
       id: 1601
@@ -181,17 +70,17 @@ Panel Config:
           avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom))
           min: MIN((TCP_TOTAL_ACCESSES_sum / $denom))
           max: MAX((TCP_TOTAL_ACCESSES_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req:
           avg: AVG((TCP_TOTAL_READ_sum / $denom))
           min: MIN((TCP_TOTAL_READ_sum / $denom))
           max: MAX((TCP_TOTAL_READ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Write Req:
           avg: AVG((TCP_TOTAL_WRITE_sum / $denom))
           min: MIN((TCP_TOTAL_WRITE_sum / $denom))
           max: MAX((TCP_TOTAL_WRITE_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Atomic Req:
           avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
             / $denom))
@@ -199,7 +88,7 @@ Panel Config:
             / $denom))
           max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
             / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Cache BW:
           avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
           min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
@@ -223,7 +112,7 @@ Panel Config:
           avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
           min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
           max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Cache Hits:
           avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
             + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
@@ -234,7 +123,7 @@ Panel Config:
           max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
             + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
             / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Invalidations:
           avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
           min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
@@ -252,12 +141,12 @@ Panel Config:
           avg: AVG((TCP_TCC_READ_REQ_sum / $denom))
           min: MIN((TCP_TCC_READ_REQ_sum / $denom))
           max: MAX((TCP_TCC_READ_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         L1-L2 Write:
           avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom))
           min: MIN((TCP_TCC_WRITE_REQ_sum / $denom))
           max: MAX((TCP_TCC_WRITE_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         L1-L2 Atomic:
           avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
             / $denom))
@@ -265,7 +154,7 @@ Panel Config:
             / $denom))
           max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
             / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
   - metric_table:
       id: 1604
       title: L1D - L2 Transactions
@@ -284,84 +173,84 @@ Panel Config:
           avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom))
           min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom))
           max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         UC - Read:
           xfer: Read
           coherency: UC
           avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom))
           min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom))
           max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         CC - Read:
           xfer: Read
           coherency: CC
           avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom))
           min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom))
           max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         RW - Read:
           xfer: Read
           coherency: RW
           avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom))
           min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom))
           max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         RW - Write:
           xfer: Write
           coherency: RW
           avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom))
           min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom))
           max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         NC - Write:
           xfer: Write
           coherency: NC
           avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom))
           min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom))
           max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         UC - Write:
           xfer: Write
           coherency: UC
           avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom))
           min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom))
           max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         CC - Write:
           xfer: Write
           coherency: CC
           avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom))
           min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom))
           max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         NC - Atomic:
           xfer: Atomic
           coherency: NC
           avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
           min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
           max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         UC - Atomic:
           xfer: Atomic
           coherency: UC
           avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
           min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
           max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         CC - Atomic:
           xfer: Atomic
           coherency: CC
           avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
           min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
           max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         RW - Atomic:
           xfer: Atomic
           coherency: RW
           avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
           min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
           max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
   - metric_table:
       id: 1605
       title: L1 Unified Translation Cache (UTCL1)
@@ -410,3 +299,106 @@ Panel Config:
         max: Max
         units: Unit
       metric: {}
+  metrics_description:
+    Hit rate: The ratio of the number of vL1D cache line requests that hit in vL1D
+      cache over the total number of cache line requests to the vL1D Cache RAM.
+    Bandwidth Utilization: The number of bytes looked up in the vL1D cache as a result
+      of VMEM instructions, as a percent of the peak theoretical bandwidth achievable
+      on the specific accelerator. The number of bytes is calculated as the number
+      of cache lines requested multiplied by the cache line size. This value does
+      not consider partial requests, so for instance, if only a single value is requested
+      in a cache line, the data movement will still be counted as a full cache line.
+    Utilization: Indicates how busy the vL1D Cache RAM was during the kernel execution.
+      The number of cycles where the vL1D Cache RAM is actively processing any request
+      divided by the number of cycles where the vL1D is active.
+    Coalescing: Indicates how well memory instructions were coalesced by the address
+      processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated
+      as the average number of thread-requests generated per instruction divided by
+      the ideal number of thread-requests per instruction.
+    Stalled on L2 Data: The ratio of the number of cycles where the vL1D is stalled
+      waiting for requested data to return from the L2 cache divided by the number
+      of cycles where the vL1D is active.
+    Stalled on L2 Req: The ratio of the number of cycles where the vL1D is stalled
+      waiting to issue a request for data to the L2 cache divided by the number of
+      cycles where the vL1D is active.
+    Tag RAM Stall (Read): The ratio of the number of cycles where the vL1D is stalled
+      due to Read requests with conflicting tags being looked up concurrently, divided
+      by the number of cycles where the vL1D is active.
+    Tag RAM Stall (Write): The ratio of the number of cycles where the vL1D is stalled
+      due to Write requests with conflicting tags being looked up concurrently, divided
+      by the number of cycles where the vL1D is active.
+    Tag RAM Stall (Atomic): The ratio of the number of cycles where the vL1D is stalled
+      due to Atomic requests with conflicting tags being looked up concurrently, divided
+      by the number of cycles where the vL1D is active.
+    Total Req: The total number of incoming requests from the address processing unit
+      after coalescing.
+    Read Req: The total number of incoming read requests from the address processing
+      unit after coalescing per normalization unit.
+    Write Req: The total number of incoming write requests from the address processing
+      unit after coalescing per normalization unit.
+    Atomic Req: The total number of incoming atomic requests from the address processing
+      unit after coalescing per normalization unit.
+    Cache BW: The number of bytes looked up in the vL1D cache as a result of VMEM
+      instructions divided by total duration. The number of bytes is calculated as
+      the number of cache lines requested multiplied by the cache line size. This
+      value does not consider partial requests, so for instance, if only a single
+      value is requested in a cache line, the data movement will still be counted
+      as a full cache line.
+    Cache Hit Rate: The ratio of the number of vL1D cache line requests that hit in
+      vL1D cache over the total number of cache line requests to the vL1D Cache RAM.
+    Cache Accesses: The total number of cache line lookups in the vL1D.
+    Cache Hits: The number of cache accesses minus the number of outgoing requests
+      to the L2 cache, that is, the number of cache line requests serviced by the
+      vL1D Cache RAM per normalization unit.
+    Invalidations: The number of times the vL1D was issued a write-back invalidate
+      command during the kernel's execution per normalization unit. This may be triggered
+      by, for instance, the buffer_wbinvl1 instruction.
+    L1-L2 BW: The number of bytes transferred across the vL1D-L2 interface as a result
+      of VMEM instructions, divided by total duration. The number of bytes is calculated
+      as the number of cache lines requested multiplied by the cache line size. This
+      value does not consider partial requests, so for instance, if only a single
+      value is requested in a cache line, the data movement will still be counted
+      as a full cache line.
+    L1-L2 Read: The number of read requests for a vL1D cache line that were not satisfied
+      by the vL1D and must be retrieved from the L2 cache, per normalization
+      unit.
+    L1-L2 Write: The number of write requests to a vL1D cache line that were sent
+      through the vL1D to the L2 cache, per normalization unit.
+    L1-L2 Atomic: The number of atomic requests that are sent through the vL1D to
+      the L2 cache, per normalization unit. This includes requests for atomics with
+      and without return.
+    NC - Read: Total read requests with NC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    UC - Read: Total read requests with UC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    CC - Read: Total read requests with CC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    RW - Read: Total read requests with RW mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    RW - Write: Total write requests with RW mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    NC - Write: Total write requests with NC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    UC - Write: Total write requests with UC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    CC - Write: Total write requests with CC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    NC - Atomic: Total atomic requests with NC mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    UC - Atomic: Total atomic requests with UC mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    CC - Atomic: Total atomic requests with CC mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    RW - Atomic: Total atomic requests with RW mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    Req: The number of translation requests made to the UTCL1 per normalization unit.
+    Hit Ratio: The ratio of the number of translation requests that hit in the UTCL1
+      divided by the total number of translation requests made to the UTCL1.
+    Hits: The number of translation requests that hit in the UTCL1, and could be reused,
+      per normalization unit.
+    Translation Misses: The total number of translation requests that missed in the
+      UTCL1 due to translation not being present in the cache, per normalization unit.
+    Permission Misses: |-
+      The total number of translation requests that missed in the UTCL1 due
+      to a permission error, per normalization unit. This is unused and expected
+      to be zero in most configurations for modern CDNA™ accelerators.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1700_l2_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1700_l2_cache.yaml
index 6f9bc120d7..a2007f667a 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1700_l2_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1700_l2_cache.yaml
@@ -2,218 +2,6 @@
 Panel Config:
   id: 1700
   title: L2 Cache
-  metrics_description:
-    Utilization: The ratio of the number of cycles an L2 channel was active, summed
-      over all L2 channels on the accelerator over the total L2 cycles.
-    Peak Bandwidth: The number of bytes looked up in the L2 cache, as a percent of
-      the peak theoretical bandwidth achievable on the specific accelerator. The number
-      of bytes is calculated as the number of cache lines requested multiplied by
-      the cache line size. This value does not consider partial requests, so e.g.,
-      if only a single value is requested in a cache line, the data movement will
-      still be counted as a full cache line.
-    Hit Rate: The ratio of the number of L2 cache line requests that hit in the L2
-      cache over the total number of incoming cache line requests to the L2 cache.
-    L2-Fabric Read BW: The number of bytes read by the L2 over the Infinity Fabric
-      interface per unit time.
-    L2-Fabric Write and Atomic BW: The number of bytes sent by the L2 over the Infinity
-      Fabric interface by write and atomic operations per unit time.
-    HBM Bandwidth: Maximum theoretical bandwidth of the accelerator's local high-bandwidth
-      memory (HBM) per unit time. This value is calculated as the number of HBM channels
-      multiplied by the HBM channel width multiplied by the HBM clock frequency.
-    Read BW: The total number of bytes read by the L2 cache from Infinity Fabric divided
-      by total duration.
-    HBM Read Traffic: The percent of read requests generated by the L2 cache that
-      are routed to the accelerator's local high-bandwidth memory (HBM). This breakdown
-      does not consider the size of the request (meaning that 32B and 64B requests
-      are both counted as a single request), so this metric only approximates the
-      percent of the L2-Fabric Read bandwidth directed to the local HBM.
-    Remote Read Traffic: The percent of read requests generated by the L2 cache that
-      are routed to any memory location other than the accelerator's local high-bandwidth
-      memory (HBM) - for example, the CPU's DRAM or a remote accelerator's HBM. This
-      breakdown does not consider the size of the request (meaning that 32B and 64B
-      requests are both counted as a single request), so this metric only approximates
-      the percent of the L2-Fabric Read bandwidth directed to a remote location.
-    Uncached Read Traffic: The percent of read requests generated by the L2 cache
-      that are reading from an uncached memory allocation. Note, as described in the
-      request flow section, a single 64B read request is typically counted as two
-      uncached read requests. So, it is possible for the Uncached Read Traffic to
-      reach up to 200% of the total number of read requests. This breakdown does not
-      consider the size of the request (i.e., 32B and 64B requests are both counted
-      as a single request), so this metric only approximates the percent of the L2-Fabric
-      read bandwidth directed to an uncached memory location.
-    Write and Atomic BW: The total number of bytes written by the L2 over Infinity
-      Fabric by write and atomic operations divided by total duration. Note that on
-      current CDNA accelerators, such as the MI2XX, requests are only considered atomic
-      by Infinity Fabric if they are targeted at non-write-cacheable memory, for example,
-      fine-grained memory allocations or uncached memory allocations on the MI2XX.
-    HBM Write and Atomic Traffic: The percent of write and atomic requests generated
-      by the L2 cache that are routed to the accelerator's local high-bandwidth memory
-      (HBM). This breakdown does not consider the size of the request (meaning that
-      32B and 64B requests are both counted as a single request), so this metric only
-      approximates the percent of the L2-Fabric Write and Atomic bandwidth directed
-      to the local HBM. Note that on current CDNA accelerators, such as the MI2XX,
-      requests are only considered atomic by Infinity Fabric if they are targeted
-      at fine-grained memory allocations or uncached memory allocations.
-    Remote Write and Atomic Traffic: The percent of read requests generated by the
-      L2 cache that are routed to any memory location other than the accelerator's
-      local high-bandwidth memory (HBM) - for example, the CPU's DRAM or a remote
-      accelerator's HBM. This breakdown does not consider the size of the request
-      (meaning that 32B and 64B requests are both counted as a single request), so
-      this metric only approximates the percent of the L2-Fabric Read bandwidth directed
-      to a remote location. Note that on current CDNA accelerators, such as the MI2XX,
-      requests are only considered atomic by Infinity Fabric if they are targeted
-      at fine-grained memory allocations or uncached memory allocations.
-    Atomic Traffic: The percent of write requests generated by the L2 cache that are
-      atomic requests to any memory location. This breakdown does not consider the
-      size of the request (meaning that 32B and 64B requests are both counted as a
-      single request), so this metric only approximates the percent of the L2-Fabric
-      Read bandwidth directed to a remote location. Note that on current CDNA accelerators,
-      such as the MI2XX, requests are only considered atomic by Infinity Fabric if
-      they are targeted at fine-grained memory allocations or uncached memory allocations.
-    Uncached Write and Atomic Traffic: The percent of write and atomic requests generated
-      by the L2 cache that are targeting uncached memory allocations. This breakdown
-      does not consider the size of the request (meaning that 32B and 64B requests
-      are both counted as a single request), so this metric only approximates the
-      percent of the L2-Fabric read bandwidth directed to uncached memory allocations.
-    Read Latency: The time-averaged number of cycles read requests spent in Infinity
-      Fabric before data was returned to the L2.
-    Write and Atomic Latency: The time-averaged number of cycles write requests spent
-      in Infinity Fabric before a completion acknowledgement was returned to the L2.
-    Atomic Latency: The time-averaged number of cycles atomic requests spent in Infinity
-      Fabric before a completion acknowledgement (atomic without return value) or
-      data (atomic with return value) was returned to the L2.
-    Bandwidth: The number of bytes looked up in the L2 cache, divided by total duration.
-      The number of bytes is calculated as the number of cache lines requested multiplied
-      by the cache line size. This value does not consider partial requests, so for
-      example, if only a single value is requested in a cache line, the data movement
-      will still be counted as a full cache line.
-    Read Bandwidth: Total number of bytes looked up in the L2 cache for read requests,
-      divided by total duration.
-    Write Bandwidth: Total number of bytes looked up in the L2 cache for write requests,
-      divided by total duration.
-    Atomic Bandwidth: Total number of bytes looked up in the L2 cache for atomic requests,
-      divided by total duration.
-    Req: The total number of incoming requests to the L2 from all clients for all
-      request types, per normalization unit.
-    Read Req: The total number of read requests to the L2 from all clients.
-    Write Req: The total number of write requests to the L2 from all clients.
-    Atomic Req: The total number of atomic requests (with and without return) to the
-      L2 from all clients.
-    Streaming Req: The total number of incoming requests to the L2 that are marked
-      as streaming. The exact meaning of this may differ depending on the targeted
-      accelerator, however on an MI2XX this corresponds to non-temporal load or stores.
-      The L2 cache attempts to evict streaming requests before normal requests when
-      the L2 is at capacity.
-    Probe Req: The number of coherence probe requests made to the L2 cache from outside
-      the accelerator. On an MI2XX, probe requests may be generated by, for example,
-      writes to fine-grained device memory or by writes to coarse-grained device memory.
-    Cache Hit: The ratio of the number of L2 cache line requests that hit in the L2
-      cache over the total number of incoming cache line requests to the L2 cache.
-    Hits: The total number of requests to the L2 from all clients that hit in the
-      cache. As noted in the Speed-of-Light section, this includes hit-on-miss requests.
-    Misses: The total number of requests to the L2 from all clients that miss in the
-      cache. As noted in the Speed-of-Light section, these do not include hit-on-miss
-      requests.
-    Writeback: The total number of L2 cache lines written back to memory for any reason.
-      Write-backs may occur due to user code (such as HIP kernel calls to _threadfence_system
-      or atomic built-ins) by the command processor's memory acquire/release fences,
-      or for other internal hardware reasons.
-    Writeback (Internal): The total number of L2 cache lines written back to memory
-      for internal hardware reasons, per normalization unit.
-    Writeback (vL1D Req): The total number of L2 cache lines written back to memory
-      due to requests initiated by the vL1D cache, per normalization unit.
-    Evict (Internal): The total number of L2 cache lines evicted from the cache due
-      to capacity limits, per normalization unit.
-    Evict (vL1D Req): The total number of L2 cache lines evicted from the cache due
-      to invalidation requests initiated by the vL1D cache, per normalization unit.
-    NC Req: The total number of requests to the L2 to Not-hardware-Coherent (NC) memory
-      allocations, per normalization unit.
-    UC Req: The total number of requests to the L2 that go to Uncached (UC) memory
-      allocations.
-    CC Req: The total number of requests to the L2 that go to Coherently Cacheable
-      (CC) memory allocations.
-    RW Req: The total number of requests to the L2 that go to Read-Write coherent
-      memory (RW) allocations.
-    Write - Credit Starvation: The number of cycles the L2-Fabric interface was stalled
-      on write or atomic requests to any memory location because too many write/atomic
-      requests were currently in flight, as a percent of the total active L2 cycles.
-    Read (32B): The total number of L2 requests to Infinity Fabric to read 32B of
-      data from any memory location, per normalization unit.
-    Read (64B): The total number of L2 requests to Infinity Fabric to read 64B of
-      data from any memory location, per normalization unit.
-    Read (Uncached): The total number of L2 requests to Infinity Fabric to read uncached
-      data from any memory location, per normalization unit. 64B requests for uncached
-      data are counted as two 32B uncached data requests.
-    HBM Read: The total number of L2 requests to Infinity Fabric to read 32B or 64B
-      of data from the accelerator's local HBM, per normalization unit.
-    Remote Read: The total number of L2 requests to Infinity Fabric to read 32B or
-      64B of data from any source other than the accelerator's local HBM, per normalization
-      unit.
-    Read Bandwidth - PCIe: Total number of bytes due to L2 read requests due to PCIe
-      traffic, divided by total duration.
-    "Read Bandwidth - Infinity Fabric\u2122": Total number of bytes due to L2 read
-      requests due to Infinity Fabric traffic, divided by total duration.
-    Read Bandwidth - HBM: Total number of bytes due to L2 read requests due to HBM
-      traffic, divided by total duration.
-    Write and Atomic (32B): The total number of L2 requests to Infinity Fabric to
-      write or atomically update 32B of data to any memory location, per normalization
-      unit.
-    Write and Atomic (Uncached): The total number of L2 requests to Infinity Fabric
-      to write or atomically update 32B or 64B of uncached data, per normalization
-      unit.
-    Write and Atomic (64B): The total number of L2 requests to Infinity Fabric to
-      write or atomically update 64B of data in any memory location, per normalization
-      unit.
-    HBM Write and Atomic: The total number of L2 requests to Infinity Fabric to write
-      or atomically update 32B or 64B of data in the accelerator's local HBM, per
-      normalization unit.
-    Remote Write and Atomic: The total number of L2 requests to Infinity Fabric to
-      write or atomically update 32B or 64B of data in any memory location other than
-      the accelerator's local HBM, per normalization unit.
-    Write Bandwidth - PCIe: Total number of bytes due to L2 write requests due to
-      PCIe traffic, divided by total duration.
-    "Write Bandwidth - Infinity Fabric\u2122": Total number of bytes due to L2 write
-      requests due to Infinity Fabric traffic, divided by total duration.
-    Write Bandwidth - HBM: Total number of bytes due to L2 write requests due to HBM
-      traffic, divided by total duration.
-    Atomic Bandwidth - PCIe: Total number of bytes due to L2 atomic requests due to
-      PCIe traffic, divided by total duration.
-    "Atomic Bandwidth - Infinity Fabric\u2122": Total number of bytes due to L2 atomic
-      requests due to Infinity Fabric traffic, divided by total duration.
-    Atomic Bandwidth - HBM: Total number of bytes due to L2 atomic requests due to
-      HBM traffic, divided by total duration.
-    Atomic: The total number of L2 requests to Infinity Fabric to atomically update
-      32B or 64B of data in any memory location, per normalization unit. See Request
-      flow for more detail. Note that on current CDNA accelerators, such as the MI2XX,
-      requests are only considered atomic by Infinity Fabric if they are targeted
-      at non-write-cacheable memory, such as fine-grained memory allocations or uncached
-      memory allocations on the MI2XX.
-    Read Stall: "The ratio of the total number of cycles the L2-Fabric interface was\
-      \ stalled on a read request to any destination (local HBM, remote PCIe\xAE connected\
-      \ accelerator or CPU, or remote Infinity Fabric connected accelerator or CPU)\
-      \ over the total active L2 cycles."
-    Write Stall: The ratio of the total number of cycles the L2-Fabric interface was
-      stalled on a write or atomic request to any destination (local HBM, remote accelerator
-      or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected
-      accelerator or CPU) over the total active L2 cycles.
-    Read - PCIe Stall: The number of cycles the L2-Fabric interface was stalled on
-      read requests to remote PCIe connected accelerators or CPUs as a percent of
-      the total active L2 cycles.
-    Read - Infinity Fabric Stall: The number of cycles the L2-Fabric interface was
-      stalled on read requests to remote Infinity Fabric connected accelerators or
-      CPUs as a percent of the total active L2 cycles.
-    Read - HBM Stall: The number of cycles the L2-Fabric interface was stalled on
-      read requests to the accelerator's local HBM as a percent of the total active
-      L2 cycles.
-    Write - PCIe Stall: The number of cycles the L2-Fabric interface was stalled on
-      write or atomic requests to remote PCIe connected accelerators or CPUs as a
-      percent of the total active L2 cycles.
-    Write - Infinity Fabric Stall: The number of cycles the L2-Fabric interface was
-      stalled on write or atomic requests to remote Infinity Fabric connected accelerators
-      or CPUs as a percent of the total active L2 cycles.
-    Write - HBM Stall: The number of cycles the L2-Fabric interface was stalled on
-      write or atomic requests to accelerator's local HBM as a percent of the total
-      active L2 cycles.
   data source:
   - metric_table:
       id: 1701
@@ -374,32 +162,32 @@ Panel Config:
           avg: AVG((TCC_REQ_sum / $denom))
           min: MIN((TCC_REQ_sum / $denom))
           max: MAX((TCC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req:
           avg: AVG((TCC_READ_sum / $denom))
           min: MIN((TCC_READ_sum / $denom))
           max: MAX((TCC_READ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Write Req:
           avg: AVG((TCC_WRITE_sum / $denom))
           min: MIN((TCC_WRITE_sum / $denom))
           max: MAX((TCC_WRITE_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Atomic Req:
           avg: AVG((TCC_ATOMIC_sum / $denom))
           min: MIN((TCC_ATOMIC_sum / $denom))
           max: MAX((TCC_ATOMIC_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Streaming Req:
           avg: AVG((TCC_STREAMING_REQ_sum / $denom))
           min: MIN((TCC_STREAMING_REQ_sum / $denom))
           max: MAX((TCC_STREAMING_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Probe Req:
           avg: AVG((TCC_PROBE_sum / $denom))
           min: MIN((TCC_PROBE_sum / $denom))
           max: MAX((TCC_PROBE_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Cache Hit:
           avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
             + TCC_MISS_sum) != 0) else None))
@@ -412,17 +200,17 @@ Panel Config:
           avg: AVG((TCC_HIT_sum / $denom))
           min: MIN((TCC_HIT_sum / $denom))
           max: MAX((TCC_HIT_sum / $denom))
-          unit: (Hits  + $normUnit)
+          unit: (Hits + $normUnit)
         Misses:
           avg: AVG((TCC_MISS_sum / $denom))
           min: MIN((TCC_MISS_sum / $denom))
           max: MAX((TCC_MISS_sum / $denom))
-          unit: (Misses  + $normUnit)
+          unit: (Misses + $normUnit)
         Writeback:
           avg: AVG((TCC_WRITEBACK_sum / $denom))
           min: MIN((TCC_WRITEBACK_sum / $denom))
           max: MAX((TCC_WRITEBACK_sum / $denom))
-          unit: (Cachelines  + $normUnit)
+          unit: (Cachelines + $normUnit)
         Writeback (Internal):
           avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom))
           min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom))
@@ -447,22 +235,22 @@ Panel Config:
           avg: AVG((TCC_NC_REQ_sum / $denom))
           min: MIN((TCC_NC_REQ_sum / $denom))
           max: MAX((TCC_NC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         UC Req:
           avg: AVG((TCC_UC_REQ_sum / $denom))
           min: MIN((TCC_UC_REQ_sum / $denom))
           max: MAX((TCC_UC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         CC Req:
           avg: AVG((TCC_CC_REQ_sum / $denom))
           min: MIN((TCC_CC_REQ_sum / $denom))
           max: MAX((TCC_CC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         RW Req:
           avg: AVG((TCC_RW_REQ_sum / $denom))
           min: MIN((TCC_RW_REQ_sum / $denom))
           max: MAX((TCC_RW_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
   - metric_table:
       id: 1704
       title: L2 Cache Stalls
@@ -511,7 +299,7 @@ Panel Config:
           avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom))
           min: MIN((TCC_EA0_RDREQ_32B_sum / $denom))
           max: MAX((TCC_EA0_RDREQ_32B_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read (64B):
           avg: AVG(MAX(((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum)
             / $denom), 0))
@@ -519,54 +307,216 @@ Panel Config:
             / $denom), 0))
           max: MAX(MAX(((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum)
             / $denom), 0))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read (128B):
           avg: AVG(((TCC_BUBBLE_sum) / $denom))
           min: MIN(((TCC_BUBBLE_sum) / $denom))
           max: MAX(((TCC_BUBBLE_sum) / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read (Uncached):
           avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
           min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
           max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         HBM Read:
           avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom))
           min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom))
           max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Remote Read:
           avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
           min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
           max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Write and Atomic (32B):
           avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
           min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
           max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Write and Atomic (Uncached):
           avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
           min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
           max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Write and Atomic (64B):
           avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom))
           min: MIN((TCC_EA0_WRREQ_64B_sum / $denom))
           max: MAX((TCC_EA0_WRREQ_64B_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         HBM Write and Atomic:
           avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom))
           min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom))
           max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Remote Write and Atomic:
           avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
           min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
           max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Atomic:
           avg: AVG((TCC_EA0_ATOMIC_sum / $denom))
           min: MIN((TCC_EA0_ATOMIC_sum / $denom))
           max: MAX((TCC_EA0_ATOMIC_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
+  metrics_description:
+    Utilization: The ratio of the number of cycles an L2 channel was active, summed
+      over all L2 channels on the accelerator over the total L2 cycles.
+    Peak Bandwidth: The number of bytes looked up in the L2 cache, as a percent of
+      the peak theoretical bandwidth achievable on the specific accelerator. The number
+      of bytes is calculated as the number of cache lines requested multiplied by
+      the cache line size. This value does not consider partial requests, so e.g.,
+      if only a single value is requested in a cache line, the data movement will
+      still be counted as a full cache line.
+    Hit Rate: The ratio of the number of L2 cache line requests that hit in the L2
+      cache over the total number of incoming cache line requests to the L2 cache.
+    L2-Fabric Read BW: The number of bytes read by the L2 over the Infinity Fabric
+      interface per unit time.
+    L2-Fabric Write and Atomic BW: The number of bytes sent by the L2 over the Infinity
+      Fabric interface by write and atomic operations per unit time.
+    HBM Bandwidth: Maximum theoretical bandwidth of the accelerator's local high-bandwidth
+      memory (HBM) per unit time. This value is calculated as the number of HBM channels
+      multiplied by the HBM channel width multiplied by the HBM clock frequency.
+    Read BW: The total number of bytes read by the L2 cache from Infinity Fabric divided
+      by total duration.
+    HBM Read Traffic: The percent of read requests generated by the L2 cache that
+      are routed to the accelerator's local high-bandwidth memory (HBM). This breakdown
+      does not consider the size of the request (meaning that 32B and 64B requests
+      are both counted as a single request), so this metric only approximates the
+      percent of the L2-Fabric Read bandwidth directed to the local HBM.
+    Remote Read Traffic: The percent of read requests generated by the L2 cache that
+      are routed to any memory location other than the accelerator's local high-bandwidth
+      memory (HBM) - for example, the CPU's DRAM or a remote accelerator's HBM. This
+      breakdown does not consider the size of the request (meaning that 32B and 64B
+      requests are both counted as a single request), so this metric only approximates
+      the percent of the L2-Fabric Read bandwidth directed to a remote location.
+    Uncached Read Traffic: The percent of read requests generated by the L2 cache
+      that are reading from an uncached memory allocation. Note, as described in the
+      request flow section, a single 64B read request is typically counted as two
+      uncached read requests. So, it is possible for the Uncached Read Traffic to
+      reach up to 200% of the total number of read requests. This breakdown does not
+      consider the size of the request (i.e., 32B and 64B requests are both counted
+      as a single request), so this metric only approximates the percent of the L2-Fabric
+      read bandwidth directed to an uncached memory location.
+    Write and Atomic BW: The total number of bytes written by the L2 over Infinity
+      Fabric by write and atomic operations divided by total duration. Note that on
+      current CDNA accelerators, such as the MI2XX, requests are only considered atomic
+      by Infinity Fabric if they are targeted at non-write-cacheable memory, for example,
+      fine-grained memory allocations or uncached memory allocations on the MI2XX.
+    HBM Write and Atomic Traffic: The percent of write and atomic requests generated
+      by the L2 cache that are routed to the accelerator's local high-bandwidth memory
+      (HBM). This breakdown does not consider the size of the request (meaning that
+      32B and 64B requests are both counted as a single request), so this metric only
+      approximates the percent of the L2-Fabric Write and Atomic bandwidth directed
+      to the local HBM. Note that on current CDNA accelerators, such as the MI2XX,
+      requests are only considered atomic by Infinity Fabric if they are targeted
+      at fine-grained memory allocations or uncached memory allocations.
+    Remote Write and Atomic Traffic: The percent of write and atomic requests generated
+      by the L2 cache that are routed to any memory location other than the accelerator's
+      local high-bandwidth memory (HBM) - for example, the CPU's DRAM or a remote
+      accelerator's HBM. This breakdown does not consider the size of the request
+      (meaning that 32B and 64B requests are both counted as a single request), so
+      this metric only approximates the percent of the L2-Fabric Write and Atomic
+      bandwidth directed to a remote location. Note that on current CDNA accelerators,
+      such as the MI2XX, requests are only considered atomic by Infinity Fabric if
+      they are targeted at fine-grained memory allocations or uncached memory allocations.
+    Atomic Traffic: The percent of write requests generated by the L2 cache that are
+      atomic requests to any memory location. This breakdown does not consider the
+      size of the request (meaning that 32B and 64B requests are both counted as a
+      single request), so this metric only approximates the percent of the L2-Fabric
+      Write and Atomic bandwidth used by atomic requests. Note that on current CDNA
+      accelerators, such as the MI2XX, requests are only considered atomic by Infinity Fabric
+      if they are targeted at fine-grained memory allocations or uncached memory allocations.
+    Uncached Write and Atomic Traffic: The percent of write and atomic requests generated
+      by the L2 cache that are targeting uncached memory allocations. This breakdown
+      does not consider the size of the request (meaning that 32B and 64B requests
+      are both counted as a single request), so this metric only approximates the
+      percent of the L2-Fabric Write and Atomic bandwidth directed to uncached memory allocations.
+    Read Latency: The time-averaged number of cycles read requests spent in Infinity
+      Fabric before data was returned to the L2.
+    Write and Atomic Latency: The time-averaged number of cycles write requests spent
+      in Infinity Fabric before a completion acknowledgement was returned to the L2.
+    Atomic Latency: The time-averaged number of cycles atomic requests spent in Infinity
+      Fabric before a completion acknowledgement (atomic without return value) or
+      data (atomic with return value) was returned to the L2.
+    Bandwidth: The number of bytes looked up in the L2 cache, divided by total duration.
+      The number of bytes is calculated as the number of cache lines requested multiplied
+      by the cache line size. This value does not consider partial requests, so for
+      example, if only a single value is requested in a cache line, the data movement
+      will still be counted as a full cache line.
+    Req: The total number of incoming requests to the L2 from all clients for all
+      request types, per normalization unit.
+    Read Req: The total number of read requests to the L2 from all clients.
+    Write Req: The total number of write requests to the L2 from all clients.
+    Atomic Req: The total number of atomic requests (with and without return) to the
+      L2 from all clients.
+    Streaming Req: The total number of incoming requests to the L2 that are marked
+      as streaming. The exact meaning of this may differ depending on the targeted
+      accelerator; however, on an MI2XX this corresponds to non-temporal loads or stores.
+      The L2 cache attempts to evict streaming requests before normal requests when
+      the L2 is at capacity.
+    Probe Req: The number of coherence probe requests made to the L2 cache from outside
+      the accelerator. On an MI2XX, probe requests may be generated by, for example,
+      writes to fine-grained device memory or by writes to coarse-grained device memory.
+    Cache Hit: The ratio of the number of L2 cache line requests that hit in the L2
+      cache over the total number of incoming cache line requests to the L2 cache.
+    Hits: The total number of requests to the L2 from all clients that hit in the
+      cache. As noted in the Speed-of-Light section, this includes hit-on-miss requests.
+    Misses: The total number of requests to the L2 from all clients that miss in the
+      cache. As noted in the Speed-of-Light section, these do not include hit-on-miss
+      requests.
+    Writeback: The total number of L2 cache lines written back to memory for any reason.
+      Write-backs may occur due to user code (such as HIP kernel calls to __threadfence_system
+      or atomic built-ins), by the command processor's memory acquire/release fences,
+      or for other internal hardware reasons.
+    Writeback (Internal): The total number of L2 cache lines written back to memory
+      for internal hardware reasons, per normalization unit.
+    Writeback (vL1D Req): The total number of L2 cache lines written back to memory
+      due to requests initiated by the vL1D cache, per normalization unit.
+    Evict (Internal): The total number of L2 cache lines evicted from the cache due
+      to capacity limits, per normalization unit.
+    Evict (vL1D Req): The total number of L2 cache lines evicted from the cache due
+      to invalidation requests initiated by the vL1D cache, per normalization unit.
+    NC Req: The total number of requests to the L2 to Not-hardware-Coherent (NC) memory
+      allocations, per normalization unit.
+    UC Req: The total number of requests to the L2 that go to Uncached (UC) memory
+      allocations.
+    CC Req: The total number of requests to the L2 that go to Coherently Cacheable
+      (CC) memory allocations.
+    RW Req: The total number of requests to the L2 that go to Read-Write coherent
+      memory (RW) allocations.
+    Write - Credit Starvation: The number of cycles the L2-Fabric interface was stalled
+      on write or atomic requests to any memory location because too many write/atomic
+      requests were currently in flight, as a percent of the total active L2 cycles.
+    Read (32B): The total number of L2 requests to Infinity Fabric to read 32B of
+      data from any memory location, per normalization unit.
+    Read (64B): The total number of L2 requests to Infinity Fabric to read 64B of
+      data from any memory location, per normalization unit.
+    Read (Uncached): The total number of L2 requests to Infinity Fabric to read uncached
+      data from any memory location, per normalization unit. 64B requests for uncached
+      data are counted as two 32B uncached data requests.
+    HBM Read: The total number of L2 requests to Infinity Fabric to read 32B or 64B
+      of data from the accelerator's local HBM, per normalization unit.
+    Remote Read: The total number of L2 requests to Infinity Fabric to read 32B or
+      64B of data from any source other than the accelerator's local HBM, per normalization
+      unit.
+    Write and Atomic (32B): The total number of L2 requests to Infinity Fabric to
+      write or atomically update 32B of data to any memory location, per normalization
+      unit.
+    Write and Atomic (Uncached): The total number of L2 requests to Infinity Fabric
+      to write or atomically update 32B or 64B of uncached data, per normalization
+      unit.
+    Write and Atomic (64B): The total number of L2 requests to Infinity Fabric to
+      write or atomically update 64B of data in any memory location, per normalization
+      unit.
+    HBM Write and Atomic: The total number of L2 requests to Infinity Fabric to write
+      or atomically update 32B or 64B of data in the accelerator's local HBM, per
+      normalization unit.
+    Remote Write and Atomic: The total number of L2 requests to Infinity Fabric to
+      write or atomically update 32B or 64B of data in any memory location other than
+      the accelerator's local HBM, per normalization unit.
+    Atomic: The total number of L2 requests to Infinity Fabric to atomically update
+      32B or 64B of data in any memory location, per normalization unit. See Request
+      flow for more detail. Note that on current CDNA accelerators, such as the MI2XX,
+      requests are only considered atomic by Infinity Fabric if they are targeted
+      at non-write-cacheable memory, such as fine-grained memory allocations or uncached
+      memory allocations on the MI2XX.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1800_l2_cache_per_channel.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1800_l2_cache_per_channel.yaml
index 849662871e..75ce281b57 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1800_l2_cache_per_channel.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1800_l2_cache_per_channel.yaml
@@ -2,10 +2,6 @@
 Panel Config:
   id: 1800
   title: L2 Cache (per Channel)
-  metrics_description:
-    L2 Cache Hit Rate: The percent of total number of requests to the L2 from all
-      clients that hit in the cache. As noted in the Speed-of-Light section, this
-      includes hit-on-miss requests.
   data source:
   - metric_table:
       id: 1801
@@ -249,3 +245,7 @@ Panel Config:
           ::_1: $total_l2_chan
       cli_style: simple_box
       tui_style: simple_box
+  metrics_description:
+    L2 Cache Hit Rate: The percent of total number of requests to the L2 from all
+      clients that hit in the cache. As noted in the Speed-of-Light section, this
+      includes hit-on-miss requests.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/2100_pc_sampling.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/2100_pc_sampling.yaml
index e94471d7dc..16e4d01e7e 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/2100_pc_sampling.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/2100_pc_sampling.yaml
@@ -2,10 +2,10 @@
 Panel Config:
   id: 2100
   title: PC Sampling
-  metrics_description: {}
   data source:
   - pc_sampling_table:
       id: 2101
       title: PC Sampling
       source: ps_file
       comparable: false
+  metrics_description: {}
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/config_delta/gfx950_diff.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/config_delta/gfx950_diff.yaml
new file mode 100644
index 0000000000..72a6aed7fe
--- /dev/null
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/config_delta/gfx950_diff.yaml
@@ -0,0 +1,761 @@
+# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated by tools/config_management/generate_config_deltas.py
+Addition:
+  - Panel Config:
+      id: 200
+      title: System Speed-of-Light
+    metric_tables:
+      - metric_table:
+          id: 201
+          title: System Speed-of-Light
+          metrics:
+            - MFMA FLOPs (F6F4):
+                value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp)))
+                unit: GFLOP/s
+                peak: ((($max_sclk * $cu_per_gpu) * 16834) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 16834) / 1000))
+  - Panel Config:
+      id: 300
+      title: Memory Chart
+    metric_tables:
+      - metric_table:
+          id: 301
+          title: Memory Chart
+          metrics:
+            - L2 Wr Lat:
+                value: |
+                  ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else None)), 0)
+            - L2 Rd Lat:
+                value: |
+                  ROUND(AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else None)), 0)
+  - Panel Config:
+      id: 400
+      title: Roofline
+    metric_tables:
+      - metric_table:
+          id: 401
+          title: Roofline Performance Rates
+          metrics:
+            - MFMA FLOPs (F6F4):
+                value: |
+                  AVG((((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
+                unit: GFLOP/s
+                peak: $MFMA_FLOPs_F6F4_empirical_peak
+  - Panel Config:
+      id: 500
+      title: Command Processor (CPC/CPF)
+    metric_tables:
+      - metric_table:
+          id: 502
+          title: Command processor packet processor (CPC)
+          metrics:
+            - CPC CANE Stall Rate:
+                avg: AVG((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None)
+                min: MIN((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None)
+                max: MAX((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) else None)
+                unit: pct
+            - CPC ADC Utilization:
+                avg: AVG((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None)
+                min: MIN((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None)
+                max: MAX((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else None)
+                unit: pct
+            - CPC SYNC FIFO Full Rate:
+                avg: |
+                  AVG((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY != 0) else None)
+                min: |
+                  MIN((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY != 0) else None)
+                max: |
+                  MAX((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY != 0) else None)
+                unit: pct
+  - Panel Config:
+      id: 600
+      title: Workgroup Manager (SPI)
+    metric_tables:
+      - metric_table:
+          id: 601
+          title: Workgroup manager utilizations
+          metrics:
+            - Scheduler-Pipe Wave Utilization:
+                avg: |
+                  AVG(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
+                min: |
+                  MIN(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
+                max: |
+                  MAX(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
+                unit: Pct
+            - Schedule-Pipe Wave Occupancy:
+                avg: |
+                  AVG(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY + SPI_CSQ_P3_OCCUPANCY)
+                min: |
+                  MIN(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY + SPI_CSQ_P3_OCCUPANCY)
+                max: |
+                  MAX(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY + SPI_CSQ_P3_OCCUPANCY)
+                unit: Wave
+      - metric_table:
+          id: 602
+          title: Workgroup Manager - Resource Allocation
+          metrics:
+            - Scheduler-Pipe FIFO Full Rate:
+                avg: |
+                  AVG((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + SPI_CS2_CRAWLER_STALL + SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
+                min: |
+                  MIN((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + SPI_CS2_CRAWLER_STALL + SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
+                max: |
+                  MAX((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + SPI_CS2_CRAWLER_STALL + SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
+                unit: Pct
+  - Panel Config:
+      id: 1000
+      title: Compute Units - Instruction Mix
+    metric_tables:
+      - metric_table:
+          id: 1003
+          title: VMEM Instruction Mix
+          metrics:
+            - Spill/Stack Coalesceable Instr:
+                avg: AVG((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom))
+                min: MIN((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom))
+                max: MAX((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom))
+                unit: (instr + $normUnit)
+      - metric_table:
+          id: 1004
+          title: MFMA Arithmetic Instruction Mix
+          metrics:
+            - MFMA-F6F4:
+                avg: AVG((SQ_INSTS_VALU_MFMA_F6F4 / $denom))
+                min: MIN((SQ_INSTS_VALU_MFMA_F6F4 / $denom))
+                max: MAX((SQ_INSTS_VALU_MFMA_F6F4 / $denom))
+                unit: (instr + $normUnit)
+  - Panel Config:
+      id: 1100
+      title: Compute Units - Compute Pipeline
+    metric_tables:
+      - metric_table:
+          id: 1101
+          title: Compute Speed-of-Light
+          metrics:
+            - MFMA FLOPs (F6F4):
+                value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp)))
+                unit: GFLOP
+                peak: ((($max_sclk * $cu_per_gpu) * 16834) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 16834) / 1000))
+      - metric_table:
+          id: 1102
+          title: Pipeline Statistics
+          metrics:
+            - VALU Co-Issue Efficiency:
+                avg: AVG((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2))
+                min: MIN((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2))
+                max: MAX((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2))
+                unit: pct
+      - metric_table:
+          id: 1103
+          title: Arithmetic Operations
+          metrics:
+            - F6F4 OPs:
+                avg: AVG((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom)
+                min: MIN((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom)
+                max: MAX((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom)
+                unit: (OPs + $normUnit)
+  - Panel Config:
+      id: 1200
+      title: Local Data Share (LDS)
+    metric_tables:
+      - metric_table:
+          id: 1202
+          title: LDS Statistics
+          metrics:
+            - LDS STORE Bandwidth:
+                avg: AVG(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                min: MIN(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                max: MAX(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                units: Gbps
+            - LDS STORE:
+                avg: AVG((SQ_INSTS_LDS_STORE / $denom))
+                min: MIN((SQ_INSTS_LDS_STORE / $denom))
+                max: MAX((SQ_INSTS_LDS_STORE / $denom))
+                unit: (instr + $normUnit)
+            - LDS Data FIFO Full Rate:
+                avg: AVG((SQ_LDS_DATA_FIFO_FULL / $denom))
+                min: MIN((SQ_LDS_DATA_FIFO_FULL / $denom))
+                max: MAX((SQ_LDS_DATA_FIFO_FULL / $denom))
+                unit: (Cycles + $normUnit)
+            - LDS ATOMIC:
+                avg: AVG((SQ_INSTS_LDS_ATOMIC / $denom))
+                min: MIN((SQ_INSTS_LDS_ATOMIC / $denom))
+                max: MAX((SQ_INSTS_LDS_ATOMIC / $denom))
+                unit: (instr + $normUnit)
+            - LDS Command FIFO Full Rate:
+                avg: AVG((SQ_LDS_CMD_FIFO_FULL / $denom))
+                min: MIN((SQ_LDS_CMD_FIFO_FULL / $denom))
+                max: MAX((SQ_LDS_CMD_FIFO_FULL / $denom))
+                unit: (Cycles + $normUnit)
+            - LDS ATOMIC Bandwidth:
+                avg: AVG(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                min: MIN(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                max: MAX(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                units: Gbps
+            - LDS LOAD Bandwidth:
+                avg: AVG(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                min: MIN(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                max: MAX(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp))
+                units: Gbps
+            - LDS LOAD:
+                avg: AVG((SQ_INSTS_LDS_LOAD / $denom))
+                min: MIN((SQ_INSTS_LDS_LOAD / $denom))
+                max: MAX((SQ_INSTS_LDS_LOAD / $denom))
+                unit: (instr + $normUnit)
+  - Panel Config:
+      id: 1500
+      title: Address Processing Unit and Data Return Path (TA/TD)
+    metric_tables:
+      - metric_table:
+          id: 1504
+          title: Vector L1 data-return path or Texture Data (TD)
+          metrics:
+            - Write Ack Instructions:
+                avg: AVG((TD_WRITE_ACKT_WAVEFRONT_sum / $denom))
+                min: MIN((TD_WRITE_ACKT_WAVEFRONT_sum / $denom))
+                max: MAX((TD_WRITE_ACKT_WAVEFRONT_sum / $denom))
+                unit: (Instructions + $normUnit)
+      - metric_table:
+          id: 1502
+          title: Instruction counts
+          metrics:
+            - Spill/Stack Read Instructions for LDS:
+                avg: AVG((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom))
+                min: MIN((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom))
+                max: MAX((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom))
+                unit: (Instructions + $normUnit)
+            - Global/Generic Read Instructions for LDS:
+                avg: AVG((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom))
+                min: MIN((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom))
+                max: MAX((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom))
+                unit: (Instructions + $normUnit)
+  - Panel Config:
+      id: 1600
+      title: Vector L1 Data Cache
+    metric_tables:
+      - metric_table:
+          id: 1602
+          title: vL1D cache stall metrics
+          metrics:
+            - Stalled on Request FIFO:
+                expr: |
+                  (((100 * TCP_RFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None)
+            - Stalled on Read Return:
+                expr: |
+                  (((100 * TCP_TCR_RDRET_STALL_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None)
+            - Stalled on Address:
+                expr: |
+                  (((100 * TCP_TCP_TA_ADDR_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None)
+            - Stalled on Data:
+                expr: |
+                  (((100 * TCP_TCP_TA_DATA_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None)
+            - Stalled on Latency FIFO:
+                expr: |
+                  (((100 * TCP_LFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum != 0) else None)
+      - metric_table:
+          id: 1603
+          title: vL1D cache access metrics
+          metrics:
+            - L1 Access Latency:
+                avg: AVG((TCP_TCP_LATENCY_sum / $denom))
+                min: MIN((TCP_TCP_LATENCY_sum / $denom))
+                max: MAX((TCP_TCP_LATENCY_sum / $denom))
+                unit: (Cycles + $normUnit)
+            - Tag RAM 0 Req:
+                avg: AVG((TCP_TAGRAM0_REQ_sum / $denom))
+                min: MIN((TCP_TAGRAM0_REQ_sum / $denom))
+                max: MAX((TCP_TAGRAM0_REQ_sum / $denom))
+                unit: (Req + $normUnit)
+            - Tag RAM 1 Req:
+                avg: AVG((TCP_TAGRAM1_REQ_sum / $denom))
+                min: MIN((TCP_TAGRAM1_REQ_sum / $denom))
+                max: MAX((TCP_TAGRAM1_REQ_sum / $denom))
+                unit: (Req + $normUnit)
+            - L1-L2 Write Latency:
+                avg: AVG((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
+                min: MIN((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
+                max: MAX((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
+                unit: (Cycles + $normUnit)
+            - Tag RAM 2 Req:
+                avg: AVG((TCP_TAGRAM2_REQ_sum / $denom))
+                min: MIN((TCP_TAGRAM2_REQ_sum / $denom))
+                max: MAX((TCP_TAGRAM2_REQ_sum / $denom))
+                unit: (Req + $normUnit)
+            - L1-L2 Read Latency:
+                avg: AVG((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
+                min: MIN((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
+                max: MAX((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
+                unit: (Cycles + $normUnit)
+            - Tag RAM 3 Req:
+                avg: AVG((TCP_TAGRAM3_REQ_sum / $denom))
+                min: MIN((TCP_TAGRAM3_REQ_sum / $denom))
+                max: MAX((TCP_TAGRAM3_REQ_sum / $denom))
+                unit: (Req + $normUnit)
+      - metric_table:
+          id: 1605
+          title: L1 Unified Translation Cache (UTCL1)
+          metrics:
+            - Misses under Translation Miss:
+                avg: AVG((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom))
+                min: MIN((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom))
+                max: MAX((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom))
+                units: (Req + $normUnit)
+            - Inflight Req:
+                avg: AVG((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom))
+                min: MIN((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom))
+                max: MAX((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom))
+                units: (Req + $normUnit)
+      - metric_table:
+          id: 1606
+          title: L1D Addr Translation Stalls
+          metrics:
+            - Cache Miss Stall:
+                avg: AVG((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom))
+                min: MIN((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom))
+                max: MAX((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom))
+                units: (Cycles + $normUnit)
+            - Thrashing Stall:
+                avg: AVG((TCP_UTCL1_THRASHING_STALL_sum / $denom))
+                min: MIN((TCP_UTCL1_THRASHING_STALL_sum / $denom))
+                max: MAX((TCP_UTCL1_THRASHING_STALL_sum / $denom))
+                units: (Cycles + $normUnit)
+            - Cache Full Stall:
+                avg: AVG((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom))
+                min: MIN((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom))
+                max: MAX((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom))
+                units: (Cycles + $normUnit)
+            - Resident Page Full Stall:
+                avg: AVG((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom))
+                min: MIN((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom))
+                max: MAX((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom))
+                units: (Cycles + $normUnit)
+            - Serialization Stall:
+                avg: AVG((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom))
+                min: MIN((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom))
+                max: MAX((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom))
+                units: (Cycles + $normUnit)
+            - Latency FIFO Stall:
+                avg: AVG((TCP_UTCL1_LFIFO_FULL_sum / $denom))
+                min: MIN((TCP_UTCL1_LFIFO_FULL_sum / $denom))
+                max: MAX((TCP_UTCL1_LFIFO_FULL_sum / $denom))
+                units: (Cycles + $normUnit)
+            - UTCL2 Stall:
+                avg: AVG((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom))
+                min: MIN((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom))
+                max: MAX((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom))
+                units: (Cycles + $normUnit)
+  - Panel Config:
+      id: 1700
+      title: L2 Cache
+    metric_tables:
+      - metric_table:
+          id: 1702
+          title: L2-Fabric interface metrics
+          metrics:
+            - Write Stall:
+                avg: |
+                  AVG(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+            - Read Stall:
+                avg: |
+                  AVG((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+      - metric_table:
+          id: 1703
+          title: L2 Cache Accesses
+          metrics:
+            - Bypass Req:
+                avg: AVG((TCC_BYPASS_REQ_sum / $denom))
+                min: MIN((TCC_BYPASS_REQ_sum / $denom))
+                max: MAX((TCC_BYPASS_REQ_sum / $denom))
+                unit: (Req + $normUnit)
+            - Write Bandwidth:
+                avg: AVG(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Input Buffer Req:
+                avg: AVG((TCC_IB_REQ_sum / $denom))
+                min: MIN((TCC_IB_REQ_sum / $denom))
+                max: MAX((TCC_IB_REQ_sum / $denom))
+                unit: (Req + $normUnit)
+            - Atomic Bandwidth:
+                avg: AVG(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Read Bandwidth:
+                avg: AVG(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+      - metric_table:
+          id: 1704
+          title: L2 Cache Stalls
+          metrics:
+            - Stalled on Write Data FIFO:
+                avg: AVG(TCC_SRC_FIFO_FULL_sum / $denom)
+                min: MIN(TCC_SRC_FIFO_FULL_sum / $denom)
+                max: MAX(TCC_SRC_FIFO_FULL_sum / $denom)
+                unit: (Cycles + $normUnit)
+            - Input Buffer Stalled on L2:
+                avg: AVG(TCC_IB_STALL_sum / $denom)
+                min: MIN(TCC_IB_STALL_sum / $denom)
+                max: MAX(TCC_IB_STALL_sum / $denom)
+                unit: (Cycles + $normUnit)
+            - Stalled on Latency FIFO:
+                avg: AVG(TCC_LATENCY_FIFO_FULL_sum / $denom)
+                min: MIN(TCC_LATENCY_FIFO_FULL_sum / $denom)
+                max: MAX(TCC_LATENCY_FIFO_FULL_sum / $denom)
+                unit: (Cycles + $normUnit)
+      - metric_table:
+          id: 1705
+          title: L2 - Fabric Interface stalls
+          metrics:
+            - Read - HBM Stall:
+                type: HBM Stall
+                transaction: Read
+                avg: |
+                  AVG(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+            - Read - PCIe Stall:
+                type: PCIe Stall
+                transaction: Read
+                avg: |
+                  AVG(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+            - Read - Infinity Fabric Stall:
+                type: Infinity Fabric™ Stall
+                transaction: Read
+                avg: |
+                  AVG(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+            - Write - PCIe Stall:
+                type: PCIe Stall
+                transaction: Write
+                avg: |
+                  AVG(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+            - Write - Infinity Fabric Stall:
+                type: Infinity Fabric™ Stall
+                transaction: Write
+                avg: |
+                  AVG(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+            - Write - HBM Stall:
+                type: HBM Stall
+                transaction: Write
+                avg: |
+                  AVG(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                min: |
+                  MIN(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                max: |
+                  MAX(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) if (TCC_BUSY_sum != 0) else None))
+                unit: pct
+      - metric_table:
+          id: 1706
+          title: L2 - Fabric interface detailed metrics
+          metrics:
+            - Write Bandwidth - HBM:
+                avg: AVG(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Read Bandwidth - PCIe:
+                avg: AVG(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Read Bandwidth - HBM:
+                avg: AVG(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Write Bandwidth - Infinity Fabric™:
+                avg: AVG(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Atomic - HBM:
+                avg: AVG((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom))
+                min: MIN((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom))
+                max: MAX((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom))
+                unit: (Req + $normUnit)
+            - Write Bandwidth - PCIe:
+                avg: AVG(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Atomic Bandwidth - HBM:
+                avg: AVG(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Atomic Bandwidth - PCIe:
+                avg: AVG(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Read Bandwidth - Infinity Fabric™:
+                avg: AVG(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+            - Atomic Bandwidth - Infinity Fabric™:
+                avg: AVG(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                min: MIN(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                max: MAX(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+                unit: Gbps
+
+Deletion:
+  []
+
+Modification:
+  - Panel Config:
+      id: 200
+      title: System Speed-of-Light
+    metric_tables:
+      - metric_table:
+          id: 201
+          title: System Speed-of-Light
+          metrics:
+            - MFMA IOPs (Int8):
+                peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
+            - MFMA FLOPs (BF16):
+                peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
+            - MFMA FLOPs (F64):
+                peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000))
+            - MFMA FLOPs (F8):
+                peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
+            - MFMA FLOPs (F16):
+                peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
+  - Panel Config:
+      id: 300
+      title: Memory Chart
+    metric_tables:
+      - metric_table:
+          id: 301
+          title: Memory Chart
+          metrics:
+            - Workgroups:
+                value: |
+                  ROUND(AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS), 0)
+            - Wavefronts:
+                value: ROUND(AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE), 0)
+  - Panel Config:
+      id: 400
+      title: Roofline
+    metric_tables:
+      - metric_table:
+          id: 402
+          title: Roofline Plot Points
+          metrics:
+            - AI L1:
+                value: |
+                  ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64) )
+            - AI HBM:
+                value: |
+                  ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum * 64) ) )
+            - AI L2:
+                value: |
+                  ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / SUM( (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64 ) )
+            - Performance (GFLOPs):
+                value: |
+                  ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / (SUM(End_Timestamp - Start_Timestamp) / 1e9) ) / 1e9
+  - Panel Config:
+      id: 600
+      title: Workgroup Manager (SPI)
+    metric_tables:
+      - metric_table:
+          id: 601
+          title: Workgroup manager utilizations
+          metrics:
+            - Dispatched Wavefronts:
+                avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
+                max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
+                min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
+            - Dispatched Workgroups:
+                avg: |
+                  AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
+                max: |
+                  MAX(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
+                min: |
+                  MIN(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS)
+            - SGPR Writes:
+                avg: |
+                  AVG((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+                max: |
+                  MAX((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+                min: |
+                  MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+            - Scheduler-Pipe Utilization:
+                avg: |
+                  AVG(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
+                max: |
+                  MAX(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
+                min: |
+                  MIN(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
+            - VGPR Writes:
+                avg: |
+                  AVG((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+                max: |
+                  MAX((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+                min: |
+                  MIN((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
+  - Panel Config:
+      id: 700
+      title: Wavefront
+    metric_tables:
+      - metric_table:
+          id: 701
+          title: Wavefront Launch Stats
+          metrics:
+            - Total Wavefronts:
+                avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
+                max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
+                min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
+  - Panel Config:
+      id: 1100
+      title: Compute Units - Compute Pipeline
+    metric_tables:
+      - metric_table:
+          id: 1101
+          title: Compute Speed-of-Light
+          metrics:
+            - MFMA FLOPs (BF16):
+                peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
+            - MFMA IOPs (INT8):
+                peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
+            - MFMA FLOPs (F64):
+                peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000))
+            - MFMA FLOPs (F8):
+                peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
+            - MFMA FLOPs (F16):
+                peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
+                pop: |
+                  ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
+      - metric_table:
+          id: 1103
+          title: Arithmetic Operations
+          metrics:
+            - FLOPs (Total):
+                avg: |
+                  AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom))
+                max: |
+                  MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom))
+                min: |
+                  MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom))
+  - Panel Config:
+      id: 1700
+      title: L2 Cache
+    metric_tables:
+      - metric_table:
+          id: 1701
+          title: L2 Speed-of-Light
+          metrics:
+            - L2-Fabric Read BW:
+                value: |
+                  AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
+      - metric_table:
+          id: 1702
+          title: L2-Fabric interface metrics
+          metrics:
+            - Remote Read Traffic:
+                avg: |
+                  AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
+                max: |
+                  MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
+                min: |
+                  MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
+            - Read BW:
+                avg: |
+                  AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
+                max: |
+                  MAX((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
+                min: |
+                  MIN((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
+      - metric_table:
+          id: 1706
+          title: L2 - Fabric interface detailed metrics
+          metrics:
+            - Read (64B):
+                avg: AVG((TCC_EA0_RDREQ_64B_sum / $denom))
+                max: MAX((TCC_EA0_RDREQ_64B_sum / $denom))
+                min: MIN((TCC_EA0_RDREQ_64B_sum / $denom))
+            - Read (128B):
+                avg: AVG((TCC_EA0_RDREQ_128B_sum / $denom))
+                max: MAX((TCC_EA0_RDREQ_128B_sum / $denom))
+                min: MIN((TCC_EA0_RDREQ_128B_sum / $denom))
+            - HBM Write and Atomic:
+                avg: AVG((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
+                max: MAX((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
+                min: MIN((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
+  - Panel Config:
+      id: 1800
+      title: L2 Cache (per Channel)
+    metric_tables:
+      - metric_table:
+          id: 1809
+          title: L2-Fabric Read Stall (Cycles per normUnit)
+          metrics:
+            - ::_1:
+                ea read stall - pcie: AVG((TO_INT(TCC_EA0_RDREQ_IO_CREDIT_STALL[::_1]) / $denom))
+                ea read stall - if: AVG((TO_INT(TCC_EA0_RDREQ_GMI_CREDIT_STALL[::_1]) / $denom))
+                ea read stall - hbm: AVG((TO_INT(TCC_EA0_RDREQ_DRAM_CREDIT_STALL[::_1]) / $denom))
+      - metric_table:
+          id: 1810
+          title: L2-Fabric Write and Atomic Stall (Cycles per normUnit)
+          metrics:
+            - ::_1:
+                ea write stall - hbm: AVG((TO_INT(TCC_EA0_WRREQ_DRAM_CREDIT_STALL[::_1]) / $denom))
+                ea write stall - if: AVG((TO_INT(TCC_EA0_WRREQ_GMI_CREDIT_STALL[::_1]) / $denom))
+                ea write stall - pcie: AVG((TO_INT(TCC_EA0_WRREQ_IO_CREDIT_STALL[::_1]) / $denom))
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0000_top_stats.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0000_top_stats.yaml
index 55c6f6bb24..5ce5aeeb28 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0000_top_stats.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0000_top_stats.yaml
@@ -2,7 +2,6 @@
 Panel Config:
   id: 0
   title: Top Stats
-  metrics_description: {}
   data source:
   - raw_csv_table:
       id: 1
@@ -12,3 +11,4 @@ Panel Config:
       id: 2
       title: Dispatch List
       source: pmc_dispatch_info.csv
+  metrics_description: {}
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0100_system_info.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0100_system_info.yaml
index 23d024fde3..8b48c2253b 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0100_system_info.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0100_system_info.yaml
@@ -2,10 +2,10 @@
 Panel Config:
   id: 100
   title: System Info
-  metrics_description: {}
   data source:
   - raw_csv_table:
       id: 101
       title: System Info
       source: sysinfo.csv
       columnwise: true
+  metrics_description: {}
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0200_system_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0200_system_speed_of_light.yaml
index 84327d65ea..bdbd62f755 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0200_system_speed_of_light.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0200_system_speed_of_light.yaml
@@ -2,124 +2,6 @@
 Panel Config:
   id: 200
   title: System Speed-of-Light
-  metrics_description:
-    VALU FLOPs: 'The total floating-point operations executed per second on the VALU.
-      This is also presented as a percent of the peak theoretical FLOPs achievable
-      on the specific accelerator. Note: this does not include any floating-point
-      operations from MFMA instructions.'
-    VALU IOPs: 'The total integer operations executed per second on the VALU. This
-      is also presented as a percent of the peak theoretical IOPs achievable on the
-      specific accelerator. Note: this does not include any integer operations from
-      MFMA instructions.'
-    MFMA FLOPs (F8): The total number of 8-bit brain floating point MFMA operations
-      executed per second. This does not include any 16-bit brain floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F8 MFMA operations achievable on the specific accelerator. It is supported on
-      AMD Instinct MI300 series and later only.
-    MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations
-      executed per second. Note: this does not include any 16-bit brain floating point
-      operations from VALU instructions. This is also presented as a percent of the
-      peak theoretical BF16 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed
-      per second. Note: this does not include any 16-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F16 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed
-      per second. Note: this does not include any 32-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F32 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed
-      per second. Note: this does not include any 64-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F64 MFMA operations achievable on the specific accelerator.'
-    MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed
-      per second. Note: this does not include any 8-bit integer operations from VALU
-      instructions. This is also presented as a percent of the peak theoretical INT8
-      MFMA operations achievable on the specific accelerator.'
-    Active CUs: Total number of active compute units (CUs) on the accelerator during
-      the kernel execution.
-    SALU Utilization: Indicates what percent of the kernel's duration the SALU was
-      busy executing instructions. Computed as the ratio of the total number of cycles
-      spent by the scheduler issuing SALU or SMEM instructions over the total CU cycles.
-    VALU Utilization: Indicates what percent of the kernel's duration the VALU was
-      busy executing instructions. Does not include VMEM operations. Computed as the
-      ratio of the total number of cycles spent by the scheduler issuing VALU instructions
-      over the total CU cycles.
-    MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit
-      was busy executing instructions. Computed as the ratio of the total number of
-      cycles the MFMA was busy over the total CU cycles.
-    VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit
-      was busy executing instructions, including both global/generic and spill/scratch
-      operations (see the VMEM instruction count metrics) for more detail). Does not
-      include VALU operations. Computed as the ratio of the total number of cycles
-      spent by the scheduler issuing VMEM instructions over the total CU cycles.
-    Branch Utilization: Indicates what percent of the kernel's duration the branch
-      unit was busy executing instructions. Computed as the ratio of the total number
-      of cycles spent by the scheduler issuing branch instructions over the total
-      CU cycles
-    VALU Active Threads: Indicates the average level of divergence within a wavefront
-      over the lifetime of the kernel. The number of work-items that were active in
-      a wavefront during execution of each VALU instruction, time-averaged over all
-      VALU instructions run on all wavefronts in the kernel.
-    IPC: The ratio of the total number of instructions executed on the CU over the
-      total active CU cycles. This is also presented as a percent of the peak theoretical
-      bandwidth achievable on the specific accelerator.
-    Wavefront Occupancy: 'The time-averaged number of wavefronts resident on the accelerator
-      over the lifetime of the kernel. Note: this metric may be inaccurate for short-running
-      kernels (less than 1ms). This is also presented as a percent of the peak theoretical
-      occupancy achievable on the specific accelerator.'
-    Theoretical LDS Bandwidth: Indicates the maximum amount of bytes that could have
-      been loaded from, stored to, or atomically updated in the LDS per unit time
-      (see LDS Bandwidth example for more detail). This is also presented as a percent
-      of the peak theoretical F64 MFMA operations achievable on the specific accelerator.
-    LDS Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS
-      scheduler due to bank conflicts (as determined by the conflict resolution hardware)
-      to the base number of cycles that would be spent in the LDS scheduler in a completely
-      uncontended case. This is also presented in normalized form (i.e., the Bank
-      Conflict Rate).
-    vL1D Cache Hit Rate: The ratio of the number of vL1D cache line requests that
-      hit in vL1D cache over the total number of cache line requests to the vL1D cache
-      RAM.
-    vL1D Cache BW: The number of bytes looked up in the vL1D cache as a result of
-      VMEM instructions per unit time. The number of bytes is calculated as the number
-      of cache lines requested multiplied by the cache line size. This value does
-      not consider partial requests, so e.g., if only a single value is requested
-      in a cache line, the data movement will still be counted as a full cache line.
-      This is also presented as a percent of the peak theoretical bandwidth achievable
-      on the specific accelerator.
-    L2 Cache Hit Rate: The ratio of the number of L2 cache line requests that hit
-      in the L2 cache over the total number of incoming cache line requests to the
-      L2 cache.
-    L2 Cache BW: The number of bytes looked up in the L2 cache per unit time. The
-      number of bytes is calculated as the number of cache lines requested multiplied
-      by the cache line size. This value does not consider partial requests, so e.g.,
-      if only a single value is requested in a cache line, the data movement will
-      still be counted as a full cache line. This is also presented as a percent of
-      the peak theoretical bandwidth achievable on the specific accelerator.
-    L2-Fabric Read BW: "The number of bytes read by the L2 over the Infinity Fabric\u2122\
-      \ interface per unit time. This is also presented as a percent of the peak theoretical\
-      \ bandwidth achievable on the specific accelerator."
-    L2-Fabric Write BW: The number of bytes sent by the L2 over the Infinity Fabric
-      interface by write and atomic operations per unit time. This is also presented
-      as a percent of the peak theoretical bandwidth achievable on the specific accelerator.
-    L2-Fabric Read Latency: The time-averaged number of cycles read requests spent
-      in Infinity Fabric before data was returned to the L2.
-    L2-Fabric Write Latency: The time-averaged number of cycles write requests spent
-      in Infinity Fabric before a completion acknowledgement was returned to the L2.
-    sL1D Cache Hit Rate: The percent of sL1D requests that hit on a previously loaded
-      line the cache. Calculated as the ratio of the number of sL1D requests that
-      hit over the number of all sL1D requests.
-    sL1D Cache BW: The number of bytes looked up in the sL1D cache per unit time.
-      This is also presented as a percent of the peak theoretical bandwidth achievable
-      on the specific accelerator.
-    L1I Hit Rate: The number of bytes looked up in the L1I cache per unit time. This
-      is also presented as a percent of the peak theoretical bandwidth achievable
-      on the specific accelerator.
-    L1I BW: The percent of L1I requests that hit on a previously loaded line the cache.
-      Calculated as the ratio of the number of L1I requests that hit over the number
-      of all L1I requests.
-    L1I Fetch Latency: The average number of cycles spent to fetch instructions to
-      a CU.
   data source:
   - metric_table:
       id: 201
@@ -350,3 +232,130 @@ Panel Config:
           peak: None
           pop: None
           coll_level: SQ_IFETCH_LEVEL
+  metrics_description:
+    VALU FLOPs: |-
+      The total floating-point operations executed per second on the VALU.
+      This is also presented as a percent of the peak theoretical FLOPs achievable
+      on the specific accelerator. Note: this does not include any floating-point
+      operations from MFMA instructions.
+    VALU IOPs: |-
+      The total integer operations executed per second on the VALU. This is
+      also presented as a percent of the peak theoretical IOPs achievable on the
+      specific accelerator. Note: this does not include any integer operations from
+      MFMA instructions.
+    MFMA FLOPs (F8): The total number of 8-bit floating point MFMA operations executed
+      per second. This does not include any 8-bit floating point operations from
+      VALU instructions. This is also presented as a percent of the peak theoretical
+      F8 MFMA operations achievable on the specific accelerator. It is supported on
+      AMD Instinct MI300 series and later only.
+    MFMA FLOPs (BF16): |-
+      The total number of 16-bit brain floating point MFMA operations executed
+      per second. Note: this does not include any 16-bit brain floating point operations
+      from VALU instructions. This is also presented as a percent of the peak theoretical
+      BF16 MFMA operations achievable on the specific accelerator.
+    MFMA FLOPs (F16): |-
+      The total number of 16-bit floating point MFMA operations executed per
+      second. Note: this does not include any 16-bit floating point operations from
+      VALU instructions. This is also presented as a percent of the peak theoretical
+      F16 MFMA operations achievable on the specific accelerator.
+    MFMA FLOPs (F32): |-
+      The total number of 32-bit floating point MFMA operations executed per
+      second. Note: this does not include any 32-bit floating point operations from
+      VALU instructions. This is also presented as a percent of the peak theoretical
+      F32 MFMA operations achievable on the specific accelerator.
+    MFMA FLOPs (F64): |-
+      The total number of 64-bit floating point MFMA operations executed per
+      second. Note: this does not include any 64-bit floating point operations from
+      VALU instructions. This is also presented as a percent of the peak theoretical
+      F64 MFMA operations achievable on the specific accelerator.
+    MFMA IOPs (Int8): |-
+      The total number of 8-bit integer MFMA operations executed per second.
+      Note: this does not include any 8-bit integer operations from VALU instructions.
+      This is also presented as a percent of the peak theoretical INT8 MFMA operations
+      achievable on the specific accelerator.
+    Active CUs: Total number of active compute units (CUs) on the accelerator during
+      the kernel execution.
+    SALU Utilization: Indicates what percent of the kernel's duration the SALU was
+      busy executing instructions. Computed as the ratio of the total number of cycles
+      spent by the scheduler issuing SALU or SMEM instructions over the total CU cycles.
+    VALU Utilization: Indicates what percent of the kernel's duration the VALU was
+      busy executing instructions. Does not include VMEM operations. Computed as the
+      ratio of the total number of cycles spent by the scheduler issuing VALU instructions
+      over the total CU cycles.
+    MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit
+      was busy executing instructions. Computed as the ratio of the total number of
+      cycles the MFMA was busy over the total CU cycles.
+    VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit
+      was busy executing instructions, including both global/generic and spill/scratch
+      operations (see the VMEM instruction count metrics for more detail). Does not
+      include VALU operations. Computed as the ratio of the total number of cycles
+      spent by the scheduler issuing VMEM instructions over the total CU cycles.
+    Branch Utilization: Indicates what percent of the kernel's duration the branch
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles spent by the scheduler issuing branch instructions over the total
+      CU cycles.
+    VALU Active Threads: Indicates the average level of divergence within a wavefront
+      over the lifetime of the kernel. The number of work-items that were active in
+      a wavefront during execution of each VALU instruction, time-averaged over all
+      VALU instructions run on all wavefronts in the kernel.
+    IPC: The ratio of the total number of instructions executed on the CU over the
+      total active CU cycles. This is also presented as a percent of the peak theoretical
+      IPC achievable on the specific accelerator.
+    Wavefront Occupancy: |-
+      The time-averaged number of wavefronts resident on the accelerator over
+      the lifetime of the kernel. Note: this metric may be inaccurate for short-running
+      kernels (less than 1ms). This is also presented as a percent of the peak theoretical
+      occupancy achievable on the specific accelerator.
+    Theoretical LDS Bandwidth: Indicates the maximum amount of bytes that could have
+      been loaded from, stored to, or atomically updated in the LDS per unit time
+      (see LDS Bandwidth example for more detail). This is also presented as a percent
+      of the peak theoretical bandwidth achievable on the specific accelerator.
+    LDS Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS
+      scheduler due to bank conflicts (as determined by the conflict resolution hardware)
+      to the base number of cycles that would be spent in the LDS scheduler in a completely
+      uncontended case. This is also presented in normalized form (i.e., the Bank
+      Conflict Rate).
+    vL1D Cache Hit Rate: The ratio of the number of vL1D cache line requests that
+      hit in vL1D cache over the total number of cache line requests to the vL1D cache
+      RAM.
+    vL1D Cache BW: The number of bytes looked up in the vL1D cache as a result of
+      VMEM instructions per unit time. The number of bytes is calculated as the number
+      of cache lines requested multiplied by the cache line size. This value does
+      not consider partial requests, so e.g., if only a single value is requested
+      in a cache line, the data movement will still be counted as a full cache line.
+      This is also presented as a percent of the peak theoretical bandwidth achievable
+      on the specific accelerator.
+    L2 Cache Hit Rate: The ratio of the number of L2 cache line requests that hit
+      in the L2 cache over the total number of incoming cache line requests to the
+      L2 cache.
+    L2 Cache BW: The number of bytes looked up in the L2 cache per unit time. The
+      number of bytes is calculated as the number of cache lines requested multiplied
+      by the cache line size. This value does not consider partial requests, so e.g.,
+      if only a single value is requested in a cache line, the data movement will
+      still be counted as a full cache line. This is also presented as a percent of
+      the peak theoretical bandwidth achievable on the specific accelerator.
+    L2-Fabric Read BW: |-
+      The number of bytes read by the L2 over the Infinity Fabric™ interface
+      per unit time. This is also presented as a percent of the peak theoretical
+      bandwidth achievable on the specific accelerator.
+    L2-Fabric Write BW: The number of bytes sent by the L2 over the Infinity Fabric
+      interface by write and atomic operations per unit time. This is also presented
+      as a percent of the peak theoretical bandwidth achievable on the specific accelerator.
+    L2-Fabric Read Latency: The time-averaged number of cycles read requests spent
+      in Infinity Fabric before data was returned to the L2.
+    L2-Fabric Write Latency: The time-averaged number of cycles write requests spent
+      in Infinity Fabric before a completion acknowledgement was returned to the L2.
+    sL1D Cache Hit Rate: The percent of sL1D requests that hit on a previously loaded
+      line in the cache. Calculated as the ratio of the number of sL1D requests that
+      hit over the number of all sL1D requests.
+    sL1D Cache BW: The number of bytes looked up in the sL1D cache per unit time.
+      This is also presented as a percent of the peak theoretical bandwidth achievable
+      on the specific accelerator.
+    L1I Hit Rate: The percent of L1I requests that hit on a previously loaded line
+      in the cache. Calculated as the ratio of the number of L1I requests that hit
+      over the number of all L1I requests.
+    L1I BW: The number of bytes looked up in the L1I cache per unit time. This is
+      also presented as a percent of the peak theoretical bandwidth achievable on
+      the specific accelerator.
+    L1I Fetch Latency: The average number of cycles spent to fetch instructions to
+      a CU.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0300_memory_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0300_memory_chart.yaml
index e7b911a4b0..081d5654df 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0300_memory_chart.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0300_memory_chart.yaml
@@ -2,122 +2,6 @@
 Panel Config:
   id: 300
   title: Memory Chart
-  metrics_description:
-    Wavefront Occupancy: Wavefronts per active CU.
-    Wave Life: Average number of cycles executing a wave.
-    SALU: Total Number of SALU (Scalar ALU) instructions issued per normalization
-      unit.
-    SMEM: Total number of SMEM (Scalar Memory Read) instructions issued normalization
-      unit.
-    VALU: The number of VALU (Vector ALU) instructions issued per normalization unit.
-    MFMA: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued per
-      normalization unit.
-    VMEM: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch
-      memory) per normalization unit.
-    LDS: The total number of LDS instructions (including, but not limited to, read/write/atomics
-      and HIP's __shfl instructions) executed per normalization unit.
-    GWS: Total number of GDS (global data sync) instructions issued per normalization
-      unit.
-    BR: Total number of BRANCH instructions issued per normalization unit.
-    Active CUs: Total number of active compute units (CUs) on the accelerator during
-      the kernel execution.
-    Num CUs: Total number of compute units (CUs) on the accelerator.
-    VGPR: 'The number of architected vector general-purpose registers allocated for
-      the kernel, see VALU. Note: this may not exactly match the number of VGPRs requested
-      by the compiler due to allocation granularity.'
-    SGPR: 'The number of scalar general-purpose registers allocated for the kernel,
-      see SALU. Note: this may not exactly match the number of SGPRs requested by
-      the compiler due to allocation granularity.'
-    LDS Allocation: 'The number of bytes of LDS memory (or, shared memory) allocated
-      for this kernel. Note: This may also be larger than what was requested at compile
-      time due to both allocation granularity and dynamic per-dispatch LDS allocations.'
-    Scratch Allocation: The number of bytes of scratch memory requested per work-item
-      for this kernel. Scratch memory is used for stack memory on the accelerator,
-      as well as for register spills and restores.
-    Wavefronts: The total number of wavefronts, summed over all workgroups, forming
-      this kernel launch.
-    Workgroups: The total number of workgroups forming this kernel launch.
-    LDS Req: The total number of LDS instructions (including, but not limited to,
-      read/write/atomics and HIP's __shfl instructions) executed per normalization
-      unit.
-    LDS Util: Indicates what percent of the kernel's duration the LDS was actively
-      executing instructions (including, but not limited to, load, store, atomic and
-      HIP's __shfl operations). Calculated as the ratio of the total number of cycles
-      LDS was active over the total CU cycles.
-    LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return
-      / acknowledgment) required for an LDS instruction to complete.
-    VL1 Rd: The total number of incoming read requests from the address processing
-      unit after coalescing per normalization unit
-    VL1 Wr: The total number of incoming write requests from the address processing
-      unit after coalescing per normalization unit
-    VL1 Atomic: The total number of incoming atomic requests from the address processing
-      unit after coalescing per normalization unit
-    VL1 Hit: The ratio of the number of vL1D cache line requests that hit in vL1D
-      cache over the total number of cache line requests to the vL1D Cache RAM.
-    VL1 Lat: Calculated as the average number of cycles that a vL1D cache line request
-      spent in the vL1D cache pipeline.
-    VL1 Coalesce: Indicates how well memory instructions were coalesced by the address
-      processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated
-      as the average number of thread-requests generated per instruction divided by
-      the ideal number of thread-requests per instruction.
-    VL1 Stall: The ratio of the number of cycles where the vL1D is stalled waiting
-      to issue a request for data to the L2 cache divided by the number of cycles
-      where the vL1D is active.
-    VL1_L2 Rd: The number of read requests for a vL1D cache line that were not satisfied
-      by the vL1D and must be retrieved from the to the L2 Cache per normalization
-      unit.
-    VL1_L2 Wr: The number of write requests to a vL1D cache line that were sent through
-      the vL1D to the L2 cache, per normalization unit.
-    VL1_L2 Atomic: The number of atomic requests that are sent through the vL1D to
-      the L2 cache, per normalization unit. This includes requests for atomics with,
-      and without return.
-    sL1D Rd: The total number of requests, of any size or type, made to the sL1D per
-      normalization unit.
-    sL1D Hit: The total number of sL1D requests that hit on a previously loaded cache
-      line, per normalization unit.
-    sL1D_L2 Rd: The total number of read requests from sL1D to the L2, per normalization
-      unit.
-    sL1D_L2 Wr: The total number of write requests from sL1D to the L2, per normalization
-      unit. Typically unused on current CDNA accelerators.
-    sL1D_L2 Atomic: The total number of atomic requests from sL1D to the L2, per normalization
-      unit. Typically unused on current CDNA accelerators.
-    IL1 Fetch: The total number of requests made to the L1I per normalization-unit.
-    IL1 Hit: The percent of L1I requests that hit on a previously loaded line the
-      cache. Calculated as the ratio of the number of L1I requests that hit over the
-      number of all L1I requests.
-    IL1 Lat: The average number of cycles spent to fetch instructions to a CU.
-    IL1_L2 Rd: The total number of requests across the L1I - L2 interface per normalization-unit.
-    L2 Rd: The total number of read requests to the L2 from all clients.
-    L2 Wr: The total number of write requests to the L2 from all clients.
-    L2 Atomic: The total number of atomic requests (with and without return) to the
-      L2 from all clients.
-    L2 Hit: The ratio of the number of L2 cache line requests that hit in the L2 cache
-      over the total number of incoming cache line requests to the L2 cache.
-    L2 Rd Lat: Calculated as the average number of cycles that the vL1D cache took
-      to issue and receive read requests from the L2 Cache. This number also includes
-      requests for atomics with return values.
-    L2 Wr Lat: Calculated as the average number of cycles that the vL1D cache took
-      to issue and receive acknowledgement of a write request to the L2 Cache. This
-      number also includes requests for atomics without return values.
-    Fabric_L2 Rd: Number of L2 cache - Infinity Fabric read requests (either 32-byte
-      or 64-byte) summed over TCC instances per normalization unit.
-    Fabric_L2 Wr: Number of L2 cache - Infinity Fabric write requests (either 32-byte
-      or 64-byte) summed over TCC instances per normalization unit.
-    Fabric_L2 Atomic: Number of L2 cache - Infinity Fabric write requests (either
-      32-byte or 64-byte) that are actually atomic requests summed over TCC instances
-      per normalization unit.
-    Fabric Rd Lat: The time-averaged number of cycles read requests spent in Infinity
-      Fabric before data was returned to the L2.
-    Fabric Wr Lat: The time-averaged number of cycles write requests spent in Infinity
-      Fabric before a completion acknowledgement was returned to the L2.
-    Fabric Atomic Lat: The time-averaged number of cycles atomic requests spent in
-      Infinity Fabric before a completion acknowledgement (atomic without return value)
-      or data (atomic with return value) was returned to the L2.
-    HBM Rd: The total number of L2 requests to Infinity Fabric to read 32B or 64B
-      of data from the accelerator's local HBM, per normalization unit.
-    HBM Wr: 'The total number of L2 requests to Infinity Fabric to write or atomically
-      update 32B or 64B of data in the accelerator''s local HBM, per normalization
-      unit. '
   data source:
   - metric_table:
       id: 301
@@ -254,13 +138,13 @@ Panel Config:
           value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0)
         Fabric Rd Lat:
           value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-            != 0) else  0)), 0)
+            != 0) else 0)), 0)
         Fabric Wr Lat:
           value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-            != 0) else  0)), 0)
+            != 0) else 0)), 0)
         Fabric Atomic Lat:
           value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
-            != 0) else  0)), 0)
+            != 0) else 0)), 0)
         HBM Rd:
           value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0)
         HBM Wr:
@@ -268,3 +152,123 @@ Panel Config:
       comparable: false
       cli_style: mem_chart
       tui_style: mem_chart
+  metrics_description:
+    Wavefront Occupancy: Wavefronts per active CU.
+    Wave Life: Average number of cycles executing a wave.
+    SALU: Total Number of SALU (Scalar ALU) instructions issued per normalization
+      unit.
+    SMEM: Total number of SMEM (Scalar Memory Read) instructions issued per normalization
+      unit.
+    VALU: The number of VALU (Vector ALU) instructions issued per normalization unit.
+    MFMA: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued per
+      normalization unit.
+    VMEM: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch
+      memory) per normalization unit.
+    LDS: The total number of LDS instructions (including, but not limited to, read/write/atomics
+      and HIP's __shfl instructions) executed per normalization unit.
+    GWS: Total number of GDS (global data sync) instructions issued per normalization
+      unit.
+    BR: Total number of BRANCH instructions issued per normalization unit.
+    Active CUs: Total number of active compute units (CUs) on the accelerator during
+      the kernel execution.
+    Num CUs: Total number of compute units (CUs) on the accelerator.
+    VGPR: |-
+      The number of architected vector general-purpose registers allocated
+      for the kernel, see VALU. Note: this may not exactly match the number of VGPRs
+      requested by the compiler due to allocation granularity.
+    SGPR: |-
+      The number of scalar general-purpose registers allocated for the kernel,
+      see SALU. Note: this may not exactly match the number of SGPRs requested by
+      the compiler due to allocation granularity.
+    LDS Allocation: |-
+      The number of bytes of LDS memory (or, shared memory) allocated for
+      this kernel. Note: This may also be larger than what was requested at compile
+      time due to both allocation granularity and dynamic per-dispatch LDS allocations.
+    Scratch Allocation: The number of bytes of scratch memory requested per work-item
+      for this kernel. Scratch memory is used for stack memory on the accelerator,
+      as well as for register spills and restores.
+    Wavefronts: The total number of wavefronts, summed over all workgroups, forming
+      this kernel launch.
+    Workgroups: The total number of workgroups forming this kernel launch.
+    LDS Req: The total number of LDS instructions (including, but not limited to,
+      read/write/atomics and HIP's __shfl instructions) executed per normalization
+      unit.
+    LDS Util: Indicates what percent of the kernel's duration the LDS was actively
+      executing instructions (including, but not limited to, load, store, atomic and
+      HIP's __shfl operations). Calculated as the ratio of the total number of cycles
+      LDS was active over the total CU cycles.
+    LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return
+      / acknowledgment) required for an LDS instruction to complete.
+    VL1 Rd: The total number of incoming read requests from the address processing
+      unit after coalescing per normalization unit
+    VL1 Wr: The total number of incoming write requests from the address processing
+      unit after coalescing per normalization unit
+    VL1 Atomic: The total number of incoming atomic requests from the address processing
+      unit after coalescing per normalization unit
+    VL1 Hit: The ratio of the number of vL1D cache line requests that hit in vL1D
+      cache over the total number of cache line requests to the vL1D Cache RAM.
+    VL1 Lat: Calculated as the average number of cycles that a vL1D cache line request
+      spent in the vL1D cache pipeline.
+    VL1 Coalesce: Indicates how well memory instructions were coalesced by the address
+      processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated
+      as the average number of thread-requests generated per instruction divided by
+      the ideal number of thread-requests per instruction.
+    VL1 Stall: The ratio of the number of cycles where the vL1D is stalled waiting
+      to issue a request for data to the L2 cache divided by the number of cycles
+      where the vL1D is active.
+    VL1_L2 Rd: The number of read requests for a vL1D cache line that were not satisfied
+      by the vL1D and must be retrieved from the L2 Cache, per normalization
+      unit.
+    VL1_L2 Wr: The number of write requests to a vL1D cache line that were sent through
+      the vL1D to the L2 cache, per normalization unit.
+    VL1_L2 Atomic: The number of atomic requests that are sent through the vL1D to
+      the L2 cache, per normalization unit. This includes requests for atomics with,
+      and without return.
+    sL1D Rd: The total number of requests, of any size or type, made to the sL1D per
+      normalization unit.
+    sL1D Hit: The total number of sL1D requests that hit on a previously loaded cache
+      line, per normalization unit.
+    sL1D_L2 Rd: The total number of read requests from sL1D to the L2, per normalization
+      unit.
+    sL1D_L2 Wr: The total number of write requests from sL1D to the L2, per normalization
+      unit. Typically unused on current CDNA accelerators.
+    sL1D_L2 Atomic: The total number of atomic requests from sL1D to the L2, per normalization
+      unit. Typically unused on current CDNA accelerators.
+    IL1 Fetch: The total number of requests made to the L1I per normalization-unit.
+    IL1 Hit: The percent of L1I requests that hit on a previously loaded line in the
+      cache. Calculated as the ratio of the number of L1I requests that hit over the
+      number of all L1I requests.
+    IL1 Lat: The average number of cycles spent to fetch instructions to a CU.
+    IL1_L2 Rd: The total number of requests across the L1I - L2 interface per normalization-unit.
+    L2 Rd: The total number of read requests to the L2 from all clients.
+    L2 Wr: The total number of write requests to the L2 from all clients.
+    L2 Atomic: The total number of atomic requests (with and without return) to the
+      L2 from all clients.
+    L2 Hit: The ratio of the number of L2 cache line requests that hit in the L2 cache
+      over the total number of incoming cache line requests to the L2 cache.
+    L2 Rd Lat: Calculated as the average number of cycles that the vL1D cache took
+      to issue and receive read requests from the L2 Cache. This number also includes
+      requests for atomics with return values.
+    L2 Wr Lat: Calculated as the average number of cycles that the vL1D cache took
+      to issue and receive acknowledgement of a write request to the L2 Cache. This
+      number also includes requests for atomics without return values.
+    Fabric_L2 Rd: Number of L2 cache - Infinity Fabric read requests (either 32-byte
+      or 64-byte) summed over TCC instances per normalization unit.
+    Fabric_L2 Wr: Number of L2 cache - Infinity Fabric write requests (either 32-byte
+      or 64-byte) summed over TCC instances per normalization unit.
+    Fabric_L2 Atomic: Number of L2 cache - Infinity Fabric write requests (either
+      32-byte or 64-byte) that are actually atomic requests summed over TCC instances
+      per normalization unit.
+    Fabric Rd Lat: The time-averaged number of cycles read requests spent in Infinity
+      Fabric before data was returned to the L2.
+    Fabric Wr Lat: The time-averaged number of cycles write requests spent in Infinity
+      Fabric before a completion acknowledgement was returned to the L2.
+    Fabric Atomic Lat: The time-averaged number of cycles atomic requests spent in
+      Infinity Fabric before a completion acknowledgement (atomic without return value)
+      or data (atomic with return value) was returned to the L2.
+    HBM Rd: The total number of L2 requests to Infinity Fabric to read 32B or 64B
+      of data from the accelerator's local HBM, per normalization unit.
+    HBM Wr: |-
+      The total number of L2 requests to Infinity Fabric to write or atomically
+      update 32B or 64B of data in the accelerator's local HBM, per normalization
+      unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml
index 9829a8b37b..0a4b3be693 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml
@@ -2,85 +2,6 @@
 Panel Config:
   id: 400
   title: Roofline
-  metrics_description:
-    VALU FLOPs (F16): 'The total 16-bit floating-point operations executed per second
-      on the VALU. This is presented with the value of the peak empirical F16 FLOPs
-      achievable on the specific accelerator. Note: this does not include any F16
-      operations from MFMA instructions.'
-    VALU FLOPs (F32): 'The total 32-bit floating-point operations executed per second
-      on the VALU. This is presented with the value of the peak empirical F32 FLOPs
-      achievable on the specific accelerator. Note: this does not include any F32
-      operations from MFMA instructions.'
-    VALU FLOPs (F64): 'The total 64-bit floating-point operations executed per second
-      on the VALU. This is presented with the value of the peak empirical F64 FLOPs
-      achievable on the specific accelerator. Note: this does not include any F64
-      operations from MFMA instructions.'
-    MFMA FLOPs (F8): The total number of 8-bit brain floating point MFMA operations
-      executed per second. This does not include any 16-bit brain floating point operations
-      from VALU instructions. The peak empirically measured F8 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison. It is supported
-      on AMD Instinct MI300 series and later only.
-    MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations
-      executed per second. Note: this does not include any 16-bit brain floating point
-      operations from VALU instructions. The peak empirically measured BF16 MFMA operations
-      achievable on the specific accelerator is displayed alongside for comparison.'
-    MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed
-      per second. Note: this does not include any 16-bit floating point operations
-      from VALU instructions. The peak empirically measured F16 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison.'
-    MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed
-      per second. Note: this does not include any 32-bit floating point operations
-      from VALU instructions. The peak empirically measured F32 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison.'
-    MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed
-      per second. Note: this does not include any 64-bit floating point operations
-      from VALU instructions. The peak empirically measured F64 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison.'
-    MFMA FLOPs (F6F4): 'The total number of 4-bit and 6-bit floating point MFMA operations
-      executed per second. Note: this does not include any floating point operations
-      from VALU instructions. The peak empirically measured F6F4 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison. It is supported
-      on AMD Instinct MI350 series (gfx950) and later only.'
-    MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed
-      per second. Note: this does not include any 8-bit integer operations from VALU
-      instructions. The peak empirically measured INT8 MFMA operations achievable
-      on the specific accelerator is displayed alongside for comparison.'
-    HBM Bandwidth: The total number of bytes read from and written to High-Bandwidth
-      Memory (HBM) per second. The peak empirically measured bandwidth achievable
-      on the specific accelerator is displayed alongside for comparison.
-    L2 Cache Bandwidth: The number of bytes looked up in the L2 cache per unit time.
-      The number of bytes is calculated as the number of cache lines requested multiplied
-      by the cache line size. This value does not consider partial requests, so e.g.,
-      if only a single value is requested in a cache line, the data movement will
-      still be counted as a full cache line. The peak empirically measured bandwidth
-      achievable on the specific accelerator is displayed alongside for comparison.
-    L1 Cache Bandwidth: The number of bytes looked up in the vL1D cache as a result
-      of VMEM instructions per unit time. The number of bytes is calculated as the
-      number of cache lines requested multiplied by the cache line size. This value
-      does not consider partial requests, so e.g., if only a single value is requested
-      in a cache line, the data movement will still be counted as a full cache line.
-      The peak empirically measured bandwidth achievable on the specific accelerator
-      is displayed alongside for comparison.
-    LDS Bandwidth: Indicates the maximum amount of bytes that could have been loaded
-      from, stored to, or atomically updated in the LDS per unit time (see LDS Bandwidth
-      example for more detail). The peak empirically measured LDS bandwidth achievable
-      on the specific accelerator is displayed alongside for comparison.
-    AI L1: The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
-      of total floating-point operations (FLOPs) to total bytes transferred between
-      the L1 cache and the processing units. This value is used as the x-coordinate
-      for the L1 roofline.
-    AI L2: The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
-      of total floating-point operations (FLOPs) to total bytes transferred between
-      the L2 cache and the L1 cache. This value is used as the x-coordinate for the
-      L2 roofline.
-    AI HBM: The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
-      It is the ratio of total floating-point operations (FLOPs) to total bytes transferred
-      between HBM and the L2 cache. This value is used as the x-coordinate for the
-      HBM roofline.
-    Performance (GFLOPs): The overall achieved performance, measured in GigaFLOPs
-      per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
-      operations divided by the total execution time. This value is used as the y-coordinate
-      for the kernel's point on the Roofline plot.
   data source:
   - metric_table:
       id: 401
@@ -226,3 +147,97 @@ Panel Config:
             * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) ) / (SUM(End_Timestamp -
             Start_Timestamp) / 1e9) ) / 1e9
           unit: GFLOP/s
+  metrics_description:
+    VALU FLOPs (F16): |-
+      The total 16-bit floating-point operations executed per second on the VALU.
+      This is presented with the value of the peak empirical F16 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F16 operations
+      from MFMA instructions.
+    VALU FLOPs (F32): |-
+      The total 32-bit floating-point operations executed per second on the VALU.
+      This is presented with the value of the peak empirical F32 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F32 operations
+      from MFMA instructions.
+    VALU FLOPs (F64): |-
+      The total 64-bit floating-point operations executed per second on the VALU.
+      This is presented with the value of the peak empirical F64 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F64 operations
+      from MFMA instructions.
+    MFMA FLOPs (F8): The total number of 8-bit brain floating point MFMA operations
+      executed per second. This does not include any 8-bit brain floating point operations
+      from VALU instructions. The peak empirically measured F8 MFMA operations achievable
+      on the specific accelerator is displayed alongside for comparison. It is supported
+      on AMD Instinct MI300 series and later only.
+    MFMA FLOPs (BF16): |-
+      The total number of 16-bit brain floating point MFMA operations executed
+      per second. Note: this does not include any 16-bit brain floating point
+      operations from VALU instructions. The peak empirically measured BF16 MFMA
+      operations achievable on the specific accelerator is displayed alongside
+      for comparison.
+    MFMA FLOPs (F16): |-
+      The total number of 16-bit floating point MFMA operations executed per
+      second. Note: this does not include any 16-bit floating point operations from
+      VALU instructions. The peak empirically measured F16 MFMA operations
+      achievable on the specific accelerator is displayed alongside for comparison.
+    MFMA FLOPs (F32): |-
+      The total number of 32-bit floating point MFMA operations executed per
+      second. Note: this does not include any 32-bit floating point operations from
+      VALU instructions. The peak empirically measured F32 MFMA operations
+      achievable on the specific accelerator is displayed alongside for comparison.
+    MFMA FLOPs (F64): |-
+      The total number of 64-bit floating point MFMA operations executed per
+      second. Note: this does not include any 64-bit floating point operations from
+      VALU instructions. The peak empirically measured F64 MFMA operations
+      achievable on the specific accelerator is displayed alongside for comparison.
+    MFMA FLOPs (F6F4): |-
+      The total number of 4-bit and 6-bit floating point MFMA operations executed
+      per second. Note: this does not include any floating point operations from
+      VALU instructions. The peak empirically measured F6F4 MFMA operations
+      achievable on the specific accelerator is displayed alongside for comparison.
+      It is supported on AMD Instinct MI350 series (gfx950) and later only.
+    MFMA IOPs (Int8): |-
+      The total number of 8-bit integer MFMA operations executed per second.
+      Note: this does not include any 8-bit integer operations from VALU instructions.
+      The peak empirically measured INT8 MFMA operations achievable on the specific
+      accelerator is displayed alongside for comparison.
+    HBM Bandwidth: |-
+      The total number of bytes read from and written to High-Bandwidth
+      Memory (HBM) per second. The peak empirically measured bandwidth achievable
+      on the specific accelerator is displayed alongside for comparison.
+    L2 Cache Bandwidth: The number of bytes looked up in the L2 cache per unit time.
+      The number of bytes is calculated as the number of cache lines requested multiplied
+      by the cache line size. This value does not consider partial requests, so e.g.,
+      if only a single value is requested in a cache line, the data movement will
+      still be counted as a full cache line. The peak empirically measured bandwidth
+      achievable on the specific accelerator is displayed alongside for comparison.
+    L1 Cache Bandwidth: The number of bytes looked up in the vL1D cache as a result
+      of VMEM instructions per unit time. The number of bytes is calculated as the
+      number of cache lines requested multiplied by the cache line size. This value
+      does not consider partial requests, so e.g., if only a single value is requested
+      in a cache line, the data movement will still be counted as a full cache line.
+      The peak empirically measured bandwidth achievable on the specific accelerator
+      is displayed alongside for comparison.
+    LDS Bandwidth: Indicates the maximum amount of bytes that could have been loaded
+      from, stored to, or atomically updated in the LDS per unit time (see LDS Bandwidth
+      example for more detail). The peak empirically measured LDS bandwidth achievable
+      on the specific accelerator is displayed alongside for comparison.
+    AI L1: |-
+      The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
+      of total floating-point operations (FLOPs) to total bytes transferred between
+      the L1 cache and the processing units. This value is used as the x-coordinate
+      for the L1 roofline.
+    AI L2: |-
+      The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
+      of total floating-point operations (FLOPs) to total bytes transferred between
+      the L2 cache and the L1 cache. This value is used as the x-coordinate for
+      the L2 roofline.
+    AI HBM: |-
+      The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
+      It is the ratio of total floating-point operations (FLOPs) to total bytes
+      transferred between HBM and the L2 cache. This value is used as the x-coordinate
+      for the HBM roofline.
+    Performance (GFLOPs): |-
+      The overall achieved performance, measured in GigaFLOPs
+      per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
+      operations divided by the total execution time. This value is used as the y-coordinate
+      for the kernel's point on the Roofline plot.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0500_command_processor_cpc_cpf.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0500_command_processor_cpc_cpf.yaml
index 722514277c..58699ebb18 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0500_command_processor_cpc_cpf.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0500_command_processor_cpc_cpf.yaml
@@ -2,30 +2,6 @@
 Panel Config:
   id: 500
   title: Command Processor (CPC/CPF)
-  metrics_description:
-    CPF Utilization: Percent of total cycles where the CPF was busy actively doing
-      any work. The ratio of CPF busy cycles over total cycles counted by the CPF.
-    CPF Stall: Percent of CPF busy cycles where the CPF was stalled for any reason.
-    CPF-L2 Utilization: Percent of total cycles counted by the CPF-L2 interface where
-      the CPF-L2 interface was active doing any work. The ratio of CPF-L2 busy cycles
-      over total cycles counted by the CPF-L2.
-    CPF-L2 Stall: Percent of CPF-L2 L2 busy cycles where the CPF-L2 interface was
-      stalled for any reason.
-    CPF-UTCL1 Stall: Percent of CPF busy cycles where the CPF was stalled by address
-      translation.
-    CPC Utilization: Percent of total cycles where the CPC was busy actively doing
-      any work. The ratio of CPC busy cycles over total cycles counted by the CPC.
-    CPC Stall Rate: Percent of CPC busy cycles where the CPC was stalled for any reason.
-    CPC Packet Decoding Utilization: Percent of CPC busy cycles spent decoding commands
-      for processing.
-    CPC-Workgroup Manager Utilization: Percent of CPC busy cycles spent dispatching
-      workgroups to the workgroup manager.
-    CPC-L2 Utilization: Percent of total cycles counted by the CPC-L2 interface where
-      the CPC-L2 interface was active doing any work.
-    CPC-UTCL1 Stall: Percent of CPC busy cycles where the CPC was stalled by address
-      translation
-    CPC-UTCL2 Utilization: 'Percent of total cycles counted by the CPC''s L2 address
-      translation interface where the CPC was busy doing address translation work.  '
   data source:
   - metric_table:
       id: 501
@@ -164,3 +140,28 @@ Panel Config:
           max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
             if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
           unit: pct
+  metrics_description:
+    CPF Utilization: Percent of total cycles where the CPF was busy actively doing
+      any work. The ratio of CPF busy cycles over total cycles counted by the CPF.
+    CPF Stall: Percent of CPF busy cycles where the CPF was stalled for any reason.
+    CPF-L2 Utilization: Percent of total cycles counted by the CPF-L2 interface where
+      the CPF-L2 interface was active doing any work. The ratio of CPF-L2 busy cycles
+      over total cycles counted by the CPF-L2.
+    CPF-L2 Stall: Percent of CPF-L2 busy cycles where the CPF-L2 interface was
+      stalled for any reason.
+    CPF-UTCL1 Stall: Percent of CPF busy cycles where the CPF was stalled by address
+      translation.
+    CPC Utilization: Percent of total cycles where the CPC was busy actively doing
+      any work. The ratio of CPC busy cycles over total cycles counted by the CPC.
+    CPC Stall Rate: Percent of CPC busy cycles where the CPC was stalled for any reason.
+    CPC Packet Decoding Utilization: Percent of CPC busy cycles spent decoding commands
+      for processing.
+    CPC-Workgroup Manager Utilization: Percent of CPC busy cycles spent dispatching
+      workgroups to the workgroup manager.
+    CPC-L2 Utilization: Percent of total cycles counted by the CPC-L2 interface where
+      the CPC-L2 interface was active doing any work.
+    CPC-UTCL1 Stall: Percent of CPC busy cycles where the CPC was stalled by address
+      translation.
+    CPC-UTCL2 Utilization: |-
+      Percent of total cycles counted by the CPC's L2 address translation
+      interface where the CPC was busy doing address translation work.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0600_workgroup_manager_spi.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0600_workgroup_manager_spi.yaml
index c32f4ded90..02ed4b3d9c 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0600_workgroup_manager_spi.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0600_workgroup_manager_spi.yaml
@@ -2,61 +2,6 @@
 Panel Config:
   id: 600
   title: Workgroup Manager (SPI)
-  metrics_description:
-    Accelerator Utilization: The percent of cycles in the kernel where the accelerator
-      was actively doing any work.
-    Scheduler-Pipe Utilization: The percent of total scheduler-pipe cycles in the
-      kernel where the scheduler-pipes were actively doing any work.
-    Workgroup Manager Utilization: The percent of cycles in the kernel where the workgroup
-      manager was actively doing any work.
-    Shader Engine Utilization: The percent of total shader engine cycles in the kernel
-      where any CU in a shader-engine was actively doing any work, normalized over
-      all shader-engines. Low values (e.g., << 100%) indicate that the accelerator
-      was not fully saturated by the kernel, or a potential load-imbalance issue.
-    SIMD Utilization: The percent of total SIMD cycles in the kernel where any SIMD
-      on a CU was actively doing any work, summed over all CUs. Low values (less than
-      100%) indicate that the accelerator was not fully saturated by the kernel, or
-      a potential load-imbalance issue.
-    Dispatched Workgroups: The total number of workgroups forming this kernel launch.
-    Dispatched Wavefronts: The total number of wavefronts, summed over all workgroups,
-      forming this kernel launch.
-    VGPR Writes: The average number of cycles spent initializing VGPRs at wave creation.
-    SGPR Writes: The average number of cycles spent initializing SGPRs at wave creation.
-    Not-scheduled Rate (Workgroup Manager): The percent of total scheduler-pipe cycles
-      in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck
-      within the workgroup manager rather than a lack of a CU or SIMD with sufficient
-      resources.
-    Not-scheduled Rate (Scheduler-Pipe): 'The percent of total scheduler-pipe cycles
-      in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck
-      within the scheduler-pipes rather than a lack of a CU or SIMD with sufficient
-      resources. '
-    Scheduler-Pipe Stall Rate: The percent of total scheduler-pipe cycles in the kernel
-      where a workgroup could not be scheduled to a CU due to occupancy limitations
-      (like a lack of a CU or SIMD with sufficient resources).
-    Scratch Stall Rate: The percent of total shader-engine cycles in the kernel where
-      a workgroup could not be scheduled to a CU due to lack of private (a.k.a., scratch)
-      memory slots. While this can reach up to 100%, note that the actual occupancy
-      limitations on a kernel using private memory are typically quite small (for
-      example, less than 1% of the total number of waves that can be scheduled to
-      an accelerator).
-    Insufficient SIMD Waveslots: The percent of total SIMD cycles in the kernel where
-      a workgroup could not be scheduled to a SIMD due to lack of available waveslots.
-    Insufficient SIMD VGPRs: The percent of total SIMD cycles in the kernel where
-      a workgroup could not be scheduled to a SIMD due to lack of available VGPRs.
-    Insufficient SIMD SGPRs: The percent of total SIMD cycles in the kernel where
-      a workgroup could not be scheduled to a SIMD due to lack of available SGPRs.
-    Insufficient CU LDS: The percent of total CU cycles in the kernel where a workgroup
-      could not be scheduled to a CU due to lack of available LDS.
-    Insufficient CU Barriers: The percent of total CU cycles in the kernel where a
-      workgroup could not be scheduled to a CU due to lack of available barriers.
-    Reached CU Workgroup Limit: The percent of total CU cycles in the kernel where
-      a workgroup could not be scheduled to a CU due to limits within the workgroup
-      manager. This is expected to be always be zero on CDNA2 or newer accelerators
-      (and small for previous accelerators).
-    Reached CU Wavefront Limit: The percent of total CU cycles in the kernel where
-      a wavefront could not be scheduled to a CU due to limits within the workgroup
-      manager. This is expected to be always be zero on CDNA2 or newer accelerators
-      (and small for previous accelerators).
   data source:
   - metric_table:
       id: 601
@@ -235,3 +180,58 @@ Panel Config:
           min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
           max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
           unit: Pct
+  metrics_description:
+    Accelerator Utilization: The percent of cycles in the kernel where the accelerator
+      was actively doing any work.
+    Scheduler-Pipe Utilization: The percent of total scheduler-pipe cycles in the
+      kernel where the scheduler-pipes were actively doing any work.
+    Workgroup Manager Utilization: The percent of cycles in the kernel where the workgroup
+      manager was actively doing any work.
+    Shader Engine Utilization: The percent of total shader engine cycles in the kernel
+      where any CU in a shader-engine was actively doing any work, normalized over
+      all shader-engines. Low values (e.g., << 100%) indicate that the accelerator
+      was not fully saturated by the kernel, or a potential load-imbalance issue.
+    SIMD Utilization: The percent of total SIMD cycles in the kernel where any SIMD
+      on a CU was actively doing any work, summed over all CUs. Low values (less than
+      100%) indicate that the accelerator was not fully saturated by the kernel, or
+      a potential load-imbalance issue.
+    Dispatched Workgroups: The total number of workgroups forming this kernel launch.
+    Dispatched Wavefronts: The total number of wavefronts, summed over all workgroups,
+      forming this kernel launch.
+    VGPR Writes: The average number of cycles spent initializing VGPRs at wave creation.
+    SGPR Writes: The average number of cycles spent initializing SGPRs at wave creation.
+    Not-scheduled Rate (Workgroup Manager): The percent of total scheduler-pipe cycles
+      in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck
+      within the workgroup manager rather than a lack of a CU or SIMD with sufficient
+      resources.
+    Not-scheduled Rate (Scheduler-Pipe): |-
+      The percent of total scheduler-pipe cycles in the kernel where a workgroup
+      could not be scheduled to a CU due to a bottleneck within the scheduler-pipes
+      rather than a lack of a CU or SIMD with sufficient resources.
+    Scheduler-Pipe Stall Rate: The percent of total scheduler-pipe cycles in the kernel
+      where a workgroup could not be scheduled to a CU due to occupancy limitations
+      (like a lack of a CU or SIMD with sufficient resources).
+    Scratch Stall Rate: The percent of total shader-engine cycles in the kernel where
+      a workgroup could not be scheduled to a CU due to lack of private (a.k.a., scratch)
+      memory slots. While this can reach up to 100%, note that the actual occupancy
+      limitations on a kernel using private memory are typically quite small (for
+      example, less than 1% of the total number of waves that can be scheduled to
+      an accelerator).
+    Insufficient SIMD Waveslots: The percent of total SIMD cycles in the kernel where
+      a workgroup could not be scheduled to a SIMD due to lack of available waveslots.
+    Insufficient SIMD VGPRs: The percent of total SIMD cycles in the kernel where
+      a workgroup could not be scheduled to a SIMD due to lack of available VGPRs.
+    Insufficient SIMD SGPRs: The percent of total SIMD cycles in the kernel where
+      a workgroup could not be scheduled to a SIMD due to lack of available SGPRs.
+    Insufficient CU LDS: The percent of total CU cycles in the kernel where a workgroup
+      could not be scheduled to a CU due to lack of available LDS.
+    Insufficient CU Barriers: The percent of total CU cycles in the kernel where a
+      workgroup could not be scheduled to a CU due to lack of available barriers.
+    Reached CU Workgroup Limit: The percent of total CU cycles in the kernel where
+      a workgroup could not be scheduled to a CU due to limits within the workgroup
+      manager. This is expected to always be zero on CDNA2 or newer accelerators
+      (and small for previous accelerators).
+    Reached CU Wavefront Limit: The percent of total CU cycles in the kernel where
+      a wavefront could not be scheduled to a CU due to limits within the workgroup
+      manager. This is expected to always be zero on CDNA2 or newer accelerators
+      (and small for previous accelerators).
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0700_wavefront.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0700_wavefront.yaml
index 25679c6207..bd6ca38642 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0700_wavefront.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0700_wavefront.yaml
@@ -2,63 +2,6 @@
 Panel Config:
   id: 700
   title: Wavefront
-  metrics_description:
-    Grid Size: The total number of work-items (or, threads) launched as a part of
-      the kernel dispatch. In HIP, this is equivalent to the total grid size multiplied
-      by the total workgroup (or, block) size.
-    Workgroup Size: The total number of work-items (or, threads) in each workgroup
-      (or, block) launched as part of the kernel dispatch. In HIP, this is equivalent
-      to the total block size.
-    Total Wavefronts: "The total number of wavefronts launched as part of the kernel\
-      \ dispatch. On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs,\
-      \ the wavefront size is always 64 work-items. Thus, the total number of wavefronts\
-      \ should be equivalent to the ceiling of grid size divided by 64."
-    Saved Wavefronts: The total number of wavefronts saved at a context-save.
-    Restored Wavefronts: The total number of wavefronts restored from a context-save.
-    VGPRs: 'The number of architected vector general-purpose registers allocated for
-      the kernel, see VALU. Note: this may not exactly match the number of VGPRs requested
-      by the compiler due to allocation granularity.'
-    AGPRs: 'The number of accumulation vector general-purpose registers allocated
-      for the kernel, see AGPRs. Note: this may not exactly match the number of AGPRs
-      requested by the compiler due to allocation granularity.'
-    SGPRs: 'The number of scalar general-purpose registers allocated for the kernel,
-      see SALU. Note: this may not exactly match the number of SGPRs requested by
-      the compiler due to allocation granularity.'
-    LDS Allocation: 'The number of bytes of LDS memory (or, shared memory) allocated
-      for this kernel. Note: This may also be larger than what was requested at compile
-      time due to both allocation granularity and dynamic per-dispatch LDS allocations.'
-    Scratch Allocation: The number of bytes of scratch memory requested per work-item
-      for this kernel. Scratch memory is used for stack memory on the accelerator,
-      as well as for register spills and restores.
-    Kernel Time: The total duration of the executed kernel.
-    Kernel Time (Cycles): The total duration of the executed kernel in cycles.
-    Instructions per wavefront: The average number of instructions (of all types)
-      executed per wavefront. This is averaged over all wavefronts in a kernel dispatch.
-    Wave Cycles: The number of cycles a wavefront in the kernel dispatch spent resident
-      on a compute unit per normalization unit. This is averaged over all wavefronts
-      in a kernel dispatch.
-    Dependency Wait Cycles: The number of cycles a wavefront in the kernel dispatch
-      spent resident on a compute unit per normalization unit. This is averaged over
-      all wavefronts in a kernel dispatch.
-    Issue Wait Cycles: The number of cycles a wavefront in the kernel dispatch was
-      unable to issue an instruction for any reason (e.g., execution pipe back-pressure,
-      arbitration loss, etc.) per normalization unit. This counter is incremented
-      at every cycle by all wavefronts on a CU unable to issue an instruction. As
-      such, it is most useful to get a sense of how waves were spending their time,
-      rather than identification of a precise limiter because another wave could be
-      actively executing while a wave is issue stalled. The sum of this metric, Dependency
-      Wait Cycles and Active Cycles should be equal to the total Wave Cycles metric.
-    Active Cycles: The average number of cycles a wavefront in the kernel dispatch
-      was actively executing instructions per normalization unit. This measurement
-      is made on a per-wavefront basis, and may include cycles that another wavefront
-      spent actively executing (on another execution unit, for example) or was stalled.
-      As such, it is most useful to get a sense of how waves were spending their time,
-      rather than identification of a precise limiter. The sum of this metric, Issue
-      Wait Cycles and Active Wait Cycles should be equal to the total Wave Cycles
-      metric.
-    Wavefront Occupancy: 'The time-averaged number of wavefronts resident on the accelerator
-      over the lifetime of the kernel. Note: this metric may be inaccurate for short-running
-      kernels (less than 1ms).'
   data source:
   - metric_table:
       id: 701
@@ -171,3 +114,66 @@ Panel Config:
           max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
           unit: Wavefronts
           coll_level: SQ_LEVEL_WAVES
+  metrics_description:
+    Grid Size: The total number of work-items (or, threads) launched as a part of
+      the kernel dispatch. In HIP, this is equivalent to the total grid size multiplied
+      by the total workgroup (or, block) size.
+    Workgroup Size: The total number of work-items (or, threads) in each workgroup
+      (or, block) launched as part of the kernel dispatch. In HIP, this is equivalent
+      to the total block size.
+    Total Wavefronts: |-
+      The total number of wavefronts launched as part of the kernel dispatch.
+      On AMD Instinct™ CDNA™ accelerators and GCN™ GPUs, the wavefront
+      size is always 64 work-items. Thus, the total number of wavefronts should
+      be equivalent to the ceiling of grid size divided by 64.
+    Saved Wavefronts: The total number of wavefronts saved at a context-save.
+    Restored Wavefronts: The total number of wavefronts restored from a context-save.
+    VGPRs: |-
+      The number of architected vector general-purpose registers allocated
+      for the kernel, see VALU. Note: this may not exactly match the number of VGPRs
+      requested by the compiler due to allocation granularity.
+    AGPRs: |-
+      The number of accumulation vector general-purpose registers allocated
+      for the kernel, see AGPRs. Note: this may not exactly match the number of
+      AGPRs requested by the compiler due to allocation granularity.
+    SGPRs: |-
+      The number of scalar general-purpose registers allocated for the kernel,
+      see SALU. Note: this may not exactly match the number of SGPRs requested by
+      the compiler due to allocation granularity.
+    LDS Allocation: |-
+      The number of bytes of LDS memory (or, shared memory) allocated for
+      this kernel. Note: This may also be larger than what was requested at compile
+      time due to both allocation granularity and dynamic per-dispatch LDS allocations.
+    Scratch Allocation: The number of bytes of scratch memory requested per work-item
+      for this kernel. Scratch memory is used for stack memory on the accelerator,
+      as well as for register spills and restores.
+    Kernel Time: The total duration of the executed kernel.
+    Kernel Time (Cycles): The total duration of the executed kernel in cycles.
+    Instructions per wavefront: The average number of instructions (of all types)
+      executed per wavefront. This is averaged over all wavefronts in a kernel dispatch.
+    Wave Cycles: The number of cycles a wavefront in the kernel dispatch spent resident
+      on a compute unit per normalization unit. This is averaged over all wavefronts
+      in a kernel dispatch.
+    Dependency Wait Cycles: The number of cycles a wavefront in the kernel dispatch
+      was stalled waiting on memory of any kind (e.g., instruction fetch, vector or scalar
+      memory) per normalization unit. This is averaged over all wavefronts in a kernel dispatch.
+    Issue Wait Cycles: The number of cycles a wavefront in the kernel dispatch was
+      unable to issue an instruction for any reason (e.g., execution pipe back-pressure,
+      arbitration loss, etc.) per normalization unit. This counter is incremented
+      at every cycle by all wavefronts on a CU unable to issue an instruction. As
+      such, it is most useful to get a sense of how waves were spending their time,
+      rather than identification of a precise limiter because another wave could be
+      actively executing while a wave is issue stalled. The sum of this metric, Dependency
+      Wait Cycles and Active Cycles should be equal to the total Wave Cycles metric.
+    Active Cycles: The average number of cycles a wavefront in the kernel dispatch
+      was actively executing instructions per normalization unit. This measurement
+      is made on a per-wavefront basis, and may include cycles that another wavefront
+      spent actively executing (on another execution unit, for example) or was stalled.
+      As such, it is most useful to get a sense of how waves were spending their time,
+      rather than identification of a precise limiter. The sum of this metric, Issue
+      Wait Cycles and Dependency Wait Cycles should be equal to the total Wave Cycles
+      metric.
+    Wavefront Occupancy: |-
+      The time-averaged number of wavefronts resident on the accelerator over
+      the lifetime of the kernel. Note: this metric may be inaccurate for short-running
+      kernels (less than 1ms).
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1000_compute_units_instruction_mix.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1000_compute_units_instruction_mix.yaml
index 3a40d83f61..551dad2bc2 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1000_compute_units_instruction_mix.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1000_compute_units_instruction_mix.yaml
@@ -2,90 +2,6 @@
 Panel Config:
   id: 1000
   title: Compute Units - Instruction Mix
-  metrics_description:
-    VALU: The total number of vector arithmetic logic unit (VALU) operations issued.
-      These are the workhorses of the compute unit, and are used to execute a wide
-      range of instruction types including floating point operations, non-uniform
-      address calculations, transcendental operations, integer operations, shifts,
-      conditional evaluation, etc.
-    VMEM: The total number of vector memory operations issued. These include most
-      loads, stores and atomic operations and all accesses to generic, global, private
-      and texture memory.
-    LDS: The total number of LDS (also known as shared memory) operations issued.
-      These include loads, stores, atomics, and HIP's __shfl operations.
-    MFMA: The total number of matrix fused multiply-add instructions issued.
-    SALU: The total number of scalar arithmetic logic unit (SALU) operations issued.
-      Typically these are used for address calculations, literal constants, and other
-      operations that are provably uniform across a wavefront. Although scalar memory
-      (SMEM) operations are issued by the SALU, they are counted separately in this
-      section.
-    SMEM: The total number of scalar memory (SMEM) operations issued. These are typically
-      used for loading kernel arguments, base-pointers and loads from HIP's __constant__
-      memory.
-    Branch: The total number of branch operations issued. These typically consist
-      of jump or branch operations and are used to implement control flow.
-    INT32: The total number of instructions operating on 32-bit integer operands issued
-      to the VALU per normalization unit.
-    INT64: The total number of instructions operating on 64-bit integer operands issued
-      to the VALU per normalization unit.
-    F16-ADD: The total number of addition instructions operating on 16-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F16-MUL: The total number of multiplication instructions operating on 16-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F16-FMA: The total number of fused multiply-add instructions operating on 16-bit
-      floating-point operands issued to the VALU per normalization unit.
-    F16-Trans: The total number of transcendental instructions (e.g., sqrt) operating
-      on 16-bit floating-point operands issued to the VALU per normalization unit.
-    F32-ADD: The total number of addition instructions operating on 32-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F32-MUL: The total number of multiplication instructions operating on 32-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F32-FMA: The total number of fused multiply-add instructions operating on 32-bit
-      floating-point operands issued to the VALU per normalization unit.
-    F32-Trans: The total number of transcendental instructions (such as sqrt) operating
-      on 32-bit floating-point operands issued to the VALU per normalization unit.
-    F64-ADD: The total number of addition instructions operating on 64-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F64-MUL: The total number of multiplication instructions operating on 64-bit floating-point
-      operands issued to the VALU per normalization unit.
-    F64-FMA: The total number of fused multiply-add instructions operating on 64-bit
-      floating-point operands issued to the VALU per normalization unit.
-    F64-Trans: The total number of transcendental instructions (such as sqrt) operating
-      on 64-bit floating-point operands issued to the VALU per normalization unit.
-    Conversion: "The total number of type conversion instructions (such as converting\
-      \ data to or from F32\u2194F64) issued to the VALU per normalization unit."
-    Global/Generic Instr: The total number of global & generic memory instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Global/Generic Read: The total number of global & generic memory read instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Global/Generic Write: The total number of global & generic memory write instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Global/Generic Atomic: The total number of global & generic memory atomic (with
-      and without return) instructions executed on all compute units on the accelerator,
-      per normalization unit.
-    Spill/Stack Instr: The total number of spill/stack memory instructions executed
-      on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Read: The total number of spill/stack memory read instructions executed
-      on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Write: The total number of spill/stack memory write instructions executed
-      on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Atomic: The total number of spill/stack memory atomic (with and without
-      return) instructions executed on all compute units on the accelerator, per normalization
-      unit. Typically unused as these memory operations are typically used to implement
-      thread-local storage.
-    MFMA-I8: The total number of 8-bit integer MFMA instructions issued per normalization
-      unit.
-    MFMA-F8: The total number of 8-bit floating point MFMA instructions issued per
-      normalization unit. This is supported in AMD Instinct MI300 series and later
-      only.
-    MFMA-F16: The total number of 16-bit floating point MFMA instructions issued per
-      normalization unit.
-    MFMA-BF16: The total number of 16-bit brain floating point MFMA instructions issued
-      per normalization unit.
-    MFMA-F32: The total number of 32-bit floating-point MFMA instructions issued per
-      normalization unit.
-    MFMA-F64: The total number of 64-bit floating-point MFMA instructions issued per
-      normalization unit.
   data source:
   - metric_table:
       id: 1001
@@ -317,3 +233,88 @@ Panel Config:
           min: MIN((SQ_INSTS_VALU_MFMA_F6F4 / $denom))
           max: MAX((SQ_INSTS_VALU_MFMA_F6F4 / $denom))
           unit: (instr + $normUnit)
+  metrics_description:
+    VALU: The total number of vector arithmetic logic unit (VALU) operations issued.
+      These are the workhorses of the compute unit, and are used to execute a wide
+      range of instruction types including floating point operations, non-uniform
+      address calculations, transcendental operations, integer operations, shifts,
+      conditional evaluation, etc.
+    VMEM: The total number of vector memory operations issued. These include most
+      loads, stores and atomic operations and all accesses to generic, global, private
+      and texture memory.
+    LDS: The total number of LDS (also known as shared memory) operations issued.
+      These include loads, stores, atomics, and HIP's __shfl operations.
+    MFMA: The total number of matrix fused multiply-add instructions issued.
+    SALU: The total number of scalar arithmetic logic unit (SALU) operations issued.
+      Typically these are used for address calculations, literal constants, and other
+      operations that are provably uniform across a wavefront. Although scalar memory
+      (SMEM) operations are issued by the SALU, they are counted separately in this
+      section.
+    SMEM: The total number of scalar memory (SMEM) operations issued. These are typically
+      used for loading kernel arguments, base-pointers and loads from HIP's __constant__
+      memory.
+    Branch: The total number of branch operations issued. These typically consist
+      of jump or branch operations and are used to implement control flow.
+    INT32: The total number of instructions operating on 32-bit integer operands issued
+      to the VALU per normalization unit.
+    INT64: The total number of instructions operating on 64-bit integer operands issued
+      to the VALU per normalization unit.
+    F16-ADD: The total number of addition instructions operating on 16-bit floating-point
+      operands issued to the VALU per normalization unit.
+    F16-MUL: The total number of multiplication instructions operating on 16-bit floating-point
+      operands issued to the VALU per normalization unit.
+    F16-FMA: The total number of fused multiply-add instructions operating on 16-bit
+      floating-point operands issued to the VALU per normalization unit.
+    F16-Trans: The total number of transcendental instructions (e.g., sqrt) operating
+      on 16-bit floating-point operands issued to the VALU per normalization unit.
+    F32-ADD: The total number of addition instructions operating on 32-bit floating-point
+      operands issued to the VALU per normalization unit.
+    F32-MUL: The total number of multiplication instructions operating on 32-bit floating-point
+      operands issued to the VALU per normalization unit.
+    F32-FMA: The total number of fused multiply-add instructions operating on 32-bit
+      floating-point operands issued to the VALU per normalization unit.
+    F32-Trans: The total number of transcendental instructions (such as sqrt) operating
+      on 32-bit floating-point operands issued to the VALU per normalization unit.
+    F64-ADD: The total number of addition instructions operating on 64-bit floating-point
+      operands issued to the VALU per normalization unit.
+    F64-MUL: The total number of multiplication instructions operating on 64-bit floating-point
+      operands issued to the VALU per normalization unit.
+    F64-FMA: The total number of fused multiply-add instructions operating on 64-bit
+      floating-point operands issued to the VALU per normalization unit.
+    F64-Trans: The total number of transcendental instructions (such as sqrt) operating
+      on 64-bit floating-point operands issued to the VALU per normalization unit.
+    Conversion: |-
+      The total number of type conversion instructions (such as converting
+      data to or from F32↔F64) issued to the VALU per normalization unit.
+    Global/Generic Instr: The total number of global & generic memory instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Global/Generic Read: The total number of global & generic memory read instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Global/Generic Write: The total number of global & generic memory write instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Global/Generic Atomic: The total number of global & generic memory atomic (with
+      and without return) instructions executed on all compute units on the accelerator,
+      per normalization unit.
+    Spill/Stack Instr: The total number of spill/stack memory instructions executed
+      on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Read: The total number of spill/stack memory read instructions executed
+      on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Write: The total number of spill/stack memory write instructions executed
+      on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Atomic: The total number of spill/stack memory atomic (with and without
+      return) instructions executed on all compute units on the accelerator, per normalization
+      unit. Typically unused, as these memory operations are generally used to implement
+      thread-local storage.
+    MFMA-I8: The total number of 8-bit integer MFMA instructions issued per normalization
+      unit.
+    MFMA-F8: The total number of 8-bit floating point MFMA instructions issued per
+      normalization unit. This is supported in AMD Instinct MI300 series and later
+      only.
+    MFMA-F16: The total number of 16-bit floating point MFMA instructions issued per
+      normalization unit.
+    MFMA-BF16: The total number of 16-bit brain floating point MFMA instructions issued
+      per normalization unit.
+    MFMA-F32: The total number of 32-bit floating-point MFMA instructions issued per
+      normalization unit.
+    MFMA-F64: The total number of 64-bit floating-point MFMA instructions issued per
+      normalization unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1100_compute_units_compute_pipeline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1100_compute_units_compute_pipeline.yaml
index 329e28d6e8..dc36164667 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1100_compute_units_compute_pipeline.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1100_compute_units_compute_pipeline.yaml
@@ -2,84 +2,6 @@
 Panel Config:
   id: 1100
   title: Compute Units - Compute Pipeline
-  metrics_description:
-    VALU FLOPs: 'The total floating-point operations executed per second on the VALU.
-      This is also presented as a percent of the peak theoretical FLOPs achievable
-      on the specific accelerator. Note: this does not include any floating-point
-      operations from MFMA instructions.'
-    VALU IOPs: 'The total integer operations executed per second on the VALU. This
-      is also presented as a percent of the peak theoretical IOPs achievable on the
-      specific accelerator. Note: this does not include any integer operations from
-      MFMA instructions.'
-    MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations
-      executed per second. Note: this does not include any 16-bit brain floating point
-      operations from VALU instructions. This is also presented as a percent of the
-      peak theoretical BF16 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed
-      per second. Note: this does not include any 16-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F16 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed
-      per second. Note: this does not include any 32-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F32 MFMA operations achievable on the specific accelerator.'
-    MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed
-      per second. Note: this does not include any 64-bit floating point operations
-      from VALU instructions. This is also presented as a percent of the peak theoretical
-      F64 MFMA operations achievable on the specific accelerator.'
-    MFMA IOPs (INT8): 'The total number of 8-bit integer MFMA operations executed
-      per second. Note: this does not include any 8-bit integer operations from VALU
-      instructions. This is also presented as a percent of the peak theoretical INT8
-      MFMA operations achievable on the specific accelerator.'
-    IPC: The ratio of the total number of instructions executed on the CU over the
-      total active CU cycles.
-    IPC (Issued): The ratio of the total number of (non-internal) instructions issued
-      over the number of cycles where the scheduler was actively working on issuing
-      instructions.
-    SALU Utilization: Indicates what percent of the kernel's duration the SALU was
-      busy executing instructions. Computed as the ratio of the total number of cycles
-      spent by the scheduler issuing SALU / SMEM instructions over the total CU cycles.
-    VALU Utilization: Indicates what percent of the kernel's duration the VALU was
-      busy executing instructions. Does not include VMEM operations. Computed as the
-      ratio of the total number of cycles spent by the scheduler issuing VALU instructions
-      over the total CU cycles.
-    VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit
-      was busy executing instructions, including both global/generic and spill/scratch
-      operations (see the VMEM instruction count metrics for more detail). Does not
-      include VALU operations. Computed as the ratio of the total number of cycles
-      spent by the scheduler issuing VMEM instructions over the total CU cycles.
-    Branch Utilization: Indicates what percent of the kernel's duration the branch
-      unit was busy executing instructions. Computed as the ratio of the total number
-      of cycles spent by the scheduler issuing branch instructions over the total
-      CU cycles.
-    VALU Active Threads: Indicates the average level of divergence within a wavefront
-      over the lifetime of the kernel. The number of work-items that were active in
-      a wavefront during execution of each VALU instruction, time-averaged over all
-      VALU instructions run on all wavefronts in the kernel
-    MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit
-      was busy executing instructions. Computed as the ratio of the total number of
-      cycles spent by the MFMA was busy over the total CU cycles.
-    MFMA Instruction Cycles: The average duration of MFMA instructions in this kernel
-      in cycles. Computed as the ratio of the total number of cycles the MFMA unit
-      was busy over the total number of MFMA instructions.
-    VMEM Latency: The average number of round-trip cycles (that is, from issue to
-      data return / acknowledgment) required for a VMEM instruction to complete.
-    SMEM Latency: The average number of round-trip cycles (that is, from issue to
-      data return / acknowledgment) required for a SMEM instruction to complete.
-    FLOPs (Total): The total number of floating-point operations executed on either
-      the VALU or MFMA units, per normalization unit.
-    IOPs (Total): The total number of integer operations executed on either the VALU
-      or MFMA units, per normalization unit.
-    F16 OPs: The total number of 16-bit floating-point operations executed on either
-      the VALU or MFMA units, per normalization unit.
-    BF16 OPs: The total number of 16-bit brain floating-point operations executed
-      on either the VALU or MFMA units, per normalization unit.
-    F32 OPs: The total number of 32-bit floating-point operations executed on either
-      the VALU or MFMA units, per normalization unit.
-    F64 OPs: The total number of 64-bit floating-point operations executed on either
-      the VALU or MFMA units, per normalization unit.
-    INT8 OPs: The total number of 8-bit integer operations executed on either the
-      VALU or MFMA units, per normalization unit.
   data source:
   - metric_table:
       id: 1101
@@ -171,13 +93,13 @@ Panel Config:
           unit: Instr/cycle
         IPC (Issued):
           avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-            + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED  + SQ_INSTS_LDS)
+            + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
             / SQ_ACTIVE_INST_ANY))
           min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
             + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
             / SQ_ACTIVE_INST_ANY))
           max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-            + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED  + SQ_INSTS_LDS)
+            + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
             / SQ_ACTIVE_INST_ANY))
           unit: Instr/cycle
         SALU Utilization:
@@ -282,7 +204,7 @@ Panel Config:
             + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
             + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)
             + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) / $denom))
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
         IOPs (Total):
           avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
             * 512)) / $denom)
@@ -290,12 +212,12 @@ Panel Config:
             * 512)) / $denom)
           max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
             * 512)) / $denom)
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
         F8 OPs:
           avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
           min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
           max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
         F16 OPs:
           avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
             + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512
@@ -306,12 +228,12 @@ Panel Config:
           max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
             + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + (512
             * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
         BF16 OPs:
           avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
           min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
           max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
         F32 OPs:
           avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
             + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
@@ -322,7 +244,7 @@ Panel Config:
           max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
             + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
             / $denom))
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
         F64 OPs:
           avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
             + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
@@ -333,14 +255,99 @@ Panel Config:
           max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
             + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
             / $denom))
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
         F6F4 OPs:
           avg: AVG((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom)
           min: MIN((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom)
           max: MAX((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom)
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
         INT8 OPs:
           avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
           min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
           max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
-          unit: (OPs  + $normUnit)
+          unit: (OPs + $normUnit)
+  metrics_description:
+    VALU FLOPs: |-
+      The total floating-point operations executed per second on the VALU.
+      This is also presented as a percent of the peak theoretical FLOPs achievable
+      on the specific accelerator. Note: this does not include any floating-point
+      operations from MFMA instructions.
+    VALU IOPs: |-
+      The total integer operations executed per second on the VALU. This is
+      also presented as a percent of the peak theoretical IOPs achievable on the
+      specific accelerator. Note: this does not include any integer operations from
+      MFMA instructions.
+    MFMA FLOPs (BF16): |-
+      The total number of 16-bit brain floating point MFMA operations executed
+      per second. Note: this does not include any 16-bit brain floating point operations
+      from VALU instructions. This is also presented as a percent of the peak theoretical
+      BF16 MFMA operations achievable on the specific accelerator.
+    MFMA FLOPs (F16): |-
+      The total number of 16-bit floating point MFMA operations executed per
+      second. Note: this does not include any 16-bit floating point operations from
+      VALU instructions. This is also presented as a percent of the peak theoretical
+      F16 MFMA operations achievable on the specific accelerator.
+    MFMA FLOPs (F32): |-
+      The total number of 32-bit floating point MFMA operations executed per
+      second. Note: this does not include any 32-bit floating point operations from
+      VALU instructions. This is also presented as a percent of the peak theoretical
+      F32 MFMA operations achievable on the specific accelerator.
+    MFMA FLOPs (F64): |-
+      The total number of 64-bit floating point MFMA operations executed per
+      second. Note: this does not include any 64-bit floating point operations from
+      VALU instructions. This is also presented as a percent of the peak theoretical
+      F64 MFMA operations achievable on the specific accelerator.
+    MFMA IOPs (INT8): |-
+      The total number of 8-bit integer MFMA operations executed per second.
+      Note: this does not include any 8-bit integer operations from VALU instructions.
+      This is also presented as a percent of the peak theoretical INT8 MFMA operations
+      achievable on the specific accelerator.
+    IPC: The ratio of the total number of instructions executed on the CU over the
+      total active CU cycles.
+    IPC (Issued): The ratio of the total number of (non-internal) instructions issued
+      over the number of cycles where the scheduler was actively working on issuing
+      instructions.
+    SALU Utilization: Indicates what percent of the kernel's duration the SALU was
+      busy executing instructions. Computed as the ratio of the total number of cycles
+      spent by the scheduler issuing SALU / SMEM instructions over the total CU cycles.
+    VALU Utilization: Indicates what percent of the kernel's duration the VALU was
+      busy executing instructions. Does not include VMEM operations. Computed as the
+      ratio of the total number of cycles spent by the scheduler issuing VALU instructions
+      over the total CU cycles.
+    VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit
+      was busy executing instructions, including both global/generic and spill/scratch
+      operations (see the VMEM instruction count metrics for more detail). Does not
+      include VALU operations. Computed as the ratio of the total number of cycles
+      spent by the scheduler issuing VMEM instructions over the total CU cycles.
+    Branch Utilization: Indicates what percent of the kernel's duration the branch
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles spent by the scheduler issuing branch instructions over the total
+      CU cycles.
+    VALU Active Threads: Indicates the average level of divergence within a wavefront
+      over the lifetime of the kernel. The number of work-items that were active in
+      a wavefront during execution of each VALU instruction, time-averaged over all
+      VALU instructions run on all wavefronts in the kernel.
+    MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit
+      was busy executing instructions. Computed as the ratio of the total number of
+      cycles the MFMA unit was busy over the total CU cycles.
+    MFMA Instruction Cycles: The average duration of MFMA instructions in this kernel
+      in cycles. Computed as the ratio of the total number of cycles the MFMA unit
+      was busy over the total number of MFMA instructions.
+    VMEM Latency: The average number of round-trip cycles (that is, from issue to
+      data return / acknowledgment) required for a VMEM instruction to complete.
+    SMEM Latency: The average number of round-trip cycles (that is, from issue to
+      data return / acknowledgment) required for a SMEM instruction to complete.
+    FLOPs (Total): The total number of floating-point operations executed on either
+      the VALU or MFMA units, per normalization unit.
+    IOPs (Total): The total number of integer operations executed on either the VALU
+      or MFMA units, per normalization unit.
+    F16 OPs: The total number of 16-bit floating-point operations executed on either
+      the VALU or MFMA units, per normalization unit.
+    BF16 OPs: The total number of 16-bit brain floating-point operations executed
+      on either the VALU or MFMA units, per normalization unit.
+    F32 OPs: The total number of 32-bit floating-point operations executed on either
+      the VALU or MFMA units, per normalization unit.
+    F64 OPs: The total number of 64-bit floating-point operations executed on either
+      the VALU or MFMA units, per normalization unit.
+    INT8 OPs: The total number of 8-bit integer operations executed on either the
+      VALU or MFMA units, per normalization unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1200_local_data_share_lds.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1200_local_data_share_lds.yaml
index c334698661..7b839fc1f7 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1200_local_data_share_lds.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1200_local_data_share_lds.yaml
@@ -2,51 +2,6 @@
 Panel Config:
   id: 1200
   title: Local Data Share (LDS)
-  metrics_description:
-    Utilization: Indicates what percent of the kernel's duration the LDS was actively
-      executing instructions (including, but not limited to, load, store, atomic and
-      HIP's __shfl operations). Calculated as the ratio of the total number of cycles
-      LDS was active over the total CU cycles.
-    Access Rate: Indicates the percentage of SIMDs in the VALU actively issuing LDS
-      instructions, averaged over the lifetime of the kernel. Calculated as the ratio
-      of the total number of cycles spent by the scheduler issuing LDS instructions
-      over the total CU cycles.
-    Theoretical Bandwidth Utilization: Indicates the maximum amount of bytes that
-      could have been loaded from, stored to, or atomically updated in the LDS divided
-      as percentage of theoretical peak. Does not take into account the execution
-      mask of the wavefront when the instruction was executed.
-    Theoretical Bandwidth: Indicates the maximum amount of bytes that could have been
-      loaded from, stored to, or atomically updated in the LDS divided by total duration.
-      Does not take into account the execution mask of the wavefront when the instruction
-      was executed.
-    Bank Conflict Rate: Indicates the percentage of active LDS cycles that were spent
-      servicing bank conflicts. Calculated as the ratio of LDS cycles spent servicing
-      bank conflicts over the number of LDS cycles that would have been required to
-      move the same amount of data in an uncontended access.
-    LDS Instructions: The total number of LDS instructions (including, but not limited
-      to, read/write/atomics and HIP's __shfl instructions) executed per normalization
-      unit.
-    LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return
-      / acknowledgment) required for an LDS instruction to complete.
-    Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS scheduler
-      due to bank conflicts (as determined by the conflict resolution hardware) to
-      the base number of cycles that would be spent in the LDS scheduler in a completely
-      uncontended case. This is the unnormalized form of the Bank Conflict Rate.
-    Index Accesses: The total number of cycles spent in the LDS scheduler over all
-      operations per normalization unit.
-    Atomic Return Cycles: The total number of cycles spent on LDS atomics with return
-      per normalization unit.
-    Bank Conflict: The total number of cycles spent in the LDS scheduler due to bank
-      conflicts (as determined by the conflict resolution hardware) per normalization
-      unit.
-    Addr Conflict: The total number of cycles spent in the LDS scheduler due to address
-      conflicts (as determined by the conflict resolution hardware) per normalization
-      unit.
-    Unaligned Stall: The total number of cycles spent in the LDS scheduler due to
-      stalls from non-dword aligned addresses per normalization unit.
-    Mem Violations: "The total number of out-of-bounds accesses made to the LDS, per\
-      \ normalization unit. This is unused and expected to be zero in most configurations\
-      \ for modern CDNA\u2122 accelerators."
   data source:
   - metric_table:
       id: 1201
@@ -87,11 +42,11 @@ Panel Config:
           avg: AVG((SQ_INSTS_LDS / $denom))
           min: MIN((SQ_INSTS_LDS / $denom))
           max: MAX((SQ_INSTS_LDS / $denom))
-          unit: (Instr  + $normUnit)
+          unit: (Instr + $normUnit)
         LDS LOAD:
           avg: AVG((SQ_INSTS_LDS_LOAD / $denom))
-          min: MIN((SQ_INSTS_LDS_LOAD  / $denom))
-          max: MAX((SQ_INSTS_LDS_LOAD  / $denom))
+          min: MIN((SQ_INSTS_LDS_LOAD / $denom))
+          max: MAX((SQ_INSTS_LDS_LOAD / $denom))
           unit: (instr + $normUnit)
         LDS STORE:
           avg: AVG((SQ_INSTS_LDS_STORE / $denom))
@@ -147,27 +102,27 @@ Panel Config:
           avg: AVG((SQ_LDS_IDX_ACTIVE / $denom))
           min: MIN((SQ_LDS_IDX_ACTIVE / $denom))
           max: MAX((SQ_LDS_IDX_ACTIVE / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Atomic Return Cycles:
           avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom))
           min: MIN((SQ_LDS_ATOMIC_RETURN / $denom))
           max: MAX((SQ_LDS_ATOMIC_RETURN / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Bank Conflict:
           avg: AVG((SQ_LDS_BANK_CONFLICT / $denom))
           min: MIN((SQ_LDS_BANK_CONFLICT / $denom))
           max: MAX((SQ_LDS_BANK_CONFLICT / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Addr Conflict:
           avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom))
           min: MIN((SQ_LDS_ADDR_CONFLICT / $denom))
           max: MAX((SQ_LDS_ADDR_CONFLICT / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Unaligned Stall:
           avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom))
           min: MIN((SQ_LDS_UNALIGNED_STALL / $denom))
           max: MAX((SQ_LDS_UNALIGNED_STALL / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Mem Violations:
           avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom))
           min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom))
@@ -177,9 +132,55 @@ Panel Config:
           avg: AVG((SQ_LDS_CMD_FIFO_FULL / $denom))
           min: MIN((SQ_LDS_CMD_FIFO_FULL / $denom))
           max: MAX((SQ_LDS_CMD_FIFO_FULL / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         LDS Data FIFO Full Rate:
           avg: AVG((SQ_LDS_DATA_FIFO_FULL / $denom))
           min: MIN((SQ_LDS_DATA_FIFO_FULL / $denom))
           max: MAX((SQ_LDS_DATA_FIFO_FULL / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
+  metrics_description:
+    Utilization: Indicates what percent of the kernel's duration the LDS was actively
+      executing instructions (including, but not limited to, load, store, atomic and
+      HIP's __shfl operations). Calculated as the ratio of the total number of cycles
+      LDS was active over the total CU cycles.
+    Access Rate: Indicates the percentage of SIMDs in the VALU actively issuing LDS
+      instructions, averaged over the lifetime of the kernel. Calculated as the ratio
+      of the total number of cycles spent by the scheduler issuing LDS instructions
+      over the total CU cycles.
+    Theoretical Bandwidth Utilization: Indicates the maximum amount of bytes that
+      could have been loaded from, stored to, or atomically updated in the LDS, expressed
+      as a percentage of the theoretical peak. Does not take into account the execution
+      mask of the wavefront when the instruction was executed.
+    Theoretical Bandwidth: Indicates the maximum amount of bytes that could have been
+      loaded from, stored to, or atomically updated in the LDS divided by total duration.
+      Does not take into account the execution mask of the wavefront when the instruction
+      was executed.
+    Bank Conflict Rate: Indicates the percentage of active LDS cycles that were spent
+      servicing bank conflicts. Calculated as the ratio of LDS cycles spent servicing
+      bank conflicts over the number of LDS cycles that would have been required to
+      move the same amount of data in an uncontended access.
+    LDS Instructions: The total number of LDS instructions (including, but not limited
+      to, read/write/atomics and HIP's __shfl instructions) executed per normalization
+      unit.
+    LDS Latency: The average number of round-trip cycles (i.e., from issue to data-return
+      / acknowledgment) required for an LDS instruction to complete.
+    Bank Conflicts/Access: The ratio of the number of cycles spent in the LDS scheduler
+      due to bank conflicts (as determined by the conflict resolution hardware) to
+      the base number of cycles that would be spent in the LDS scheduler in a completely
+      uncontended case. This is the unnormalized form of the Bank Conflict Rate.
+    Index Accesses: The total number of cycles spent in the LDS scheduler over all
+      operations per normalization unit.
+    Atomic Return Cycles: The total number of cycles spent on LDS atomics with return
+      per normalization unit.
+    Bank Conflict: The total number of cycles spent in the LDS scheduler due to bank
+      conflicts (as determined by the conflict resolution hardware) per normalization
+      unit.
+    Addr Conflict: The total number of cycles spent in the LDS scheduler due to address
+      conflicts (as determined by the conflict resolution hardware) per normalization
+      unit.
+    Unaligned Stall: The total number of cycles spent in the LDS scheduler due to
+      stalls from non-dword aligned addresses per normalization unit.
+    Mem Violations: |-
+      The total number of out-of-bounds accesses made to the LDS, per normalization
+      unit. This is unused and expected to be zero in most configurations for
+      modern CDNA™ accelerators.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1300_instruction_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1300_instruction_cache.yaml
index aeda9bc6c7..35808d9d96 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1300_instruction_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1300_instruction_cache.yaml
@@ -2,28 +2,6 @@
 Panel Config:
   id: 1300
   title: Instruction Cache
-  metrics_description:
-    Bandwidth Utilization: The number of bytes looked up in the L1I cache, as a percent
-      of the peak theoretical bandwidth. Calculated as the ratio of L1I requests over
-      the total L1I cycles.
-    Cache Hit Rate: The percent of L1I requests that hit [#l1i-cache]_ on a previously
-      loaded line the cache. Calculated as the ratio of the number of L1I requests
-      that hit over the number of all L1I requests.
-    L1I-L2 Bandwidth Utilization: "The percent of the peak theoretical L1I \u2192\
-      \ L2 cache request bandwidth achieved. Calculated as the ratio of the total\
-      \ number of requests from the L1I to the L2 cache over the total L1I-L2 interface\
-      \ cycles."
-    L1I-L2 Bandwidth: Total number of bytes transferred across L1I - L2 interface
-      divided by total duration.
-    Req: The total number of requests made to the L1I per normalization-unit
-    Hits: The total number of L1I requests that hit on a previously loaded cache line,
-      per normalization-unit.
-    Misses - Non Duplicated: The total number of L1I requests that missed on a cache
-      line that were not already pending due to another request, per normalization-unit.
-    Misses - Duplicated: The total number of L1I requests that missed on a cache line
-      that were already pending due to another request, per normalization-unit.
-    Instruction Fetch Latency: The average number of cycles spent to fetch instructions
-      to a CU.
   data source:
   - metric_table:
       id: 1301
@@ -62,22 +40,22 @@ Panel Config:
           avg: AVG((SQC_ICACHE_REQ / $denom))
           min: MIN((SQC_ICACHE_REQ / $denom))
           max: MAX((SQC_ICACHE_REQ / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Hits:
           avg: AVG((SQC_ICACHE_HITS / $denom))
           min: MIN((SQC_ICACHE_HITS / $denom))
           max: MAX((SQC_ICACHE_HITS / $denom))
-          unit: (Hits  + $normUnit)
+          unit: (Hits + $normUnit)
         Misses - Non Duplicated:
           avg: AVG((SQC_ICACHE_MISSES / $denom))
           min: MIN((SQC_ICACHE_MISSES / $denom))
           max: MAX((SQC_ICACHE_MISSES / $denom))
-          unit: (Misses  + $normUnit)
+          unit: (Misses + $normUnit)
         Misses - Duplicated:
           avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom))
           min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom))
           max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom))
-          unit: (Misses  + $normUnit)
+          unit: (Misses + $normUnit)
         Cache Hit Rate:
           avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
             + SQC_ICACHE_MISSES_DUPLICATE)))
@@ -107,3 +85,25 @@ Panel Config:
           min: MIN(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
           max: MAX(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
           unit: Gbps
+  metrics_description:
+    Bandwidth Utilization: The number of bytes looked up in the L1I cache, as a percent
+      of the peak theoretical bandwidth. Calculated as the ratio of L1I requests over
+      the total L1I cycles.
+    Cache Hit Rate: The percent of L1I requests that hit [#l1i-cache]_ on a previously
+      loaded line in the cache. Calculated as the ratio of the number of L1I requests
+      that hit over the number of all L1I requests.
+    L1I-L2 Bandwidth Utilization: |-
+      The percent of the peak theoretical L1I → L2 cache request bandwidth
+      achieved. Calculated as the ratio of the total number of requests from the
+      L1I to the L2 cache over the total L1I-L2 interface cycles.
+    L1I-L2 Bandwidth: Total number of bytes transferred across L1I - L2 interface
+      divided by total duration.
+    Req: The total number of requests made to the L1I per normalization-unit.
+    Hits: The total number of L1I requests that hit on a previously loaded cache line,
+      per normalization-unit.
+    Misses - Non Duplicated: The total number of L1I requests that missed on a cache
+      line that were not already pending due to another request, per normalization-unit.
+    Misses - Duplicated: The total number of L1I requests that missed on a cache line
+      that were already pending due to another request, per normalization-unit.
+    Instruction Fetch Latency: The average number of cycles spent to fetch instructions
+      to a CU.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1400_scalar_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1400_scalar_l1_data_cache.yaml
index 282b97ad1f..6b73164848 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1400_scalar_l1_data_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1400_scalar_l1_data_cache.yaml
@@ -2,49 +2,6 @@
 Panel Config:
   id: 1400
   title: Scalar L1 Data Cache
-  metrics_description:
-    Bandwidth Utilization: The number of bytes looked up in the sL1D cache, as a percent
-      of the peak theoretical bandwidth. Calculated as the ratio of sL1D requests
-      over the total sL1D cycles.
-    Cache Hit Rate: Indicates the percent of sL1D requests that hit on a previously
-      loaded line the cache. The ratio of the number of sL1D requests that hit over
-      the number of all sL1D requests.
-    sL1D-L2 BW Utilization: The percentage of the peak theoretical sL1D - L2 interface
-      bandwidth acheived.\ \ Caclulated as total number of bytes read from, written
-      to, or atomically updated\ \ across the sL1D - L2 interface.
-    sL1D-L2 BW: "The total number of bytes read from, written to, or atomically updated\
-      \ across the sL1D\u2194L2 interface, divided by total duration. Note that sL1D\
-      \ writes and atomics are typically unused on current CDNA accelerators, so in\
-      \ the majority of cases this can be interpreted as an sL1D\u2192L2 read bandwidth."
-    Req: The total number of requests, of any size or type, made to the sL1D per normalization
-      unit.
-    Hits: The total number of sL1D requests that hit on a previously loaded cache
-      line, per normalization unit.
-    Misses - Non Duplicated: 'The total number of sL1D requests that missed on a cache
-      line that was not already pending due to another request, per normalization
-      unit. '
-    Misses- Duplicated: The total number of sL1D requests that missed on a cache line
-      that was already pending due to another request, per normalization unit.
-    Read Req (Total): The total number of sL1D read requests of any size, per normalization
-      unit.
-    Atomic Req: The total number of atomic requests from sL1D to the L2, per normalization
-      unit. Typically unused on current CDNA accelerators.
-    Read Req (1 DWord): The total number of sL1D read requests made for a single dword
-      of data (4B), per normalization unit.
-    Read Req (2 DWord): The total number of sL1D read requests made for a two dwords
-      of data (8B), per normalization unit.
-    Read Req (4 DWord): The total number of sL1D read requests made for a four dwords
-      of data (16B), per normalization unit.
-    Read Req (8 DWord): The total number of sL1D read requests made for a eight dwords
-      of data (32B), per normalization unit.
-    Read Req (16 DWord): The total number of sL1D read requests made for a sixteen
-      dwords of data (64B), per normalization unit.
-    Read Req: The total number of read requests from sL1D to the L2 per normalization
-      unit.
-    Write Req: The total number of write requests from sL1D to the L2, per normalization
-      unit. Typically unused on current CDNA accelerators.
-    Stall Cycles: "The total number of cycles the sL1D\u2194L2 interface was stalled,\
-      \ per normalization unit."
   data source:
   - metric_table:
       id: 1401
@@ -84,22 +41,22 @@ Panel Config:
           avg: AVG((SQC_DCACHE_REQ / $denom))
           min: MIN((SQC_DCACHE_REQ / $denom))
           max: MAX((SQC_DCACHE_REQ / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Hits:
           avg: AVG((SQC_DCACHE_HITS / $denom))
           min: MIN((SQC_DCACHE_HITS / $denom))
           max: MAX((SQC_DCACHE_HITS / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Misses - Non Duplicated:
           avg: AVG((SQC_DCACHE_MISSES / $denom))
           min: MIN((SQC_DCACHE_MISSES / $denom))
           max: MAX((SQC_DCACHE_MISSES / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Misses- Duplicated:
           avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom))
           min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom))
           max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Cache Hit Rate:
           avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
             + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
@@ -118,37 +75,37 @@ Panel Config:
             + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
           max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
             + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Atomic Req:
           avg: AVG((SQC_DCACHE_ATOMIC / $denom))
           min: MIN((SQC_DCACHE_ATOMIC / $denom))
           max: MAX((SQC_DCACHE_ATOMIC / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req (1 DWord):
           avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom))
           min: MIN((SQC_DCACHE_REQ_READ_1 / $denom))
           max: MAX((SQC_DCACHE_REQ_READ_1 / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req (2 DWord):
           avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom))
           min: MIN((SQC_DCACHE_REQ_READ_2 / $denom))
           max: MAX((SQC_DCACHE_REQ_READ_2 / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req (4 DWord):
           avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom))
           min: MIN((SQC_DCACHE_REQ_READ_4 / $denom))
           max: MAX((SQC_DCACHE_REQ_READ_4 / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req (8 DWord):
           avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom))
           min: MIN((SQC_DCACHE_REQ_READ_8 / $denom))
           max: MAX((SQC_DCACHE_REQ_READ_8 / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req (16 DWord):
           avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom))
           min: MIN((SQC_DCACHE_REQ_READ_16 / $denom))
           max: MAX((SQC_DCACHE_REQ_READ_16 / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
   - metric_table:
       id: 1403
       title: Scalar L1D Cache - L2 Interface
@@ -171,19 +128,65 @@ Panel Config:
           avg: AVG((SQC_TC_DATA_READ_REQ / $denom))
           min: MIN((SQC_TC_DATA_READ_REQ / $denom))
           max: MAX((SQC_TC_DATA_READ_REQ / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Write Req:
           avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom))
           min: MIN((SQC_TC_DATA_WRITE_REQ / $denom))
           max: MAX((SQC_TC_DATA_WRITE_REQ / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Atomic Req:
           avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom))
           min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom))
           max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Stall Cycles:
           avg: AVG((SQC_TC_STALL / $denom))
           min: MIN((SQC_TC_STALL / $denom))
           max: MAX((SQC_TC_STALL / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
+  metrics_description:
+    Bandwidth Utilization: The number of bytes looked up in the sL1D cache, as a percent
+      of the peak theoretical bandwidth. Calculated as the ratio of sL1D requests
+      over the total sL1D cycles.
+    Cache Hit Rate: Indicates the percent of sL1D requests that hit on a previously
+      loaded line in the cache. The ratio of the number of sL1D requests that hit over
+      the number of all sL1D requests.
+    sL1D-L2 BW Utilization: The percentage of the peak theoretical sL1D - L2 interface
+      bandwidth achieved. Calculated as total number of bytes read from, written to,
+      or atomically updated across the sL1D - L2 interface.
+    sL1D-L2 BW: |-
+      The total number of bytes read from, written to, or atomically updated
+      across the sL1D↔L2 interface, divided by total duration. Note that sL1D
+      writes and atomics are typically unused on current CDNA accelerators, so
+      in the majority of cases this can be interpreted as an sL1D→L2 read
+      bandwidth.
+    Req: The total number of requests, of any size or type, made to the sL1D per normalization
+      unit.
+    Hits: The total number of sL1D requests that hit on a previously loaded cache
+      line, per normalization unit.
+    Misses - Non Duplicated: |-
+      The total number of sL1D requests that missed on a cache line that was
+      not already pending due to another request, per normalization unit.
+    Misses- Duplicated: The total number of sL1D requests that missed on a cache line
+      that was already pending due to another request, per normalization unit.
+    Read Req (Total): The total number of sL1D read requests of any size, per normalization
+      unit.
+    Atomic Req: The total number of atomic requests from sL1D to the L2, per normalization
+      unit. Typically unused on current CDNA accelerators.
+    Read Req (1 DWord): The total number of sL1D read requests made for a single dword
+      of data (4B), per normalization unit.
+    Read Req (2 DWord): The total number of sL1D read requests made for two dwords
+      of data (8B), per normalization unit.
+    Read Req (4 DWord): The total number of sL1D read requests made for four dwords
+      of data (16B), per normalization unit.
+    Read Req (8 DWord): The total number of sL1D read requests made for eight dwords
+      of data (32B), per normalization unit.
+    Read Req (16 DWord): The total number of sL1D read requests made for sixteen
+      dwords of data (64B), per normalization unit.
+    Read Req: The total number of read requests from sL1D to the L2 per normalization
+      unit.
+    Write Req: The total number of write requests from sL1D to the L2, per normalization
+      unit. Typically unused on current CDNA accelerators.
+    Stall Cycles: |-
+      The total number of cycles the sL1D↔L2 interface was stalled, per
+      normalization unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1500_address_processing_unit_and_data_return_path_ta_td.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1500_address_processing_unit_and_data_return_path_ta_td.yaml
index a37f24eab6..492c000318 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1500_address_processing_unit_and_data_return_path_ta_td.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1500_address_processing_unit_and_data_return_path_ta_td.yaml
@@ -2,70 +2,6 @@
 Panel Config:
   id: 1500
   title: Address Processing Unit and Data Return Path (TA/TD)
-  metrics_description:
-    Address Processing Unit Busy: Percent of the total CU cycles the address processor
-      was busy
-    Address Stall: Percent of the total CU cycles the address processor was stalled
-      from sending address requests further into the vL1D pipeline.
-    Data Stall: Percent of the total CU cycles the address processor was stalled from
-      sending write/atomic data further into the vL1D pipeline.
-    "Data-Processor \u2192 Address Stall": Percent of total CU cycles the address
-      processor was stalled waiting to send command data to the data processor.
-    Total Instructions: The total number of memory instructions executed by the address
-      processer over all compute units on the accelerator, per normalization unit.
-    Global/Generic Instructions: The total number of global & generic memory instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Global/Generic Read Instructions: The total number of global & generic memory
-      read instructions executed on all compute units on the accelerator, per normalization
-      unit.
-    Global/Generic Write Instructions: The total number of global & generic memory
-      write instructions executed on all compute units on the accelerator, per normalization
-      unit.
-    Global/Generic Atomic Instructions: The total number of global & generic memory
-      atomic (with and without return) instructions executed on all compute units
-      on the accelerator, per normalization unit.
-    Spill/Stack Instructions: The total number of spill/stack memory instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Read Instructions: The total number of spill/stack memory read instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Write Instructions: The total number of spill/stack memory write instructions
-      executed on all compute units on the accelerator, per normalization unit.
-    Spill/Stack Atomic Instructions: The total number of spill/stack memory atomic
-      (with and without return) instructions executed on all compute units on the
-      accelerator, per normalization unit. Typically unused as these memory operations
-      are typically used to implement thread-local storage.
-    Spill/Stack Total Cycles: The number of cycles the address processing unit spent
-      working on spill/stack instructions, per normalization unit.
-    Spill/Stack Coalesced Read: The number of cycles the address processing unit spent
-      working on coalesced spill/stack read instructions, per normalization unit.
-    Spill/Stack Coalesced Write: The number of cycles the address processing unit
-      spent working on coalesced spill/stack write instructions, per normalization
-      unit.
-    Data-Return Busy: Percent of the total CU cycles the data-return unit was busy
-      processing or waiting on data to return to the CU.
-    "Cache RAM \u2192 Data-Return Stall": Percent of the total CU cycles the data-return
-      unit was stalled on data to be returned from the vL1D Cache RAM.
-    "Workgroup manager \u2192 Data-Return Stall": Percent of the total CU cycles the
-      data-return unit was stalled by the workgroup manager due to initialization
-      of registers as a part of launching new workgroups.
-    Coalescable Instructions: The number of instructions submitted to the data-return
-      unit by the address processor that were found to be coalescable, per normalization
-      unit.
-    Read Instructions: The number of read instructions submitted to the data-return
-      unit by the address processor summed over all compute units on the accelerator,
-      per normalization unit. This is expected to be the sum of global/generic and
-      spill/stack reads in the address processor.
-    Write Instructions: The number of store instructions submitted to the data-return
-      unit by the address processor summed over all compute units on the accelerator,
-      per normalization unit. This is expected to be the sum of global/generic and
-      spill/stack stores in the address processor.
-    Atomic Instructions: The number of atomic instructions submitted to the data-return
-      unit by the address processor summed over all compute units on the accelerator,
-      per normalization unit. This is expected to be the sum of global/generic and
-      spill/stack atomics in the address processor.
-    Write Ack Instructions: The total number of write acknowledgements submitted by
-      data-return unit to SQ, summed over all compute units on the accelerator, per
-      normalization unit.
   data source:
   - metric_table:
       id: 1501
@@ -135,57 +71,57 @@ Panel Config:
           avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom))
           min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom))
           max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Global/Generic Instructions:
           avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
           min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
           max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Global/Generic Read Instructions:
           avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
           min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
           max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Global/Generic Read Instructions for LDS:
           avg: AVG((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom))
           min: MIN((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom))
           max: MAX((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Global/Generic Write Instructions:
           avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
           min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
           max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Global/Generic Atomic Instructions:
           avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
           min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
           max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Spill/Stack Instructions:
           avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
           min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
           max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Spill/Stack Read Instructions:
           avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
           min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
           max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Spill/Stack Read Instructions for LDS:
           avg: AVG((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom))
           min: MIN((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom))
           max: MAX((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Spill/Stack Write Instructions:
           avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
           min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
           max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Spill/Stack Atomic Instructions:
           avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
           min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
           max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
   - metric_table:
       id: 1503
       title: Spill and stack metrics
@@ -200,17 +136,17 @@ Panel Config:
           avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
           min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
           max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Spill/Stack Coalesced Read:
           avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
           min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
           max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         Spill/Stack Coalesced Write:
           avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
           min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
           max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
   - metric_table:
       id: 1504
       title: Vector L1 data-return path or Texture Data (TD)
@@ -240,7 +176,7 @@ Panel Config:
           avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom))
           min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom))
           max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Read Instructions:
           avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
             / $denom))
@@ -248,19 +184,83 @@ Panel Config:
             / $denom))
           max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
             / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Write Instructions:
           avg: AVG((TD_STORE_WAVEFRONT_sum / $denom))
           min: MIN((TD_STORE_WAVEFRONT_sum / $denom))
           max: MAX((TD_STORE_WAVEFRONT_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Atomic Instructions:
           avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom))
           min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom))
           max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
         Write Ack Instructions:
           avg: AVG((TD_WRITE_ACKT_WAVEFRONT_sum / $denom))
           min: MIN((TD_WRITE_ACKT_WAVEFRONT_sum / $denom))
           max: MAX((TD_WRITE_ACKT_WAVEFRONT_sum / $denom))
-          unit: (Instructions  + $normUnit)
+          unit: (Instructions + $normUnit)
+  metrics_description:
+    Address Processing Unit Busy: Percent of the total CU cycles the address processor
+      was busy.
+    Address Stall: Percent of the total CU cycles the address processor was stalled
+      from sending address requests further into the vL1D pipeline.
+    Data Stall: Percent of the total CU cycles the address processor was stalled from
+      sending write/atomic data further into the vL1D pipeline.
+    "Data-Processor \u2192 Address Stall": Percent of total CU cycles the address
+      processor was stalled waiting to send command data to the data processor.
+    Total Instructions: The total number of memory instructions executed by the address
+      processor over all compute units on the accelerator, per normalization unit.
+    Global/Generic Instructions: The total number of global & generic memory instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Global/Generic Read Instructions: The total number of global & generic memory
+      read instructions executed on all compute units on the accelerator, per normalization
+      unit.
+    Global/Generic Write Instructions: The total number of global & generic memory
+      write instructions executed on all compute units on the accelerator, per normalization
+      unit.
+    Global/Generic Atomic Instructions: The total number of global & generic memory
+      atomic (with and without return) instructions executed on all compute units
+      on the accelerator, per normalization unit.
+    Spill/Stack Instructions: The total number of spill/stack memory instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Read Instructions: The total number of spill/stack memory read instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Write Instructions: The total number of spill/stack memory write instructions
+      executed on all compute units on the accelerator, per normalization unit.
+    Spill/Stack Atomic Instructions: The total number of spill/stack memory atomic
+      (with and without return) instructions executed on all compute units on the
+      accelerator, per normalization unit. Typically unused, as these memory operations
+      are generally used to implement thread-local storage.
+    Spill/Stack Total Cycles: The number of cycles the address processing unit spent
+      working on spill/stack instructions, per normalization unit.
+    Spill/Stack Coalesced Read: The number of cycles the address processing unit spent
+      working on coalesced spill/stack read instructions, per normalization unit.
+    Spill/Stack Coalesced Write: The number of cycles the address processing unit
+      spent working on coalesced spill/stack write instructions, per normalization
+      unit.
+    Data-Return Busy: Percent of the total CU cycles the data-return unit was busy
+      processing or waiting on data to return to the CU.
+    "Cache RAM \u2192 Data-Return Stall": Percent of the total CU cycles the data-return
+      unit was stalled on data to be returned from the vL1D Cache RAM.
+    "Workgroup manager \u2192 Data-Return Stall": Percent of the total CU cycles the
+      data-return unit was stalled by the workgroup manager due to initialization
+      of registers as a part of launching new workgroups.
+    Coalescable Instructions: The number of instructions submitted to the data-return
+      unit by the address processor that were found to be coalescable, per normalization
+      unit.
+    Read Instructions: The number of read instructions submitted to the data-return
+      unit by the address processor summed over all compute units on the accelerator,
+      per normalization unit. This is expected to be the sum of global/generic and
+      spill/stack reads in the address processor.
+    Write Instructions: The number of store instructions submitted to the data-return
+      unit by the address processor summed over all compute units on the accelerator,
+      per normalization unit. This is expected to be the sum of global/generic and
+      spill/stack stores in the address processor.
+    Atomic Instructions: The number of atomic instructions submitted to the data-return
+      unit by the address processor summed over all compute units on the accelerator,
+      per normalization unit. This is expected to be the sum of global/generic and
+      spill/stack atomics in the address processor.
+    Write Ack Instructions: The total number of write acknowledgements submitted by
+      the data-return unit to SQ, summed over all compute units on the accelerator, per
+      normalization unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1600_vector_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1600_vector_l1_data_cache.yaml
index 2d8ac4d781..48408d16d7 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1600_vector_l1_data_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1600_vector_l1_data_cache.yaml
@@ -2,117 +2,6 @@
 Panel Config:
   id: 1600
   title: Vector L1 Data Cache
-  metrics_description:
-    Hit rate: The ratio of the number of vL1D cache line requests that hit in vL1D
-      cache over the total number of cache line requests to the vL1D Cache RAM.
-    Bandwidth Utilization: The number of bytes looked up in the vL1D cache as a result
-      of VMEM instructions, as a percent of the peak theoretical bandwidth achievable
-      on the specific accelerator. The number of bytes is calculated as the number
-      of cache lines requested multiplied by the cache line size. This value does
-      not consider partial requests, so for instance, if only a single value is requested
-      in a cache line, the data movement will still be counted as a full cache line.
-    Utilization: Indicates how busy the vL1D Cache RAM was during the kernel execution.
-      The number of cycles where the vL1D Cache RAM is actively processing any request
-      divided by the number of cycles where the vL1D is active.
-    Coalescing: Indicates how well memory instructions were coalesced by the address
-      processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated
-      as the average number of thread-requests generated per instruction divided by
-      the ideal number of thread-requests per instruction.
-    Stalled on L2 Data: The ratio of the number of cycles where the vL1D is stalled
-      waiting for requested data to return from the L2 cache divided by the number
-      of cycles where the vL1D is active.
-    Stalled on L2 Req: The ratio of the number of cycles where the vL1D is stalled
-      waiting to issue a request for data to the L2 cache divided by the number of
-      cycles where the vL1D is active.
-    Tag RAM Stall (Read): The ratio of the number of cycles where the vL1D is stalled
-      due to Read requests with conflicting tags being looked up concurrently, divided
-      by the number of cycles where the vL1D is active.
-    Tag RAM Stall (Write): The ratio of the number of cycles where the vL1D is stalled
-      due to Write requests with conflicting tags being looked up concurrently, divided
-      by the number of cycles where the vL1D is active.
-    Tag RAM Stall (Atomic): The ratio of the number of cycles where the vL1D is stalled
-      due to Atomic requests with conflicting tags being looked up concurrently, divided
-      by the number of cycles where the vL1D is active.
-    Total Req: The total number of incoming requests from the address processing unit
-      after coalescing.
-    Read Req: The total number of incoming read requests from the address processing
-      unit after coalescing per normalization unit.
-    Write Req: The total number of incoming write requests from the address processing
-      unit after coalescing per normalization unit.
-    Atomic Req: The total number of incoming atomic requests from the address processing
-      unit after coalescing per normalization unit.
-    Cache BW: The number of bytes looked up in the vL1D cache as a result of VMEM
-      instructions divided by total duration. The number of bytes is calculated as
-      the number of cache lines requested multiplied by the cache line size.  This
-      value does not consider partial requests, so for instance, if only a single
-      value is requested in a cache line, the data movement will still be counted
-      as a full cache line.
-    Cache Hit Rate: The ratio of the number of vL1D cache line requests that hit in
-      vL1D cache over the total number of cache line requests to the vL1D Cache RAM.
-    Cache Accesses: The total number of cache line lookups in the vL1D.
-    Cache Hits: The number of cache accesses minus the number of outgoing requests
-      to the L2 cache, that is, the number of cache line requests serviced by the
-      vL1D Cache RAM per normalization unit.
-    Invalidations: The number of times the vL1D was issued a write-back invalidate
-      command during the kernel's execution per normalization unit. This may be triggered
-      by, for instance, the buffer_wbinvl1 instruction.
-    L1-L2 BW: The number of bytes transferred across the vL1D-L2 interface as a result
-      of VMEM instructions, divided by total duration. The number of bytes is calculated
-      as the number of cache lines requested multiplied by the cache line size. This
-      value does not consider partial requests, so for instance, if only a single
-      value is requested in a cache line, the data movement will still be counted
-      as a full cache line.
-    L1-L2 Read: The number of read requests for a vL1D cache line that were not satisfied
-      by the vL1D and must be retrieved from the to the L2 Cache per normalization
-      unit.
-    L1-L2 Write: The number of write requests to a vL1D cache line that were sent
-      through the vL1D to the L2 cache, per normalization unit.
-    L1-L2 Atomic: The number of atomic requests that are sent through the vL1D to
-      the L2 cache, per normalization unit. This includes requests for atomics with,
-      and without return.
-    L1 Access Latency: Calculated as the average number of cycles that a vL1D cache
-      line request spent in the vL1D cache pipeline.
-    L1-L2 Read Latency: Calculated as the average number of cycles that the vL1D cache
-      took to issue and receive read requests from the L2 Cache. This number also
-      includes requests for atomics with return values.
-    L1-L2 Write Latency: Calculated as the average number of cycles that the vL1D
-      cache took to issue and receive acknowledgement of a write request to the L2
-      Cache. This number also includes requests for atomics without return values.
-    NC - Read: Total read requests with NC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    UC - Read: Total read requests with UC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    CC - Read: Total read requests with CC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    RW - Read: Total read requests with RW mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    RW - Write: Total write requests with RW mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    NC - Write: Total write requests with NC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    UC - Write: Total write requests with UC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    CC - Write: Total write requests with CC mtype from this TCP to all TCCs Sum over
-      TCP instances per normalization unit.
-    NC - Atomic: Total atomic requests with NC mtype from this TCP to all TCCs Sum
-      over TCP instances per normalization unit.
-    UC - Atomic: Total atomic requests with UC mtype from this TCP to all TCCs Sum
-      over TCP instances per normalization unit.
-    CC - Atomic: Total atomic requests with CC mtype from this TCP to all TCCs Sum
-      over TCP instances per normalization unit.
-    RW - Atomic: Total atomic requests with RW mtype from this TCP to all TCCs Sum
-      over TCP instances per normalization unit.
-    Req: The number of translation requests made to the UTCL1 per normalization unit.
-    Hit Ratio: The ratio of the number of translation requests that hit in the UTCL1
-      divided by the total number of translation requests made to the UTCL1.
-    Hits: The number of translation requests that hit in the UTCL1, and could be reused,
-      per normalization unit.
-    Translation Misses: The total number of translation requests that missed in the
-      UTCL1 due to  translation not being present in the cache, per normalization
-      unit.
-    Permission Misses: "The total number of translation requests that missed in the\
-      \ UTCL1 due to a permission error, per normalization unit. This is unused and\
-      \ expected to be zero in most configurations for modern CDNA\u2122 accelerators."
   data source:
   - metric_table:
       id: 1601
@@ -196,17 +85,17 @@ Panel Config:
           avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom))
           min: MIN((TCP_TOTAL_ACCESSES_sum / $denom))
           max: MAX((TCP_TOTAL_ACCESSES_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req:
           avg: AVG((TCP_TOTAL_READ_sum / $denom))
           min: MIN((TCP_TOTAL_READ_sum / $denom))
           max: MAX((TCP_TOTAL_READ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Write Req:
           avg: AVG((TCP_TOTAL_WRITE_sum / $denom))
           min: MIN((TCP_TOTAL_WRITE_sum / $denom))
           max: MAX((TCP_TOTAL_WRITE_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Atomic Req:
           avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
             / $denom))
@@ -214,7 +103,7 @@ Panel Config:
             / $denom))
           max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
             / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Cache BW:
           avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
           min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
@@ -238,7 +127,7 @@ Panel Config:
           avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
           min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
           max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Cache Hits:
           avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
             + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
@@ -249,7 +138,7 @@ Panel Config:
           max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
             + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
             / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Invalidations:
           avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
           min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
@@ -267,32 +156,32 @@ Panel Config:
           avg: AVG((TCP_TAGRAM0_REQ_sum / $denom))
           min: MIN((TCP_TAGRAM0_REQ_sum / $denom))
           max: MAX((TCP_TAGRAM0_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Tag RAM 1 Req:
           avg: AVG((TCP_TAGRAM1_REQ_sum / $denom))
           min: MIN((TCP_TAGRAM1_REQ_sum / $denom))
           max: MAX((TCP_TAGRAM1_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Tag RAM 2 Req:
           avg: AVG((TCP_TAGRAM2_REQ_sum / $denom))
           min: MIN((TCP_TAGRAM2_REQ_sum / $denom))
           max: MAX((TCP_TAGRAM2_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Tag RAM 3 Req:
           avg: AVG((TCP_TAGRAM3_REQ_sum / $denom))
           min: MIN((TCP_TAGRAM3_REQ_sum / $denom))
           max: MAX((TCP_TAGRAM3_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         L1-L2 Read:
           avg: AVG((TCP_TCC_READ_REQ_sum / $denom))
           min: MIN((TCP_TCC_READ_REQ_sum / $denom))
           max: MAX((TCP_TCC_READ_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         L1-L2 Write:
           avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom))
           min: MIN((TCP_TCC_WRITE_REQ_sum / $denom))
           max: MAX((TCP_TCC_WRITE_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         L1-L2 Atomic:
           avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
             / $denom))
@@ -300,22 +189,22 @@ Panel Config:
             / $denom))
           max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
             / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         L1 Access Latency:
           avg: AVG((TCP_TCP_LATENCY_sum / $denom))
           min: MIN((TCP_TCP_LATENCY_sum / $denom))
           max: MAX((TCP_TCP_LATENCY_sum / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         L1-L2 Read Latency:
           avg: AVG((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
           min: MIN((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
           max: MAX((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
         L1-L2 Write Latency:
           avg: AVG((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
           min: MIN((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
           max: MAX((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
-          unit: (Cycles  + $normUnit)
+          unit: (Cycles + $normUnit)
   - metric_table:
       id: 1604
       title: L1D - L2 Transactions
@@ -334,84 +223,84 @@ Panel Config:
           avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom))
           min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom))
           max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         UC - Read:
           xfer: Read
           coherency: UC
           avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom))
           min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom))
           max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         CC - Read:
           xfer: Read
           coherency: CC
           avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom))
           min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom))
           max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         RW - Read:
           xfer: Read
           coherency: RW
           avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom))
           min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom))
           max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         RW - Write:
           xfer: Write
           coherency: RW
           avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom))
           min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom))
           max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         NC - Write:
           xfer: Write
           coherency: NC
           avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom))
           min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom))
           max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         UC - Write:
           xfer: Write
           coherency: UC
           avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom))
           min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom))
           max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         CC - Write:
           xfer: Write
           coherency: CC
           avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom))
           min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom))
           max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         NC - Atomic:
           xfer: Atomic
           coherency: NC
           avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
           min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
           max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         UC - Atomic:
           xfer: Atomic
           coherency: UC
           avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
           min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
           max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         CC - Atomic:
           xfer: Atomic
           coherency: CC
           avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
           min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
           max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         RW - Atomic:
           xfer: Atomic
           coherency: RW
           avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
           min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
           max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
   - metric_table:
       id: 1605
       title: L1 Unified Translation Cache (UTCL1)
@@ -505,3 +394,114 @@ Panel Config:
           min: MIN((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom))
           max: MAX((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom))
           units: (Cycles + $normUnit)
+  metrics_description:
+    Hit rate: The ratio of the number of vL1D cache line requests that hit in vL1D
+      cache over the total number of cache line requests to the vL1D Cache RAM.
+    Bandwidth Utilization: The number of bytes looked up in the vL1D cache as a result
+      of VMEM instructions, as a percent of the peak theoretical bandwidth achievable
+      on the specific accelerator. The number of bytes is calculated as the number
+      of cache lines requested multiplied by the cache line size. This value does
+      not consider partial requests, so for instance, if only a single value is requested
+      in a cache line, the data movement will still be counted as a full cache line.
+    Utilization: Indicates how busy the vL1D Cache RAM was during the kernel execution.
+      The number of cycles where the vL1D Cache RAM is actively processing any request
+      divided by the number of cycles where the vL1D is active.
+    Coalescing: Indicates how well memory instructions were coalesced by the address
+      processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). Calculated
+      as the average number of thread-requests generated per instruction divided by
+      the ideal number of thread-requests per instruction.
+    Stalled on L2 Data: The ratio of the number of cycles where the vL1D is stalled
+      waiting for requested data to return from the L2 cache divided by the number
+      of cycles where the vL1D is active.
+    Stalled on L2 Req: The ratio of the number of cycles where the vL1D is stalled
+      waiting to issue a request for data to the L2 cache divided by the number of
+      cycles where the vL1D is active.
+    Tag RAM Stall (Read): The ratio of the number of cycles where the vL1D is stalled
+      due to Read requests with conflicting tags being looked up concurrently, divided
+      by the number of cycles where the vL1D is active.
+    Tag RAM Stall (Write): The ratio of the number of cycles where the vL1D is stalled
+      due to Write requests with conflicting tags being looked up concurrently, divided
+      by the number of cycles where the vL1D is active.
+    Tag RAM Stall (Atomic): The ratio of the number of cycles where the vL1D is stalled
+      due to Atomic requests with conflicting tags being looked up concurrently, divided
+      by the number of cycles where the vL1D is active.
+    Total Req: The total number of incoming requests from the address processing unit
+      after coalescing.
+    Read Req: The total number of incoming read requests from the address processing
+      unit after coalescing per normalization unit.
+    Write Req: The total number of incoming write requests from the address processing
+      unit after coalescing per normalization unit.
+    Atomic Req: The total number of incoming atomic requests from the address processing
+      unit after coalescing per normalization unit.
+    Cache BW: The number of bytes looked up in the vL1D cache as a result of VMEM
+      instructions divided by total duration. The number of bytes is calculated as
+      the number of cache lines requested multiplied by the cache line size. This
+      value does not consider partial requests, so for instance, if only a single
+      value is requested in a cache line, the data movement will still be counted
+      as a full cache line.
+    Cache Hit Rate: The ratio of the number of vL1D cache line requests that hit in
+      vL1D cache over the total number of cache line requests to the vL1D Cache RAM.
+    Cache Accesses: The total number of cache line lookups in the vL1D.
+    Cache Hits: The number of cache accesses minus the number of outgoing requests
+      to the L2 cache, that is, the number of cache line requests serviced by the
+      vL1D Cache RAM per normalization unit.
+    Invalidations: The number of times the vL1D was issued a write-back invalidate
+      command during the kernel's execution per normalization unit. This may be triggered
+      by, for instance, the buffer_wbinvl1 instruction.
+    L1-L2 BW: The number of bytes transferred across the vL1D-L2 interface as a result
+      of VMEM instructions, divided by total duration. The number of bytes is calculated
+      as the number of cache lines requested multiplied by the cache line size. This
+      value does not consider partial requests, so for instance, if only a single
+      value is requested in a cache line, the data movement will still be counted
+      as a full cache line.
+    L1-L2 Read: The number of read requests for a vL1D cache line that were not satisfied
+      by the vL1D and must be retrieved from the L2 Cache, per normalization
+      unit.
+    L1-L2 Write: The number of write requests to a vL1D cache line that were sent
+      through the vL1D to the L2 cache, per normalization unit.
+    L1-L2 Atomic: The number of atomic requests that are sent through the vL1D to
+      the L2 cache, per normalization unit. This includes requests for atomics with
+      and without return.
+    L1 Access Latency: Calculated as the average number of cycles that a vL1D cache
+      line request spent in the vL1D cache pipeline.
+    L1-L2 Read Latency: Calculated as the average number of cycles that the vL1D cache
+      took to issue and receive read requests from the L2 Cache. This number also
+      includes requests for atomics with return values.
+    L1-L2 Write Latency: Calculated as the average number of cycles that the vL1D
+      cache took to issue and receive acknowledgement of a write request to the L2
+      Cache. This number also includes requests for atomics without return values.
+    NC - Read: Total read requests with NC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    UC - Read: Total read requests with UC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    CC - Read: Total read requests with CC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    RW - Read: Total read requests with RW mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    RW - Write: Total write requests with RW mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    NC - Write: Total write requests with NC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    UC - Write: Total write requests with UC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    CC - Write: Total write requests with CC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    NC - Atomic: Total atomic requests with NC mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    UC - Atomic: Total atomic requests with UC mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    CC - Atomic: Total atomic requests with CC mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    RW - Atomic: Total atomic requests with RW mtype from this TCP to all TCCs, summed
+      over TCP instances, per normalization unit.
+    Req: The number of translation requests made to the UTCL1 per normalization unit.
+    Hit Ratio: The ratio of the number of translation requests that hit in the UTCL1
+      divided by the total number of translation requests made to the UTCL1.
+    Hits: The number of translation requests that hit in the UTCL1, and could be reused,
+      per normalization unit.
+    Translation Misses: The total number of translation requests that missed in the
+      UTCL1 due to translation not being present in the cache, per normalization unit.
+    Permission Misses: "The total number of translation requests that missed in the
+      UTCL1 due to a permission error, per normalization unit. This is unused and
+      expected to be zero in most configurations for modern CDNA\u2122
+      accelerators."
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1700_l2_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1700_l2_cache.yaml
index 96e93d474d..40cbd3856f 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1700_l2_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1700_l2_cache.yaml
@@ -2,218 +2,6 @@
 Panel Config:
   id: 1700
   title: L2 Cache
-  metrics_description:
-    Utilization: The ratio of the number of cycles an L2 channel was active, summed
-      over all L2 channels on the accelerator over the total L2 cycles.
-    Peak Bandwidth: The number of bytes looked up in the L2 cache, as a percent of
-      the peak theoretical bandwidth achievable on the specific accelerator. The number
-      of bytes is calculated as the number of cache lines requested multiplied by
-      the cache line size. This value does not consider partial requests, so e.g.,
-      if only a single value is requested in a cache line, the data movement will
-      still be counted as a full cache line.
-    Hit Rate: The ratio of the number of L2 cache line requests that hit in the L2
-      cache over the total number of incoming cache line requests to the L2 cache.
-    L2-Fabric Read BW: The number of bytes read by the L2 over the Infinity Fabric
-      interface per unit time.
-    L2-Fabric Write and Atomic BW: The number of bytes sent by the L2 over the Infinity
-      Fabric interface by write and atomic operations per unit time.
-    HBM Bandwidth: Maximum theoretical bandwidth of the accelerator's local high-bandwidth
-      memory (HBM) per unit time. This value is calculated as the number of HBM channels
-      multiplied by the HBM channel width multiplied by the HBM clock frequency.
-    Read BW: The total number of bytes read by the L2 cache from Infinity Fabric divided
-      by total duration.
-    HBM Read Traffic: The percent of read requests generated by the L2 cache that
-      are routed to the accelerator's local high-bandwidth memory (HBM). This breakdown
-      does not consider the size of the request (meaning that 32B and 64B requests
-      are both counted as a single request), so this metric only approximates the
-      percent of the L2-Fabric Read bandwidth directed to the local HBM.
-    Remote Read Traffic: The percent of read requests generated by the L2 cache that
-      are routed to any memory location other than the accelerator's local high-bandwidth
-      memory (HBM) - for example, the CPU's DRAM or a remote accelerator's HBM. This
-      breakdown does not consider the size of the request (meaning that 32B and 64B
-      requests are both counted as a single request), so this metric only approximates
-      the percent of the L2-Fabric Read bandwidth directed to a remote location.
-    Uncached Read Traffic: The percent of read requests generated by the L2 cache
-      that are reading from an uncached memory allocation. Note, as described in the
-      request flow section, a single 64B read request is typically counted as two
-      uncached read requests. So, it is possible for the Uncached Read Traffic to
-      reach up to 200% of the total number of read requests. This breakdown does not
-      consider the size of the request (i.e., 32B and 64B requests are both counted
-      as a single request), so this metric only approximates the percent of the L2-Fabric
-      read bandwidth directed to an uncached memory location.
-    Write and Atomic BW: The total number of bytes written by the L2 over Infinity
-      Fabric by write and atomic operations divided by total duration. Note that on
-      current CDNA accelerators, such as the MI2XX, requests are only considered atomic
-      by Infinity Fabric if they are targeted at non-write-cacheable memory, for example,
-      fine-grained memory allocations or uncached memory allocations on the MI2XX.
-    HBM Write and Atomic Traffic: The percent of write and atomic requests generated
-      by the L2 cache that are routed to the accelerator's local high-bandwidth memory
-      (HBM). This breakdown does not consider the size of the request (meaning that
-      32B and 64B requests are both counted as a single request), so this metric only
-      approximates the percent of the L2-Fabric Write and Atomic bandwidth directed
-      to the local HBM. Note that on current CDNA accelerators, such as the MI2XX,
-      requests are only considered atomic by Infinity Fabric if they are targeted
-      at fine-grained memory allocations or uncached memory allocations.
-    Remote Write and Atomic Traffic: The percent of read requests generated by the
-      L2 cache that are routed to any memory location other than the accelerator's
-      local high-bandwidth memory (HBM) - for example, the CPU's DRAM or a remote
-      accelerator's HBM. This breakdown does not consider the size of the request
-      (meaning that 32B and 64B requests are both counted as a single request), so
-      this metric only approximates the percent of the L2-Fabric Read bandwidth directed
-      to a remote location. Note that on current CDNA accelerators, such as the MI2XX,
-      requests are only considered atomic by Infinity Fabric if they are targeted
-      at fine-grained memory allocations or uncached memory allocations.
-    Atomic Traffic: The percent of write requests generated by the L2 cache that are
-      atomic requests to any memory location. This breakdown does not consider the
-      size of the request (meaning that 32B and 64B requests are both counted as a
-      single request), so this metric only approximates the percent of the L2-Fabric
-      Read bandwidth directed to a remote location. Note that on current CDNA accelerators,
-      such as the MI2XX, requests are only considered atomic by Infinity Fabric if
-      they are targeted at fine-grained memory allocations or uncached memory allocations.
-    Uncached Write and Atomic Traffic: The percent of write and atomic requests generated
-      by the L2 cache that are targeting uncached memory allocations. This breakdown
-      does not consider the size of the request (meaning that 32B and 64B requests
-      are both counted as a single request), so this metric only approximates the
-      percent of the L2-Fabric read bandwidth directed to uncached memory allocations.
-    Read Latency: The time-averaged number of cycles read requests spent in Infinity
-      Fabric before data was returned to the L2.
-    Write and Atomic Latency: The time-averaged number of cycles write requests spent
-      in Infinity Fabric before a completion acknowledgement was returned to the L2.
-    Atomic Latency: The time-averaged number of cycles atomic requests spent in Infinity
-      Fabric before a completion acknowledgement (atomic without return value) or
-      data (atomic with return value) was returned to the L2.
-    Bandwidth: The number of bytes looked up in the L2 cache, divided by total duration.
-      The number of bytes is calculated as the number of cache lines requested multiplied
-      by the cache line size. This value does not consider partial requests, so for
-      example, if only a single value is requested in a cache line, the data movement
-      will still be counted as a full cache line.
-    Read Bandwidth: Total number of bytes looked up in the L2 cache for read requests,
-      divided by total duration.
-    Write Bandwidth: Total number of bytes looked up in the L2 cache for write requests,
-      divided by total duration.
-    Atomic Bandwidth: Total number of bytes looked up in the L2 cache for atomic requests,
-      divided by total duration.
-    Req: The total number of incoming requests to the L2 from all clients for all
-      request types, per normalization unit.
-    Read Req: The total number of read requests to the L2 from all clients.
-    Write Req: The total number of write requests to the L2 from all clients.
-    Atomic Req: The total number of atomic requests (with and without return) to the
-      L2 from all clients.
-    Streaming Req: The total number of incoming requests to the L2 that are marked
-      as streaming. The exact meaning of this may differ depending on the targeted
-      accelerator, however on an MI2XX this corresponds to non-temporal load or stores.
-      The L2 cache attempts to evict streaming requests before normal requests when
-      the L2 is at capacity.
-    Probe Req: The number of coherence probe requests made to the L2 cache from outside
-      the accelerator. On an MI2XX, probe requests may be generated by, for example,
-      writes to fine-grained device memory or by writes to coarse-grained device memory.
-    Cache Hit: The ratio of the number of L2 cache line requests that hit in the L2
-      cache over the total number of incoming cache line requests to the L2 cache.
-    Hits: The total number of requests to the L2 from all clients that hit in the
-      cache. As noted in the Speed-of-Light section, this includes hit-on-miss requests.
-    Misses: The total number of requests to the L2 from all clients that miss in the
-      cache. As noted in the Speed-of-Light section, these do not include hit-on-miss
-      requests.
-    Writeback: The total number of L2 cache lines written back to memory for any reason.
-      Write-backs may occur due to user code (such as HIP kernel calls to _threadfence_system
-      or atomic built-ins) by the command processor's memory acquire/release fences,
-      or for other internal hardware reasons.
-    Writeback (Internal): The total number of L2 cache lines written back to memory
-      for internal hardware reasons, per normalization unit.
-    Writeback (vL1D Req): The total number of L2 cache lines written back to memory
-      due to requests initiated by the vL1D cache, per normalization unit.
-    Evict (Internal): The total number of L2 cache lines evicted from the cache due
-      to capacity limits, per normalization unit.
-    Evict (vL1D Req): The total number of L2 cache lines evicted from the cache due
-      to invalidation requests initiated by the vL1D cache, per normalization unit.
-    NC Req: The total number of requests to the L2 to Not-hardware-Coherent (NC) memory
-      allocations, per normalization unit.
-    UC Req: The total number of requests to the L2 that go to Uncached (UC) memory
-      allocations.
-    CC Req: The total number of requests to the L2 that go to Coherently Cacheable
-      (CC) memory allocations.
-    RW Req: The total number of requests to the L2 that go to Read-Write coherent
-      memory (RW) allocations.
-    Write - Credit Starvation: The number of cycles the L2-Fabric interface was stalled
-      on write or atomic requests to any memory location because too many write/atomic
-      requests were currently in flight, as a percent of the total active L2 cycles.
-    Read (32B): The total number of L2 requests to Infinity Fabric to read 32B of
-      data from any memory location, per normalization unit.
-    Read (64B): The total number of L2 requests to Infinity Fabric to read 64B of
-      data from any memory location, per normalization unit.
-    Read (Uncached): The total number of L2 requests to Infinity Fabric to read uncached
-      data from any memory location, per normalization unit. 64B requests for uncached
-      data are counted as two 32B uncached data requests.
-    HBM Read: The total number of L2 requests to Infinity Fabric to read 32B or 64B
-      of data from the accelerator's local HBM, per normalization unit.
-    Remote Read: The total number of L2 requests to Infinity Fabric to read 32B or
-      64B of data from any source other than the accelerator's local HBM, per normalization
-      unit.
-    Read Bandwidth - PCIe: Total number of bytes due to L2 read requests due to PCIe
-      traffic, divided by total duration.
-    "Read Bandwidth - Infinity Fabric\u2122": Total number of bytes due to L2 read
-      requests due to Infinity Fabric traffic, divided by total duration.
-    Read Bandwidth - HBM: Total number of bytes due to L2 read requests due to HBM
-      traffic, divided by total duration.
-    Write and Atomic (32B): The total number of L2 requests to Infinity Fabric to
-      write or atomically update 32B of data to any memory location, per normalization
-      unit.
-    Write and Atomic (Uncached): The total number of L2 requests to Infinity Fabric
-      to write or atomically update 32B or 64B of uncached data, per normalization
-      unit.
-    Write and Atomic (64B): The total number of L2 requests to Infinity Fabric to
-      write or atomically update 64B of data in any memory location, per normalization
-      unit.
-    HBM Write and Atomic: The total number of L2 requests to Infinity Fabric to write
-      or atomically update 32B or 64B of data in the accelerator's local HBM, per
-      normalization unit.
-    Remote Write and Atomic: The total number of L2 requests to Infinity Fabric to
-      write or atomically update 32B or 64B of data in any memory location other than
-      the accelerator's local HBM, per normalization unit.
-    Write Bandwidth - PCIe: Total number of bytes due to L2 write requests due to
-      PCIe traffic, divided by total duration.
-    "Write Bandwidth - Infinity Fabric\u2122": Total number of bytes due to L2 write
-      requests due to Infinity Fabric traffic, divided by total duration.
-    Write Bandwidth - HBM: Total number of bytes due to L2 write requests due to HBM
-      traffic, divided by total duration.
-    Atomic Bandwidth - PCIe: Total number of bytes due to L2 atomic requests due to
-      PCIe traffic, divided by total duration.
-    "Atomic Bandwidth - Infinity Fabric\u2122": Total number of bytes due to L2 atomic
-      requests due to Infinity Fabric traffic, divided by total duration.
-    Atomic Bandwidth - HBM: Total number of bytes due to L2 atomic requests due to
-      HBM traffic, divided by total duration.
-    Atomic: The total number of L2 requests to Infinity Fabric to atomically update
-      32B or 64B of data in any memory location, per normalization unit. See Request
-      flow for more detail. Note that on current CDNA accelerators, such as the MI2XX,
-      requests are only considered atomic by Infinity Fabric if they are targeted
-      at non-write-cacheable memory, such as fine-grained memory allocations or uncached
-      memory allocations on the MI2XX.
-    Read Stall: "The ratio of the total number of cycles the L2-Fabric interface was\
-      \ stalled on a read request to any destination (local HBM, remote PCIe\xAE connected\
-      \ accelerator or CPU, or remote Infinity Fabric connected accelerator or CPU)\
-      \ over the total active L2 cycles."
-    Write Stall: The ratio of the total number of cycles the L2-Fabric interface was
-      stalled on a write or atomic request to any destination (local HBM, remote accelerator
-      or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected
-      accelerator or CPU) over the total active L2 cycles.
-    Read - PCIe Stall: The number of cycles the L2-Fabric interface was stalled on
-      read requests to remote PCIe connected accelerators or CPUs as a percent of
-      the total active L2 cycles.
-    Read - Infinity Fabric Stall: The number of cycles the L2-Fabric interface was
-      stalled on read requests to remote Infinity Fabric connected accelerators or
-      CPUs as a percent of the total active L2 cycles.
-    Read - HBM Stall: The number of cycles the L2-Fabric interface was stalled on
-      read requests to the accelerator's local HBM as a percent of the total active
-      L2 cycles.
-    Write - PCIe Stall: The number of cycles the L2-Fabric interface was stalled on
-      write or atomic requests to remote PCIe connected accelerators or CPUs as a
-      percent of the total active L2 cycles.
-    Write - Infinity Fabric Stall: The number of cycles the L2-Fabric interface was
-      stalled on write or atomic requests to remote Infinity Fabric connected accelerators
-      or CPUs as a percent of the total active L2 cycles.
-    Write - HBM Stall: The number of cycles the L2-Fabric interface was stalled on
-      write or atomic requests to accelerator's local HBM as a percent of the total
-      active L2 cycles.
   data source:
   - metric_table:
       id: 1701
@@ -404,42 +192,42 @@ Panel Config:
           avg: AVG((TCC_REQ_sum / $denom))
           min: MIN((TCC_REQ_sum / $denom))
           max: MAX((TCC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Req:
           avg: AVG((TCC_READ_sum / $denom))
           min: MIN((TCC_READ_sum / $denom))
           max: MAX((TCC_READ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Write Req:
           avg: AVG((TCC_WRITE_sum / $denom))
           min: MIN((TCC_WRITE_sum / $denom))
           max: MAX((TCC_WRITE_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Atomic Req:
           avg: AVG((TCC_ATOMIC_sum / $denom))
           min: MIN((TCC_ATOMIC_sum / $denom))
           max: MAX((TCC_ATOMIC_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Streaming Req:
           avg: AVG((TCC_STREAMING_REQ_sum / $denom))
           min: MIN((TCC_STREAMING_REQ_sum / $denom))
           max: MAX((TCC_STREAMING_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Bypasss Req:
           avg: AVG((TCC_BYPASS_REQ_sum / $denom))
           min: MIN((TCC_BYPASS_REQ_sum / $denom))
           max: MAX((TCC_BYPASS_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Probe Req:
           avg: AVG((TCC_PROBE_sum / $denom))
           min: MIN((TCC_PROBE_sum / $denom))
           max: MAX((TCC_PROBE_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Input Buffer Req:
           avg: AVG((TCC_IB_REQ_sum / $denom))
           min: MIN((TCC_IB_REQ_sum / $denom))
           max: MAX((TCC_IB_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Cache Hit:
           avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
             + TCC_MISS_sum) != 0) else None))
@@ -452,17 +240,17 @@ Panel Config:
           avg: AVG((TCC_HIT_sum / $denom))
           min: MIN((TCC_HIT_sum / $denom))
           max: MAX((TCC_HIT_sum / $denom))
-          unit: (Hits  + $normUnit)
+          unit: (Hits + $normUnit)
         Misses:
           avg: AVG((TCC_MISS_sum / $denom))
           min: MIN((TCC_MISS_sum / $denom))
           max: MAX((TCC_MISS_sum / $denom))
-          unit: (Misses  + $normUnit)
+          unit: (Misses + $normUnit)
         Writeback:
           avg: AVG((TCC_WRITEBACK_sum / $denom))
           min: MIN((TCC_WRITEBACK_sum / $denom))
           max: MAX((TCC_WRITEBACK_sum / $denom))
-          unit: (Cachelines  + $normUnit)
+          unit: (Cachelines + $normUnit)
         Writeback (Internal):
           avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom))
           min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom))
@@ -487,22 +275,22 @@ Panel Config:
           avg: AVG((TCC_NC_REQ_sum / $denom))
           min: MIN((TCC_NC_REQ_sum / $denom))
           max: MAX((TCC_NC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         UC Req:
           avg: AVG((TCC_UC_REQ_sum / $denom))
           min: MIN((TCC_UC_REQ_sum / $denom))
           max: MAX((TCC_UC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         CC Req:
           avg: AVG((TCC_CC_REQ_sum / $denom))
           min: MIN((TCC_CC_REQ_sum / $denom))
           max: MAX((TCC_CC_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         RW Req:
           avg: AVG((TCC_RW_REQ_sum / $denom))
           min: MIN((TCC_RW_REQ_sum / $denom))
           max: MAX((TCC_RW_REQ_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
   - metric_table:
       id: 1704
       title: L2 Cache Stalls
@@ -626,32 +414,32 @@ Panel Config:
           avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom))
           min: MIN((TCC_EA0_RDREQ_32B_sum / $denom))
           max: MAX((TCC_EA0_RDREQ_32B_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read (64B):
           avg: AVG((TCC_EA0_RDREQ_64B_sum / $denom))
           min: MIN((TCC_EA0_RDREQ_64B_sum / $denom))
           max: MAX((TCC_EA0_RDREQ_64B_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read (128B):
           avg: AVG((TCC_EA0_RDREQ_128B_sum / $denom))
           min: MIN((TCC_EA0_RDREQ_128B_sum / $denom))
           max: MAX((TCC_EA0_RDREQ_128B_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read (Uncached):
           avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
           min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
           max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         HBM Read:
           avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom))
           min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom))
           max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Remote Read:
           avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
           min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
           max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Read Bandwidth - PCIe:
           avg: AVG(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
           min: MIN(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
@@ -659,39 +447,39 @@ Panel Config:
           unit: Gbps
         "Read Bandwidth - Infinity Fabric\u2122":
           avg: AVG(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-          min: MIN(TCC_EA0_RDREQ_GMI_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
-          max: MAX(TCC_EA0_RDREQ_GMI_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
+          min: MIN(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+          max: MAX(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
           unit: Gbps
         Read Bandwidth - HBM:
-          avg: AVG(TCC_EA0_RDREQ_DRAM_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
-          min: MIN(TCC_EA0_RDREQ_DRAM_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
-          max: MAX(TCC_EA0_RDREQ_DRAM_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
+          avg: AVG(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+          min: MIN(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+          max: MAX(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
           unit: Gbps
         Write and Atomic (32B):
           avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
           min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
           max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Write and Atomic (Uncached):
           avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
           min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
           max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Write and Atomic (64B):
           avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom))
           min: MIN((TCC_EA0_WRREQ_64B_sum / $denom))
           max: MAX((TCC_EA0_WRREQ_64B_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         HBM Write and Atomic:
           avg: AVG((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
           min: MIN((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
           max: MAX((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Remote Write and Atomic:
           avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
           min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
           max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Write Bandwidth - PCIe:
           avg: AVG(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
           min: MIN(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
@@ -699,36 +487,249 @@ Panel Config:
           unit: Gbps
         "Write Bandwidth - Infinity Fabric\u2122":
           avg: AVG(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-          min: MIN(TCC_EA0_WRREQ_WRITE_GMI_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
-          max: MAX(TCC_EA0_WRREQ_WRITE_GMI_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
+          min: MIN(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+          max: MAX(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
           unit: Gbps
         Write Bandwidth - HBM:
-          avg: AVG(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
-          min: MIN(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
-          max: MAX(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
+          avg: AVG(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+          min: MIN(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+          max: MAX(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
           unit: Gbps
         Atomic:
           avg: AVG((TCC_EA0_ATOMIC_sum / $denom))
           min: MIN((TCC_EA0_ATOMIC_sum / $denom))
           max: MAX((TCC_EA0_ATOMIC_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Atomic - HBM:
           avg: AVG((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom))
           min: MIN((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom))
           max: MAX((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom))
-          unit: (Req  + $normUnit)
+          unit: (Req + $normUnit)
         Atomic Bandwidth - PCIe:
-          avg: AVG(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
-          min: MIN(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
-          max: MAX(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
+          avg: AVG(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+          min: MIN(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+          max: MAX(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
           unit: Gbps
         "Atomic Bandwidth - Infinity Fabric\u2122":
-          avg: AVG(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
-          min: MIN(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
-          max: MAX(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
+          avg: AVG(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+          min: MIN(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+          max: MAX(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
           unit: Gbps
         Atomic Bandwidth - HBM:
-          avg: AVG(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
-          min: MIN(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
-          max: MAX(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
+          avg: AVG(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+          min: MIN(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+          max: MAX(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
           unit: Gbps
+  metrics_description:
+    Utilization: The ratio of the number of cycles an L2 channel was active, summed
+      over all L2 channels on the accelerator over the total L2 cycles.
+    Peak Bandwidth: The number of bytes looked up in the L2 cache, as a percent of
+      the peak theoretical bandwidth achievable on the specific accelerator. The number
+      of bytes is calculated as the number of cache lines requested multiplied by
+      the cache line size. This value does not consider partial requests, so e.g.,
+      if only a single value is requested in a cache line, the data movement will
+      still be counted as a full cache line.
+    Hit Rate: The ratio of the number of L2 cache line requests that hit in the L2
+      cache over the total number of incoming cache line requests to the L2 cache.
+    L2-Fabric Read BW: The number of bytes read by the L2 over the Infinity Fabric
+      interface per unit time.
+    L2-Fabric Write and Atomic BW: The number of bytes sent by the L2 over the Infinity
+      Fabric interface by write and atomic operations per unit time.
+    HBM Bandwidth: Maximum theoretical bandwidth of the accelerator's local high-bandwidth
+      memory (HBM) per unit time. This value is calculated as the number of HBM channels
+      multiplied by the HBM channel width multiplied by the HBM clock frequency.
+    Read BW: The total number of bytes read by the L2 cache from Infinity Fabric divided
+      by total duration.
+    HBM Read Traffic: The percent of read requests generated by the L2 cache that
+      are routed to the accelerator's local high-bandwidth memory (HBM). This breakdown
+      does not consider the size of the request (meaning that 32B and 64B requests
+      are both counted as a single request), so this metric only approximates the
+      percent of the L2-Fabric Read bandwidth directed to the local HBM.
+    Remote Read Traffic: The percent of read requests generated by the L2 cache that
+      are routed to any memory location other than the accelerator's local high-bandwidth
+      memory (HBM) - for example, the CPU's DRAM or a remote accelerator's HBM. This
+      breakdown does not consider the size of the request (meaning that 32B and 64B
+      requests are both counted as a single request), so this metric only approximates
+      the percent of the L2-Fabric Read bandwidth directed to a remote location.
+    Uncached Read Traffic: The percent of read requests generated by the L2 cache
+      that are reading from an uncached memory allocation. Note, as described in the
+      request flow section, a single 64B read request is typically counted as two
+      uncached read requests. So, it is possible for the Uncached Read Traffic to
+      reach up to 200% of the total number of read requests. This breakdown does not
+      consider the size of the request (i.e., 32B and 64B requests are both counted
+      as a single request), so this metric only approximates the percent of the L2-Fabric
+      read bandwidth directed to an uncached memory location.
+    Write and Atomic BW: The total number of bytes written by the L2 over Infinity
+      Fabric by write and atomic operations divided by total duration. Note that on
+      current CDNA accelerators, such as the MI2XX, requests are only considered atomic
+      by Infinity Fabric if they are targeted at non-write-cacheable memory, for example,
+      fine-grained memory allocations or uncached memory allocations on the MI2XX.
+    HBM Write and Atomic Traffic: The percent of write and atomic requests generated
+      by the L2 cache that are routed to the accelerator's local high-bandwidth memory
+      (HBM). This breakdown does not consider the size of the request (meaning that
+      32B and 64B requests are both counted as a single request), so this metric only
+      approximates the percent of the L2-Fabric Write and Atomic bandwidth directed
+      to the local HBM. Note that on current CDNA accelerators, such as the MI2XX,
+      requests are only considered atomic by Infinity Fabric if they are targeted
+      at fine-grained memory allocations or uncached memory allocations.
+    Remote Write and Atomic Traffic: The percent of write and atomic requests generated
+      by the L2 cache that are routed to any memory location other than the accelerator's
+      local high-bandwidth memory (HBM) - for example, the CPU's DRAM or a remote
+      accelerator's HBM. This breakdown does not consider the size of the request
+      (meaning that 32B and 64B requests are both counted as a single request), so
+      this metric only approximates the percent of the L2-Fabric Write and Atomic
+      bandwidth directed to a remote location. Note that on current CDNA accelerators,
+      such as the MI2XX, requests are only considered atomic by Infinity Fabric if
+      they are targeted at fine-grained memory allocations or uncached memory allocations.
+    Atomic Traffic: The percent of write requests generated by the L2 cache that are
+      atomic requests to any memory location. This breakdown does not consider the
+      size of the request (meaning that 32B and 64B requests are both counted as a
+      single request), so this metric only approximates the percent of the L2-Fabric
+      Write and Atomic bandwidth due to atomic requests. Note that on current CDNA
+      accelerators, such as the MI2XX, requests are only considered atomic by Infinity Fabric
+      if they are targeted at fine-grained memory allocations or uncached memory allocations.
+    Uncached Write and Atomic Traffic: The percent of write and atomic requests generated
+      by the L2 cache that are targeting uncached memory allocations. This breakdown
+      does not consider the size of the request (meaning that 32B and 64B requests
+      are both counted as a single request), so this metric only approximates the
+      percent of the L2-Fabric Write and Atomic bandwidth directed to uncached memory allocations.
+    Read Latency: The time-averaged number of cycles read requests spent in Infinity
+      Fabric before data was returned to the L2.
+    Write and Atomic Latency: The time-averaged number of cycles write requests spent
+      in Infinity Fabric before a completion acknowledgement was returned to the L2.
+    Atomic Latency: The time-averaged number of cycles atomic requests spent in Infinity
+      Fabric before a completion acknowledgement (atomic without return value) or
+      data (atomic with return value) was returned to the L2.
+    Bandwidth: The number of bytes looked up in the L2 cache, divided by total duration.
+      The number of bytes is calculated as the number of cache lines requested multiplied
+      by the cache line size. This value does not consider partial requests, so for
+      example, if only a single value is requested in a cache line, the data movement
+      will still be counted as a full cache line.
+    Read Bandwidth: Total number of bytes looked up in the L2 cache for read requests,
+      divided by total duration.
+    Write Bandwidth: Total number of bytes looked up in the L2 cache for write requests,
+      divided by total duration.
+    Atomic Bandwidth: Total number of bytes looked up in the L2 cache for atomic requests,
+      divided by total duration.
+    Req: The total number of incoming requests to the L2 from all clients for all
+      request types, per normalization unit.
+    Read Req: The total number of read requests to the L2 from all clients.
+    Write Req: The total number of write requests to the L2 from all clients.
+    Atomic Req: The total number of atomic requests (with and without return) to the
+      L2 from all clients.
+    Streaming Req: The total number of incoming requests to the L2 that are marked
+      as streaming. The exact meaning of this may differ depending on the targeted
+      accelerator, however on an MI2XX this corresponds to non-temporal loads or stores.
+      The L2 cache attempts to evict streaming requests before normal requests when
+      the L2 is at capacity.
+    Probe Req: The number of coherence probe requests made to the L2 cache from outside
+      the accelerator. On an MI2XX, probe requests may be generated by, for example,
+      writes to fine-grained device memory or by writes to coarse-grained device memory.
+    Cache Hit: The ratio of the number of L2 cache line requests that hit in the L2
+      cache over the total number of incoming cache line requests to the L2 cache.
+    Hits: The total number of requests to the L2 from all clients that hit in the
+      cache. As noted in the Speed-of-Light section, this includes hit-on-miss requests.
+    Misses: The total number of requests to the L2 from all clients that miss in the
+      cache. As noted in the Speed-of-Light section, these do not include hit-on-miss
+      requests.
+    Writeback: The total number of L2 cache lines written back to memory for any reason.
+      Write-backs may occur due to user code (such as HIP kernel calls to __threadfence_system
+      or atomic built-ins), by the command processor's memory acquire/release fences,
+      or for other internal hardware reasons.
+    Writeback (Internal): The total number of L2 cache lines written back to memory
+      for internal hardware reasons, per normalization unit.
+    Writeback (vL1D Req): The total number of L2 cache lines written back to memory
+      due to requests initiated by the vL1D cache, per normalization unit.
+    Evict (Internal): The total number of L2 cache lines evicted from the cache due
+      to capacity limits, per normalization unit.
+    Evict (vL1D Req): The total number of L2 cache lines evicted from the cache due
+      to invalidation requests initiated by the vL1D cache, per normalization unit.
+    NC Req: The total number of requests to the L2 to Not-hardware-Coherent (NC) memory
+      allocations, per normalization unit.
+    UC Req: The total number of requests to the L2 that go to Uncached (UC) memory
+      allocations.
+    CC Req: The total number of requests to the L2 that go to Coherently Cacheable
+      (CC) memory allocations.
+    RW Req: The total number of requests to the L2 that go to Read-Write coherent
+      memory (RW) allocations.
+    Write - Credit Starvation: The number of cycles the L2-Fabric interface was stalled
+      on write or atomic requests to any memory location because too many write/atomic
+      requests were currently in flight, as a percent of the total active L2 cycles.
+    Read (32B): The total number of L2 requests to Infinity Fabric to read 32B of
+      data from any memory location, per normalization unit.
+    Read (64B): The total number of L2 requests to Infinity Fabric to read 64B of
+      data from any memory location, per normalization unit.
+    Read (Uncached): The total number of L2 requests to Infinity Fabric to read uncached
+      data from any memory location, per normalization unit. 64B requests for uncached
+      data are counted as two 32B uncached data requests.
+    HBM Read: The total number of L2 requests to Infinity Fabric to read 32B or 64B
+      of data from the accelerator's local HBM, per normalization unit.
+    Remote Read: The total number of L2 requests to Infinity Fabric to read 32B or
+      64B of data from any source other than the accelerator's local HBM, per normalization
+      unit.
+    Read Bandwidth - PCIe: Total number of bytes due to L2 read requests due to PCIe
+      traffic, divided by total duration.
+    "Read Bandwidth - Infinity Fabric\u2122": Total number of bytes due to L2 read
+      requests due to Infinity Fabric traffic, divided by total duration.
+    Read Bandwidth - HBM: Total number of bytes due to L2 read requests due to HBM
+      traffic, divided by total duration.
+    Write and Atomic (32B): The total number of L2 requests to Infinity Fabric to
+      write or atomically update 32B of data to any memory location, per normalization
+      unit.
+    Write and Atomic (Uncached): The total number of L2 requests to Infinity Fabric
+      to write or atomically update 32B or 64B of uncached data, per normalization
+      unit.
+    Write and Atomic (64B): The total number of L2 requests to Infinity Fabric to
+      write or atomically update 64B of data in any memory location, per normalization
+      unit.
+    HBM Write and Atomic: The total number of L2 requests to Infinity Fabric to write
+      or atomically update 32B or 64B of data in the accelerator's local HBM, per
+      normalization unit.
+    Remote Write and Atomic: The total number of L2 requests to Infinity Fabric to
+      write or atomically update 32B or 64B of data in any memory location other than
+      the accelerator's local HBM, per normalization unit.
+    Write Bandwidth - PCIe: Total number of bytes due to L2 write requests due to
+      PCIe traffic, divided by total duration.
+    "Write Bandwidth - Infinity Fabric\u2122": Total number of bytes due to L2 write
+      requests due to Infinity Fabric traffic, divided by total duration.
+    Write Bandwidth - HBM: Total number of bytes due to L2 write requests due to HBM
+      traffic, divided by total duration.
+    Atomic Bandwidth - PCIe: Total number of bytes due to L2 atomic requests due to
+      PCIe traffic, divided by total duration.
+    "Atomic Bandwidth - Infinity Fabric\u2122": Total number of bytes due to L2 atomic
+      requests due to Infinity Fabric traffic, divided by total duration.
+    Atomic Bandwidth - HBM: Total number of bytes due to L2 atomic requests due to
+      HBM traffic, divided by total duration.
+    Atomic: The total number of L2 requests to Infinity Fabric to atomically update
+      32B or 64B of data in any memory location, per normalization unit. See Request
+      flow for more detail. Note that on current CDNA accelerators, such as the MI2XX,
+      requests are only considered atomic by Infinity Fabric if they are targeted
+      at non-write-cacheable memory, such as fine-grained memory allocations or uncached
+      memory allocations on the MI2XX.
+    Read Stall: |-
+      The ratio of the total number of cycles the L2-Fabric interface was
+      stalled on a read request to any destination (local HBM, remote PCIe®
+      connected accelerator or CPU, or remote Infinity Fabric connected accelerator
+      or CPU) over the total active L2 cycles.
+    Write Stall: The ratio of the total number of cycles the L2-Fabric interface was
+      stalled on a write or atomic request to any destination (local HBM, remote accelerator
+      or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected
+      accelerator or CPU) over the total active L2 cycles.
+    Read - PCIe Stall: The number of cycles the L2-Fabric interface was stalled on
+      read requests to remote PCIe connected accelerators or CPUs as a percent of
+      the total active L2 cycles.
+    Read - Infinity Fabric Stall: The number of cycles the L2-Fabric interface was
+      stalled on read requests to remote Infinity Fabric connected accelerators or
+      CPUs as a percent of the total active L2 cycles.
+    Read - HBM Stall: The number of cycles the L2-Fabric interface was stalled on
+      read requests to the accelerator's local HBM as a percent of the total active
+      L2 cycles.
+    Write - PCIe Stall: The number of cycles the L2-Fabric interface was stalled on
+      write or atomic requests to remote PCIe connected accelerators or CPUs as a
+      percent of the total active L2 cycles.
+    Write - Infinity Fabric Stall: The number of cycles the L2-Fabric interface was
+      stalled on write or atomic requests to remote Infinity Fabric connected accelerators
+      or CPUs as a percent of the total active L2 cycles.
+    Write - HBM Stall: The number of cycles the L2-Fabric interface was stalled on
+      write or atomic requests to the accelerator's local HBM as a percent of the total
+      active L2 cycles.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1800_l2_cache_per_channel.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1800_l2_cache_per_channel.yaml
index 09a1298380..503206ebe5 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1800_l2_cache_per_channel.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1800_l2_cache_per_channel.yaml
@@ -2,10 +2,6 @@
 Panel Config:
   id: 1800
   title: L2 Cache (per Channel)
-  metrics_description:
-    L2 Cache Hit Rate: The percent of total number of requests to the L2 from all
-      clients that hit in the cache. As noted in the Speed-of-Light section, this
-      includes hit-on-miss requests.
   data source:
   - metric_table:
       id: 1801
@@ -255,3 +251,7 @@ Panel Config:
           ::_1: $total_l2_chan
       cli_style: simple_box
       tui_style: simple_box
+  metrics_description:
+    L2 Cache Hit Rate: The percent of total number of requests to the L2 from all
+      clients that hit in the cache. As noted in the Speed-of-Light section, this
+      includes hit-on-miss requests.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/2100_pc_sampling.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/2100_pc_sampling.yaml
index e94471d7dc..16e4d01e7e 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/2100_pc_sampling.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/2100_pc_sampling.yaml
@@ -2,10 +2,10 @@
 Panel Config:
   id: 2100
   title: PC Sampling
-  metrics_description: {}
   data source:
   - pc_sampling_table:
       id: 2101
       title: PC Sampling
       source: ps_file
       comparable: false
+  metrics_description: {}
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py b/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py
index 86a217e122..e273bb0ab1 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py
@@ -48,9 +48,11 @@ from utils.mi_gpu_spec import mi_gpu_specs
 from utils.parser import BUILD_IN_VARS, SUPPORTED_DENOM
 from utils.specs import MachineSpecs
 from utils.utils import (
+    METRIC_ID_RE,
     add_counter_extra_config_input_yaml,
     convert_metric_id_to_panel_info,
     detect_rocprof,
+    get_panel_alias,
     get_submodules,
     is_tcc_channel_counter,
     mibench,
@@ -317,6 +319,16 @@ class OmniSoC_Base:
                     texts.append(stream.read())
 
         for block_id in filter_blocks:
+            if METRIC_ID_RE.match(block_id):
+                block_id = block_id
+            else:
+                alias = block_id
+                panel_alias_dict = get_panel_alias()
+                if alias not in panel_alias_dict:
+                    raise KeyError(f"Unknown panel alias: {alias!r}")
+                block_id = panel_alias_dict[alias]  # int
+                print(f"alias: {alias}, block id: {block_id}")
+
             file_id, panel_id, metric_id = convert_metric_id_to_panel_info(block_id)
 
             # File id filtering
@@ -326,6 +338,7 @@ class OmniSoC_Base:
                     f"{config_root_dir}"
                 )
                 continue
+
             with open(config_filename_dict[file_id]) as stream:
                 file_config = yaml.safe_load(stream)
             if panel_id is None:
@@ -711,6 +724,7 @@ class OmniSoC_Base:
             or (
                 self.get_args().filter_blocks
                 and "4" not in self.get_args().filter_blocks
+                and "roof" not in self.get_args().filter_blocks
             )
         ):
             console_log("roofline", "Skipping roofline")
diff --git a/projects/rocprofiler-compute/src/utils/file_io.py b/projects/rocprofiler-compute/src/utils/file_io.py
index 2481caaf5e..99caca5694 100644
--- a/projects/rocprofiler-compute/src/utils/file_io.py
+++ b/projects/rocprofiler-compute/src/utils/file_io.py
@@ -55,7 +55,7 @@ def load_panel_configs(
     """
     configs: dict[int, dict[str, Any]] = {}
     for dir_path in dirs:
-        for yaml_file in Path(dir_path).rglob("*.yaml"):
+        for yaml_file in Path(dir_path).glob("*.yaml"):
             with open(yaml_file) as file:
                 config_yml = yaml.safe_load(file)
                 # metric key can be None due to some metric-
diff --git a/projects/rocprofiler-compute/src/utils/tty.py b/projects/rocprofiler-compute/src/utils/tty.py
index eb122c3419..ea839762a4 100644
--- a/projects/rocprofiler-compute/src/utils/tty.py
+++ b/projects/rocprofiler-compute/src/utils/tty.py
@@ -36,7 +36,12 @@ import config
 from utils import mem_chart, parser, schema
 from utils.kernel_name_shortener import kernel_name_shortener
 from utils.logger import console_error, console_log, console_warning
-from utils.utils import convert_metric_id_to_panel_info, get_uuid
+from utils.utils import (
+    METRIC_ID_RE,
+    convert_metric_id_to_panel_info,
+    get_panel_alias,
+    get_uuid,
+)
 
 
 def string_multiple_lines(source: str, width: int, max_rows: int) -> str:
@@ -140,7 +145,9 @@ def is_roofline_shown(
     )
 
     if not has_roofline_style or (
-        args.filter_metrics and "4" not in args.filter_metrics
+        args.filter_metrics
+        and "4" not in args.filter_metrics
+        and "roof" not in args.filter_metrics
     ):
         return False
 
@@ -433,21 +440,33 @@ def show_all(
     Show all panels with their data in plain text mode.
     """
     comparable_columns = parser.build_comparable_columns(args.time_unit)
-    filter_panel_ids = profiling_config.get("filter_blocks", [])
+    raw_filter_panel_ids = profiling_config.get("filter_blocks", [])
     csv_dir = None
 
-    if isinstance(filter_panel_ids, dict):
+    if isinstance(raw_filter_panel_ids, dict):
         # For backward compatibility
-        filter_panel_ids = [
+        raw_filter_panel_ids = [
             name
-            for name, table_type in filter_panel_ids.items()
+            for name, table_type in raw_filter_panel_ids.items()
             if table_type == "metric_id"
         ]
-    filter_panel_ids = [
-        int(result[0])
-        for metric_id in filter_panel_ids
-        if (result := convert_metric_id_to_panel_info(metric_id)) is not None
-    ]
+
+    panel_alias = get_panel_alias()  # alias -> panel_id (string or int)
+
+    filter_panel_ids = set()
+    for bid in raw_filter_panel_ids:
+        bid_s = str(bid)
+
+        # If it's not already an ID, resolve alias -> ID
+        if not METRIC_ID_RE.match(bid_s):
+            try:
+                bid_s = str(panel_alias[bid_s])
+            except KeyError as e:
+                raise KeyError(f"Unknown panel alias: {bid_s!r}") from e
+
+        file_id, _, _ = convert_metric_id_to_panel_info(bid_s)
+        if file_id is not None:
+            filter_panel_ids.add(int(file_id))
 
     if args.include_cols:
         hidden_cols = list(set(config.HIDDEN_COLUMNS_CLI) - set(args.include_cols))
@@ -467,19 +486,19 @@ def show_all(
         if len(args.path) > 1 and panel_id in config.HIDDEN_SECTIONS:
             continue
 
-        panel_content = ""  # store content of all data_source from one panel
+        if panel_id == 400 and not is_roofline_shown(
+            args, runs, output, panel, roof_plot, hidden_cols
+        ):
+            continue
 
-        if panel_id == 400:
-            if is_roofline_shown(args, runs, output, panel, roof_plot, hidden_cols):
-                continue
+        panel_content = ""  # store content of all data_source from one panel
 
         for data_source in panel["data source"]:
             for table_type, table_config in data_source.items():
-                # If block filtering was used during analysis, then don't use profiling
-                # config. If block filtering was used in profiling config, only show
-                # those panels. If block filtering not used in profiling config, show
-                # all panels. Skip this table if table id or panel id is not present
-                # in block filters. However, always show panel id <= 100.
+                # Block-filter logic:
+                # - If analysis used --filter-metrics, ignore profiling block filters
+                # - If profiling had block filters, only show selected tables/panels
+                # - Always show panels with id <= 100
                 if (
                     not args.filter_metrics
                     and filter_panel_ids
@@ -497,9 +516,8 @@ def show_all(
                     )
                     continue
 
-                # Metrics baseline comparison mode
+                # Metrics baseline comparison mode: only show common metrics across runs
                 # We cannot guarantee that all runs have the same metrics.
-                # Only show common metrics.
                 if (
                     table_type == "metric_table"
                     and "Metric" in table_config["header"].values()
diff --git a/projects/rocprofiler-compute/src/utils/utils.py b/projects/rocprofiler-compute/src/utils/utils.py
index aa76fa8500..750c5f2353 100644
--- a/projects/rocprofiler-compute/src/utils/utils.py
+++ b/projects/rocprofiler-compute/src/utils/utils.py
@@ -60,6 +60,8 @@ from utils.logger import (
     demarcate,
 )
 
+METRIC_ID_RE = re.compile(pattern=r"^\d{1,2}(?:\.\d{1,2}){0,2}$")
+
 rocprof_cmd = ""
 rocprof_args = ""
 
@@ -1629,3 +1631,16 @@ def format_scientific_notation_if_needed(
         formatted = normal_str
 
     return formatted
+
+
+def load_yaml(filepath: str) -> dict[str, Any]:
+    """Load YAML file and return as dictionary."""
+    with open(filepath) as f:
+        return yaml.safe_load(f)
+
+
+def get_panel_alias() -> dict[str, str]:
+    panel_yaml = load_yaml("tools/config_management/gfx9_config_template.yaml")
+    return {
+        panel["panel_alias"]: str(panel["panel_id"]) for panel in panel_yaml["panels"]
+    }
diff --git a/projects/rocprofiler-compute/tests/conftest.py b/projects/rocprofiler-compute/tests/conftest.py
index 66972235b0..e9f729c658 100644
--- a/projects/rocprofiler-compute/tests/conftest.py
+++ b/projects/rocprofiler-compute/tests/conftest.py
@@ -25,12 +25,18 @@
 
 import os
 import subprocess
+import sys
 from importlib.machinery import SourceFileLoader
 from pathlib import Path
 from unittest.mock import patch
 
 import pytest
 
+ROOT = os.path.dirname(os.path.dirname(__file__))
+SRC = os.path.join(ROOT, "src")
+if SRC not in sys.path:
+    sys.path.insert(0, SRC)
+
 rocprof_compute = SourceFileLoader(
     "rocprof-compute", "src/rocprof-compute"
 ).load_module()
diff --git a/projects/rocprofiler-compute/tests/test_autogen_config.py b/projects/rocprofiler-compute/tests/test_autogen_config.py
index acd8337520..9ef3d3325f 100644
--- a/projects/rocprofiler-compute/tests/test_autogen_config.py
+++ b/projects/rocprofiler-compute/tests/test_autogen_config.py
@@ -31,7 +31,7 @@ import yaml
 
 def test_modification_time():
     # Ensure hash map consistency
-    hash_path = Path("utils/autogen_hash.yaml")
+    hash_path = Path("tools/autogen_hash.yaml")
     with open(hash_path) as f:
         hash_map = yaml.safe_load(f)
     for file, hash in hash_map.items():
diff --git a/projects/rocprofiler-compute/tools/__init__.py b/projects/rocprofiler-compute/tools/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/projects/rocprofiler-compute/tools/autogen_hash.yaml b/projects/rocprofiler-compute/tools/autogen_hash.yaml
new file mode 100644
index 0000000000..0b3955ff1a
--- /dev/null
+++ b/projects/rocprofiler-compute/tools/autogen_hash.yaml
@@ -0,0 +1,116 @@
+# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py
+src/rocprof_compute_soc/analysis_configs/gfx908/0000_top_stats.yaml: ad7818c680acb0d4e3cb624e0f14f79d44fa7efe14531c5643f47ac96266c91d
+src/rocprof_compute_soc/analysis_configs/gfx90a/0000_top_stats.yaml: ad7818c680acb0d4e3cb624e0f14f79d44fa7efe14531c5643f47ac96266c91d
+src/rocprof_compute_soc/analysis_configs/gfx940/0000_top_stats.yaml: ad7818c680acb0d4e3cb624e0f14f79d44fa7efe14531c5643f47ac96266c91d
+src/rocprof_compute_soc/analysis_configs/gfx941/0000_top_stats.yaml: ad7818c680acb0d4e3cb624e0f14f79d44fa7efe14531c5643f47ac96266c91d
+src/rocprof_compute_soc/analysis_configs/gfx942/0000_top_stats.yaml: ad7818c680acb0d4e3cb624e0f14f79d44fa7efe14531c5643f47ac96266c91d
+src/rocprof_compute_soc/analysis_configs/gfx950/0000_top_stats.yaml: ad7818c680acb0d4e3cb624e0f14f79d44fa7efe14531c5643f47ac96266c91d
+src/rocprof_compute_soc/analysis_configs/gfx908/0100_system_info.yaml: d95eea137c439cc2aa4ac5273f06ac6a05037a74550bc23a095162ee366d39cb
+src/rocprof_compute_soc/analysis_configs/gfx90a/0100_system_info.yaml: d95eea137c439cc2aa4ac5273f06ac6a05037a74550bc23a095162ee366d39cb
+src/rocprof_compute_soc/analysis_configs/gfx940/0100_system_info.yaml: d95eea137c439cc2aa4ac5273f06ac6a05037a74550bc23a095162ee366d39cb
+src/rocprof_compute_soc/analysis_configs/gfx941/0100_system_info.yaml: d95eea137c439cc2aa4ac5273f06ac6a05037a74550bc23a095162ee366d39cb
+src/rocprof_compute_soc/analysis_configs/gfx942/0100_system_info.yaml: d95eea137c439cc2aa4ac5273f06ac6a05037a74550bc23a095162ee366d39cb
+src/rocprof_compute_soc/analysis_configs/gfx950/0100_system_info.yaml: d95eea137c439cc2aa4ac5273f06ac6a05037a74550bc23a095162ee366d39cb
+src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml: aa60b7a75e46196195675a1c8d6aa65211483ace8dfe346ed0228056586bc8a5
+src/rocprof_compute_soc/analysis_configs/gfx90a/0200_system_speed_of_light.yaml: 54d0ef58f8222463516984d3b9153806f5185de9e719d1903537af4c8344a4f4
+src/rocprof_compute_soc/analysis_configs/gfx940/0200_system_speed_of_light.yaml: a6a5d78d76eb39471249c4c55ccea2e8084a5136c01d29aaeb87d308cce05d2e
+src/rocprof_compute_soc/analysis_configs/gfx941/0200_system_speed_of_light.yaml: 352d4702fbebd8550883b777b875893a8404a7909d83c74cdd50c1b713452c81
+src/rocprof_compute_soc/analysis_configs/gfx942/0200_system_speed_of_light.yaml: a6a5d78d76eb39471249c4c55ccea2e8084a5136c01d29aaeb87d308cce05d2e
+src/rocprof_compute_soc/analysis_configs/gfx950/0200_system_speed_of_light.yaml: 1a164dfbb551e4b0a8a55a843d776738d90406cdbe2930e0f474b77a075a7353
+src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml: ff5fd164694f454a95ccd52c8c0bfa20aebfa476908cab2ac03215fb33e48598
+src/rocprof_compute_soc/analysis_configs/gfx90a/0300_memory_chart.yaml: 332c1965f462e75a479ddf3270294e1cf723701eb08b60c6cea550eb3bc192e7
+src/rocprof_compute_soc/analysis_configs/gfx940/0300_memory_chart.yaml: 92dc15222a707fff79ce2084172ae2068465bfe064b89538ca7e83359422dfc8
+src/rocprof_compute_soc/analysis_configs/gfx941/0300_memory_chart.yaml: 92dc15222a707fff79ce2084172ae2068465bfe064b89538ca7e83359422dfc8
+src/rocprof_compute_soc/analysis_configs/gfx942/0300_memory_chart.yaml: 92dc15222a707fff79ce2084172ae2068465bfe064b89538ca7e83359422dfc8
+src/rocprof_compute_soc/analysis_configs/gfx950/0300_memory_chart.yaml: d3a2e085061068ff8cff0b80f6944dc866ec3e748cf1e4c0cfcd76e1e14d21f8
+src/rocprof_compute_soc/analysis_configs/gfx908/0400_roofline.yaml: e91988af6d99a03e2a19593155447f79abe64dc128a83a170a5037ab466b238c
+src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline.yaml: 0807c87d20faed19f2ef9470e9277715f2287e687aa831a328dcab4915a38812
+src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline.yaml: f5f35d1ae9a35fe83bcdf572aa788401c14cc6718761c4cf8e4dddcf249c3548
+src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline.yaml: 760ecef9947fa31d3a0fb5c45d653060d06213d8d9f216c19cbb1b1ce29942b6
+src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline.yaml: e037ce1a2cf8ba08e2317e322b56954caace6ec2427a966acbabf2135cd89855
+src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml: f53b2a92b3ac051290eff9b1f63343c30e6cd223b9cbf9d30a93ef4a5ff158b3
+src/rocprof_compute_soc/analysis_configs/gfx908/0500_command_processor_cpc_cpf.yaml: 649bec27b9ccee34c96520c1f6bc0977779a8c4f8a58ee21ff59d61207962966
+src/rocprof_compute_soc/analysis_configs/gfx90a/0500_command_processor_cpc_cpf.yaml: 649bec27b9ccee34c96520c1f6bc0977779a8c4f8a58ee21ff59d61207962966
+src/rocprof_compute_soc/analysis_configs/gfx940/0500_command_processor_cpc_cpf.yaml: 649bec27b9ccee34c96520c1f6bc0977779a8c4f8a58ee21ff59d61207962966
+src/rocprof_compute_soc/analysis_configs/gfx941/0500_command_processor_cpc_cpf.yaml: 649bec27b9ccee34c96520c1f6bc0977779a8c4f8a58ee21ff59d61207962966
+src/rocprof_compute_soc/analysis_configs/gfx942/0500_command_processor_cpc_cpf.yaml: 649bec27b9ccee34c96520c1f6bc0977779a8c4f8a58ee21ff59d61207962966
+src/rocprof_compute_soc/analysis_configs/gfx950/0500_command_processor_cpc_cpf.yaml: 1e4c1bc1158398df8966d24e56b7d434458ce10ade9e13f168887d9a0d9abaef
+src/rocprof_compute_soc/analysis_configs/gfx908/0600_workgroup_manager_spi.yaml: 2bcb7045609e8ff023c9bfa384e63f6a2cc926ff3261f3eab6737f89a899809e
+src/rocprof_compute_soc/analysis_configs/gfx90a/0600_workgroup_manager_spi.yaml: 2bcb7045609e8ff023c9bfa384e63f6a2cc926ff3261f3eab6737f89a899809e
+src/rocprof_compute_soc/analysis_configs/gfx940/0600_workgroup_manager_spi.yaml: 2bcb7045609e8ff023c9bfa384e63f6a2cc926ff3261f3eab6737f89a899809e
+src/rocprof_compute_soc/analysis_configs/gfx941/0600_workgroup_manager_spi.yaml: 2bcb7045609e8ff023c9bfa384e63f6a2cc926ff3261f3eab6737f89a899809e
+src/rocprof_compute_soc/analysis_configs/gfx942/0600_workgroup_manager_spi.yaml: 2bcb7045609e8ff023c9bfa384e63f6a2cc926ff3261f3eab6737f89a899809e
+src/rocprof_compute_soc/analysis_configs/gfx950/0600_workgroup_manager_spi.yaml: 6d97f3ebf3bef1d164255d4c4979e43d7f313f1eda067324aad9be06be98f090
+src/rocprof_compute_soc/analysis_configs/gfx908/0700_wavefront.yaml: da9fb740f9dfafa43c8d0401d22082915d1c04021e07fb8003ac1f31005e282b
+src/rocprof_compute_soc/analysis_configs/gfx90a/0700_wavefront.yaml: da9fb740f9dfafa43c8d0401d22082915d1c04021e07fb8003ac1f31005e282b
+src/rocprof_compute_soc/analysis_configs/gfx940/0700_wavefront.yaml: da9fb740f9dfafa43c8d0401d22082915d1c04021e07fb8003ac1f31005e282b
+src/rocprof_compute_soc/analysis_configs/gfx941/0700_wavefront.yaml: da9fb740f9dfafa43c8d0401d22082915d1c04021e07fb8003ac1f31005e282b
+src/rocprof_compute_soc/analysis_configs/gfx942/0700_wavefront.yaml: da9fb740f9dfafa43c8d0401d22082915d1c04021e07fb8003ac1f31005e282b
+src/rocprof_compute_soc/analysis_configs/gfx950/0700_wavefront.yaml: a6012921ec2e5984861d34ebfca416703b00f3b2cd4cb07541378a285a58b778
+src/rocprof_compute_soc/analysis_configs/gfx908/1000_compute_units_instruction_mix.yaml: 82ef2f27395f2887d1385a33b1d4bcb7cb646ece11146fe1238af2a2fc49108f
+src/rocprof_compute_soc/analysis_configs/gfx90a/1000_compute_units_instruction_mix.yaml: e58c1dff540e06ec3021ae4e852cec5a116e978f00f3e0902b74b5d86f1b88ac
+src/rocprof_compute_soc/analysis_configs/gfx940/1000_compute_units_instruction_mix.yaml: c74ada0b2cd9eda1e1115679267343e7afad9c9638b3a54b3f98193ae9637e09
+src/rocprof_compute_soc/analysis_configs/gfx941/1000_compute_units_instruction_mix.yaml: c74ada0b2cd9eda1e1115679267343e7afad9c9638b3a54b3f98193ae9637e09
+src/rocprof_compute_soc/analysis_configs/gfx942/1000_compute_units_instruction_mix.yaml: c74ada0b2cd9eda1e1115679267343e7afad9c9638b3a54b3f98193ae9637e09
+src/rocprof_compute_soc/analysis_configs/gfx950/1000_compute_units_instruction_mix.yaml: a0fe88305b0972c0702e542558c0d491eac26438577660e58817e988b7b1f0d4
+src/rocprof_compute_soc/analysis_configs/gfx908/1100_compute_units_compute_pipeline.yaml: e815205890d9c815f7f53cdaa64eeef6219bce83054b92fa2be25e240093bdb0
+src/rocprof_compute_soc/analysis_configs/gfx90a/1100_compute_units_compute_pipeline.yaml: b44f500ee07856ec8c59afa1ebb0a204d8b5f3247a43725ba16782484fef6ad1
+src/rocprof_compute_soc/analysis_configs/gfx940/1100_compute_units_compute_pipeline.yaml: e493741974eae65d88afd4fa98b6b3089fb483900b17af2630be18160964d80c
+src/rocprof_compute_soc/analysis_configs/gfx941/1100_compute_units_compute_pipeline.yaml: e493741974eae65d88afd4fa98b6b3089fb483900b17af2630be18160964d80c
+src/rocprof_compute_soc/analysis_configs/gfx942/1100_compute_units_compute_pipeline.yaml: e493741974eae65d88afd4fa98b6b3089fb483900b17af2630be18160964d80c
+src/rocprof_compute_soc/analysis_configs/gfx950/1100_compute_units_compute_pipeline.yaml: 4797cd3052fdb37278aa9a28572287c1a9a7228f05a77ce22c0eb4786cbbd404
+src/rocprof_compute_soc/analysis_configs/gfx908/1200_local_data_share_lds.yaml: 307733f9fee02c620558e2ee4ca3978954f62c3fab26cc98766511b93e96d54c
+src/rocprof_compute_soc/analysis_configs/gfx90a/1200_local_data_share_lds.yaml: 307733f9fee02c620558e2ee4ca3978954f62c3fab26cc98766511b93e96d54c
+src/rocprof_compute_soc/analysis_configs/gfx940/1200_local_data_share_lds.yaml: 307733f9fee02c620558e2ee4ca3978954f62c3fab26cc98766511b93e96d54c
+src/rocprof_compute_soc/analysis_configs/gfx941/1200_local_data_share_lds.yaml: 307733f9fee02c620558e2ee4ca3978954f62c3fab26cc98766511b93e96d54c
+src/rocprof_compute_soc/analysis_configs/gfx942/1200_local_data_share_lds.yaml: 307733f9fee02c620558e2ee4ca3978954f62c3fab26cc98766511b93e96d54c
+src/rocprof_compute_soc/analysis_configs/gfx950/1200_local_data_share_lds.yaml: 35c98741e9b5afd2f7638d2675b22138f5854168e15bc4633112857ed94edbc1
+src/rocprof_compute_soc/analysis_configs/gfx908/1300_instruction_cache.yaml: d2b0a455e9f28d66e6cef701d598072285c58eeebea2d08e1864a8602cdd797a
+src/rocprof_compute_soc/analysis_configs/gfx90a/1300_instruction_cache.yaml: d2b0a455e9f28d66e6cef701d598072285c58eeebea2d08e1864a8602cdd797a
+src/rocprof_compute_soc/analysis_configs/gfx940/1300_instruction_cache.yaml: d2b0a455e9f28d66e6cef701d598072285c58eeebea2d08e1864a8602cdd797a
+src/rocprof_compute_soc/analysis_configs/gfx941/1300_instruction_cache.yaml: d2b0a455e9f28d66e6cef701d598072285c58eeebea2d08e1864a8602cdd797a
+src/rocprof_compute_soc/analysis_configs/gfx942/1300_instruction_cache.yaml: d2b0a455e9f28d66e6cef701d598072285c58eeebea2d08e1864a8602cdd797a
+src/rocprof_compute_soc/analysis_configs/gfx950/1300_instruction_cache.yaml: d2b0a455e9f28d66e6cef701d598072285c58eeebea2d08e1864a8602cdd797a
+src/rocprof_compute_soc/analysis_configs/gfx908/1400_scalar_l1_data_cache.yaml: ccbe9a1309177db8760727f256cd14a7612708833828068dd2bede73ad319d75
+src/rocprof_compute_soc/analysis_configs/gfx90a/1400_scalar_l1_data_cache.yaml: ccbe9a1309177db8760727f256cd14a7612708833828068dd2bede73ad319d75
+src/rocprof_compute_soc/analysis_configs/gfx940/1400_scalar_l1_data_cache.yaml: ccbe9a1309177db8760727f256cd14a7612708833828068dd2bede73ad319d75
+src/rocprof_compute_soc/analysis_configs/gfx941/1400_scalar_l1_data_cache.yaml: ccbe9a1309177db8760727f256cd14a7612708833828068dd2bede73ad319d75
+src/rocprof_compute_soc/analysis_configs/gfx942/1400_scalar_l1_data_cache.yaml: ccbe9a1309177db8760727f256cd14a7612708833828068dd2bede73ad319d75
+src/rocprof_compute_soc/analysis_configs/gfx950/1400_scalar_l1_data_cache.yaml: ccbe9a1309177db8760727f256cd14a7612708833828068dd2bede73ad319d75
+src/rocprof_compute_soc/analysis_configs/gfx908/1500_address_processing_unit_and_data_return_path_ta_td.yaml: b98a800c31da0275704e076e561468dccdaf0b8bff1cc8d74a4e6bf9c7be2973
+src/rocprof_compute_soc/analysis_configs/gfx90a/1500_address_processing_unit_and_data_return_path_ta_td.yaml: 58834a04fc4fb6f9eb648a6b8944f737ce4a8c9d4a6c5f75104d9fd528f520a6
+src/rocprof_compute_soc/analysis_configs/gfx940/1500_address_processing_unit_and_data_return_path_ta_td.yaml: 7f37bcd01557a45aa5ed9009962a9f2499ad924a6a07d7d25a3af97138f360f8
+src/rocprof_compute_soc/analysis_configs/gfx941/1500_address_processing_unit_and_data_return_path_ta_td.yaml: 7f37bcd01557a45aa5ed9009962a9f2499ad924a6a07d7d25a3af97138f360f8
+src/rocprof_compute_soc/analysis_configs/gfx942/1500_address_processing_unit_and_data_return_path_ta_td.yaml: 7f37bcd01557a45aa5ed9009962a9f2499ad924a6a07d7d25a3af97138f360f8
+src/rocprof_compute_soc/analysis_configs/gfx950/1500_address_processing_unit_and_data_return_path_ta_td.yaml: 5c6555a93b01c057f01e0b0cef3169eeb324ca8c256c42f5f9fc0d1ea131486b
+src/rocprof_compute_soc/analysis_configs/gfx908/1600_vector_l1_data_cache.yaml: 4fcb618450366a29c09e428368e1a9afd29a0b80ec3f03a5b3d55a2111bd5704
+src/rocprof_compute_soc/analysis_configs/gfx90a/1600_vector_l1_data_cache.yaml: 4fcb618450366a29c09e428368e1a9afd29a0b80ec3f03a5b3d55a2111bd5704
+src/rocprof_compute_soc/analysis_configs/gfx940/1600_vector_l1_data_cache.yaml: 1c25e20d701aff1ab9276a29cfd5f219b24c621b534aa5b86d1b78d2ae2f300a
+src/rocprof_compute_soc/analysis_configs/gfx941/1600_vector_l1_data_cache.yaml: 1c25e20d701aff1ab9276a29cfd5f219b24c621b534aa5b86d1b78d2ae2f300a
+src/rocprof_compute_soc/analysis_configs/gfx942/1600_vector_l1_data_cache.yaml: 1c25e20d701aff1ab9276a29cfd5f219b24c621b534aa5b86d1b78d2ae2f300a
+src/rocprof_compute_soc/analysis_configs/gfx950/1600_vector_l1_data_cache.yaml: 3cec51c5a848c4f513c4c0a74aa35a5657289148a67179f8db4ea3e55bdb6ac3
+src/rocprof_compute_soc/analysis_configs/gfx908/1700_l2_cache.yaml: e37693ef03caf3d77ae7b91c3c166d033fa0732880cc50a21b8c06a4e79b1f38
+src/rocprof_compute_soc/analysis_configs/gfx90a/1700_l2_cache.yaml: 3314a1e473b1cfc95b742b1a8cfbc47d4602061ca89d7a4ac89ea7cc15908962
+src/rocprof_compute_soc/analysis_configs/gfx940/1700_l2_cache.yaml: cb8922a41dd2088e8e2b0c1e82c7b95fa55304cf90435b217da128234805d77a
+src/rocprof_compute_soc/analysis_configs/gfx941/1700_l2_cache.yaml: 2187f141480a2c57b271ded46255735510de5197441de830cf1efa9345e5566a
+src/rocprof_compute_soc/analysis_configs/gfx942/1700_l2_cache.yaml: 7ce34989a66b8f8750cf1bf76f5cdaf59bf662a7205355f6fe12cace796d4ceb
+src/rocprof_compute_soc/analysis_configs/gfx950/1700_l2_cache.yaml: a3a8db0f555cd1069a61dfc3b89df83e9423d4a0200f1401c7612942ff75152e
+src/rocprof_compute_soc/analysis_configs/gfx908/1800_l2_cache_per_channel.yaml: fd32454bf9f0d3027c77a85ea6be308e92f6815d0ea732c6bafacc8e0f32a25f
+src/rocprof_compute_soc/analysis_configs/gfx90a/1800_l2_cache_per_channel.yaml: 23e9a258ab541d24d29cde2237f9445db695e7a4d17d5974cb4fd5ff9a9869c0
+src/rocprof_compute_soc/analysis_configs/gfx940/1800_l2_cache_per_channel.yaml: eb0823823506bfe0d40931fd69c435baab4979d2dfee158dc33c3651721f9f33
+src/rocprof_compute_soc/analysis_configs/gfx941/1800_l2_cache_per_channel.yaml: eb0823823506bfe0d40931fd69c435baab4979d2dfee158dc33c3651721f9f33
+src/rocprof_compute_soc/analysis_configs/gfx942/1800_l2_cache_per_channel.yaml: eb0823823506bfe0d40931fd69c435baab4979d2dfee158dc33c3651721f9f33
+src/rocprof_compute_soc/analysis_configs/gfx950/1800_l2_cache_per_channel.yaml: b6336ab78a97fb9750e2f925893a5acc4e66e43ac60472c20225e56c440983d7
+src/rocprof_compute_soc/analysis_configs/gfx908/2100_pc_sampling.yaml: efa16d3aadc3363bcc067895a978b82c1e06fa90882dfe33e03315e2c0425d36
+src/rocprof_compute_soc/analysis_configs/gfx90a/2100_pc_sampling.yaml: efa16d3aadc3363bcc067895a978b82c1e06fa90882dfe33e03315e2c0425d36
+src/rocprof_compute_soc/analysis_configs/gfx940/2100_pc_sampling.yaml: efa16d3aadc3363bcc067895a978b82c1e06fa90882dfe33e03315e2c0425d36
+src/rocprof_compute_soc/analysis_configs/gfx941/2100_pc_sampling.yaml: efa16d3aadc3363bcc067895a978b82c1e06fa90882dfe33e03315e2c0425d36
+src/rocprof_compute_soc/analysis_configs/gfx942/2100_pc_sampling.yaml: efa16d3aadc3363bcc067895a978b82c1e06fa90882dfe33e03315e2c0425d36
+src/rocprof_compute_soc/analysis_configs/gfx950/2100_pc_sampling.yaml: efa16d3aadc3363bcc067895a978b82c1e06fa90882dfe33e03315e2c0425d36
+src/rocprof_compute_soc/profile_configs/sets/gfx908_sets.yaml: ee28989e70d0537db8b0f0a4bc5499444b44ff0e73d3e7f2926943be11d0aeda
+src/rocprof_compute_soc/profile_configs/sets/gfx90a_sets.yaml: 9c9533174a3f7bd5c8e09ec998743c7bb2642c4ce3f818b546673be9cafc40a8
+src/rocprof_compute_soc/profile_configs/sets/gfx940_sets.yaml: 44cd2b32b050cafa73d0ead5703b82836edf25a057c21699046b6b8b8918b242
+src/rocprof_compute_soc/profile_configs/sets/gfx941_sets.yaml: 44cd2b32b050cafa73d0ead5703b82836edf25a057c21699046b6b8b8918b242
+src/rocprof_compute_soc/profile_configs/sets/gfx942_sets.yaml: 44cd2b32b050cafa73d0ead5703b82836edf25a057c21699046b6b8b8918b242
+src/rocprof_compute_soc/profile_configs/sets/gfx950_sets.yaml: 238d9dc8a98cfead3fc904885bfe413e5bcb4f1af31e9820cd640388bcd1e1c2
+docs/data/metrics_description.yaml: 12164b43dab4a1088f90763a80ffc8feb38aa82fd7b767edf8f65bd304f22162
diff --git a/projects/rocprofiler-compute/utils/build.sh b/projects/rocprofiler-compute/tools/build.sh
similarity index 100%
rename from projects/rocprofiler-compute/utils/build.sh
rename to projects/rocprofiler-compute/tools/build.sh
diff --git a/projects/rocprofiler-compute/tools/config_management/.config_hashes.json b/projects/rocprofiler-compute/tools/config_management/.config_hashes.json
new file mode 100644
index 0000000000..5989df1edf
--- /dev/null
+++ b/projects/rocprofiler-compute/tools/config_management/.config_hashes.json
@@ -0,0 +1,142 @@
+{
+  "archs": {
+    "gfx908": {
+      "delta_hash": "a2d9bef7e5d8b056605f9b1fa6569678",
+      "files": {
+        "0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f",
+        "0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8",
+        "0200_system_speed_of_light.yaml": "c54676a8a385c02be50fcf09a721bef6",
+        "0300_memory_chart.yaml": "f952fe7de6d86cb22f6f8ce34867905f",
+        "0400_roofline.yaml": "02ca6cf3583f2718ab371bbbfdd8cfef",
+        "0500_command_processor_cpc_cpf.yaml": "93174ba73bf014c143e179719c110db6",
+        "0600_workgroup_manager_spi.yaml": "7364c8431929891d587e6f9b96ddce10",
+        "0700_wavefront.yaml": "5cc88d7743cba8c638491d97725f6778",
+        "1000_compute_units_instruction_mix.yaml": "e96eccdcb0e5d28b292107c0f68ec845",
+        "1100_compute_units_compute_pipeline.yaml": "8f61973d0d08bf49895b5dfe32d05c09",
+        "1200_local_data_share_lds.yaml": "97be647681c51e762e774eb91e8283fc",
+        "1300_instruction_cache.yaml": "dcab979ce17e30f2d48fd2734bf08e74",
+        "1400_scalar_l1_data_cache.yaml": "e90a514e7bb597ec1d22e238650c81d9",
+        "1500_address_processing_unit_and_data_return_path_ta_td.yaml": "645eb10a440eed62c6250a0f5a2407f3",
+        "1600_vector_l1_data_cache.yaml": "e3b8d1787003094ab7b8372da818ff1e",
+        "1700_l2_cache.yaml": "38e7db4c404007c471864251dff30570",
+        "1800_l2_cache_per_channel.yaml": "7193043cd8eee47501cd8c0ae02b51e9",
+        "2100_pc_sampling.yaml": "8049866f25214544f1e53a9e2f08399b"
+      }
+    },
+    "gfx90a": {
+      "delta_hash": "55e28dda19e9ae640ba436be1a42fe97",
+      "files": {
+        "0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f",
+        "0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8",
+        "0200_system_speed_of_light.yaml": "747b14ab50dd4d7689af7c268569b32a",
+        "0300_memory_chart.yaml": "0d6d094ad24cebf6e583e643beaae06e",
+        "0400_roofline.yaml": "632b16e1d251e57de0cf7237d3a89766",
+        "0500_command_processor_cpc_cpf.yaml": "93174ba73bf014c143e179719c110db6",
+        "0600_workgroup_manager_spi.yaml": "7364c8431929891d587e6f9b96ddce10",
+        "0700_wavefront.yaml": "5cc88d7743cba8c638491d97725f6778",
+        "1000_compute_units_instruction_mix.yaml": "af6304cce1fe38c119b1d17fa635265c",
+        "1100_compute_units_compute_pipeline.yaml": "c38ece6032d757f394c83ad9f93e0dce",
+        "1200_local_data_share_lds.yaml": "97be647681c51e762e774eb91e8283fc",
+        "1300_instruction_cache.yaml": "dcab979ce17e30f2d48fd2734bf08e74",
+        "1400_scalar_l1_data_cache.yaml": "e90a514e7bb597ec1d22e238650c81d9",
+        "1500_address_processing_unit_and_data_return_path_ta_td.yaml": "8005b28532601a759ace2f653d10da56",
+        "1600_vector_l1_data_cache.yaml": "e3b8d1787003094ab7b8372da818ff1e",
+        "1700_l2_cache.yaml": "1630ae8fc504ea056e91bb19909d5629",
+        "1800_l2_cache_per_channel.yaml": "5ee4fd9c849670c301c4afee257acddd",
+        "2100_pc_sampling.yaml": "8049866f25214544f1e53a9e2f08399b"
+      }
+    },
+    "gfx940": {
+      "delta_hash": "531bb865bffcb2fc5658c2e613b341d2",
+      "files": {
+        "0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f",
+        "0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8",
+        "0200_system_speed_of_light.yaml": "74482aebb54b6d7b429c9ca605cb9951",
+        "0300_memory_chart.yaml": "e28ebc1340d2db1948c68225a6e008ff",
+        "0400_roofline.yaml": "1f3888778245e7eb05e769bda605588a",
+        "0500_command_processor_cpc_cpf.yaml": "93174ba73bf014c143e179719c110db6",
+        "0600_workgroup_manager_spi.yaml": "7364c8431929891d587e6f9b96ddce10",
+        "0700_wavefront.yaml": "5cc88d7743cba8c638491d97725f6778",
+        "1000_compute_units_instruction_mix.yaml": "ac290954de96988004b2a4be345a3a25",
+        "1100_compute_units_compute_pipeline.yaml": "470e3093ce9d53211923d3400e7e7bd7",
+        "1200_local_data_share_lds.yaml": "97be647681c51e762e774eb91e8283fc",
+        "1300_instruction_cache.yaml": "dcab979ce17e30f2d48fd2734bf08e74",
+        "1400_scalar_l1_data_cache.yaml": "e90a514e7bb597ec1d22e238650c81d9",
+        "1500_address_processing_unit_and_data_return_path_ta_td.yaml": "12fe315acb3e06d4c16e4538f418f0ca",
+        "1600_vector_l1_data_cache.yaml": "006854a23925320b94727261f30680b7",
+        "1700_l2_cache.yaml": "0987e21ac2547134fea87499dee01847",
+        "1800_l2_cache_per_channel.yaml": "ba5eeabcd749ecbb107c42de5ce69317",
+        "2100_pc_sampling.yaml": "8049866f25214544f1e53a9e2f08399b"
+      }
+    },
+    "gfx941": {
+      "delta_hash": "9b30264f36ff99f54941346a18af016a",
+      "files": {
+        "0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f",
+        "0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8",
+        "0200_system_speed_of_light.yaml": "7ed2ceba47e232b4e39431228a254f7f",
+        "0300_memory_chart.yaml": "e28ebc1340d2db1948c68225a6e008ff",
+        "0400_roofline.yaml": "a80de496435c2c76eb4cfdc38d62155f",
+        "0500_command_processor_cpc_cpf.yaml": "93174ba73bf014c143e179719c110db6",
+        "0600_workgroup_manager_spi.yaml": "7364c8431929891d587e6f9b96ddce10",
+        "0700_wavefront.yaml": "5cc88d7743cba8c638491d97725f6778",
+        "1000_compute_units_instruction_mix.yaml": "ac290954de96988004b2a4be345a3a25",
+        "1100_compute_units_compute_pipeline.yaml": "470e3093ce9d53211923d3400e7e7bd7",
+        "1200_local_data_share_lds.yaml": "97be647681c51e762e774eb91e8283fc",
+        "1300_instruction_cache.yaml": "dcab979ce17e30f2d48fd2734bf08e74",
+        "1400_scalar_l1_data_cache.yaml": "e90a514e7bb597ec1d22e238650c81d9",
+        "1500_address_processing_unit_and_data_return_path_ta_td.yaml": "12fe315acb3e06d4c16e4538f418f0ca",
+        "1600_vector_l1_data_cache.yaml": "006854a23925320b94727261f30680b7",
+        "1700_l2_cache.yaml": "05a86637744ad66f6491620c4ad659d2",
+        "1800_l2_cache_per_channel.yaml": "ba5eeabcd749ecbb107c42de5ce69317",
+        "2100_pc_sampling.yaml": "8049866f25214544f1e53a9e2f08399b"
+      }
+    },
+    "gfx942": {
+      "delta_hash": "66cf66455fafa2b6b5936d31fecf3e85",
+      "files": {
+        "0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f",
+        "0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8",
+        "0200_system_speed_of_light.yaml": "74482aebb54b6d7b429c9ca605cb9951",
+        "0300_memory_chart.yaml": "e28ebc1340d2db1948c68225a6e008ff",
+        "0400_roofline.yaml": "f94c87dad18f87e5582566276a5c0cfc",
+        "0500_command_processor_cpc_cpf.yaml": "93174ba73bf014c143e179719c110db6",
+        "0600_workgroup_manager_spi.yaml": "7364c8431929891d587e6f9b96ddce10",
+        "0700_wavefront.yaml": "5cc88d7743cba8c638491d97725f6778",
+        "1000_compute_units_instruction_mix.yaml": "ac290954de96988004b2a4be345a3a25",
+        "1100_compute_units_compute_pipeline.yaml": "470e3093ce9d53211923d3400e7e7bd7",
+        "1200_local_data_share_lds.yaml": "97be647681c51e762e774eb91e8283fc",
+        "1300_instruction_cache.yaml": "dcab979ce17e30f2d48fd2734bf08e74",
+        "1400_scalar_l1_data_cache.yaml": "e90a514e7bb597ec1d22e238650c81d9",
+        "1500_address_processing_unit_and_data_return_path_ta_td.yaml": "12fe315acb3e06d4c16e4538f418f0ca",
+        "1600_vector_l1_data_cache.yaml": "006854a23925320b94727261f30680b7",
+        "1700_l2_cache.yaml": "96e49399b26d00d88ad534a35c95304b",
+        "1800_l2_cache_per_channel.yaml": "ba5eeabcd749ecbb107c42de5ce69317",
+        "2100_pc_sampling.yaml": "8049866f25214544f1e53a9e2f08399b"
+      }
+    },
+    "gfx950": {
+      "delta_hash": null,
+      "files": {
+        "0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f",
+        "0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8",
+        "0200_system_speed_of_light.yaml": "4a215bccc9378583a6e7e7733b601537",
+        "0300_memory_chart.yaml": "f19548711a687779df0c0b87a1df7a27",
+        "0400_roofline.yaml": "156c1a1d7a6c1e55aea25552334a84d5",
+        "0500_command_processor_cpc_cpf.yaml": "5b67ff80efbc2e1dffb7e3922499ca88",
+        "0600_workgroup_manager_spi.yaml": "63a7b6f7a4487fb87d67549214e08aac",
+        "0700_wavefront.yaml": "1ecfc3a91ec0cce6ed9eb94afae17aa9",
+        "1000_compute_units_instruction_mix.yaml": "7088fafcaa66a8ec48a9d3939cd7339a",
+        "1100_compute_units_compute_pipeline.yaml": "fce707e3f419ee2708676c8f7c325df5",
+        "1200_local_data_share_lds.yaml": "06bee89ddab210dbd122eaaedef0b29a",
+        "1300_instruction_cache.yaml": "dcab979ce17e30f2d48fd2734bf08e74",
+        "1400_scalar_l1_data_cache.yaml": "e90a514e7bb597ec1d22e238650c81d9",
+        "1500_address_processing_unit_and_data_return_path_ta_td.yaml": "355a0c6b9b113fcfb686a300b78be21a",
+        "1600_vector_l1_data_cache.yaml": "68382e45c7a3c578df861d6285024803",
+        "1700_l2_cache.yaml": "f70f23b93e97b99327b5db3907eb133e",
+        "1800_l2_cache_per_channel.yaml": "7e2a1809a9b7f70a088068d6689c8aa4",
+        "2100_pc_sampling.yaml": "8049866f25214544f1e53a9e2f08399b"
+      }
+    }
+  }
+}
diff --git a/projects/rocprofiler-compute/tools/config_management/README.md b/projects/rocprofiler-compute/tools/config_management/README.md
new file mode 100644
index 0000000000..3677bdf2c6
--- /dev/null
+++ b/projects/rocprofiler-compute/tools/config_management/README.md
@@ -0,0 +1,500 @@
+# Architecture Configuration Workflow
+
+This document explains the master workflow system for managing architecture-specific metric configurations.
+
+## Overview
+
+The workflow system manages changes to architecture configurations located in `src/rocprof_compute_soc/analysis_configs/gfx<arch>/`. It handles:
+
+- **Metric changes** (additions, deletions, modifications)
+- **Metric description changes** (plain text + RST documentation)
+- **New architecture additions**
+- **Template updates**
+- **Config delta generation** for version control
+
+## Files Overview
+
+### Core Scripts
+
+1. **`master_config_workflow_script.py`** - Main orchestrator script
+2. **`hash_manager.py`** - Tracks file changes via MD5 hashes
+3. **`metric_description_manager.py`** - Syncs metric descriptions across files
+4. **`config_workflow.yaml`** - Configuration file
+5. **`parse_config_template.py`** - Parses base config template from latest arch
+6. **`generate_config_deltas.py`** - Generates config deltas between two archs
+7. **`apply_config_deltas.py`** - Applies config deltas to generate new arch configs
+8. **`verify_against_config_template.py`** - Validates configs against template
+
+## Quick Start
+
+### Initial Setup (not needed after the first commit)
+
+1. Create the hash database:
+```bash
+python hash_manager.py --compute-all src/rocprof_compute_soc/analysis_configs
+```
+
+2. Ensure `gfx9_config_template.yaml` has metadata:
+```yaml
+latest_arch: gfx950
+panels:
+  - file: top_stats.yaml
+    panel_id: 0
+    ...
+```
+
+### Making Changes
+
+Simply run the master workflow after making any changes:
+
+```bash
+python master_config_workflow_script.py
+```
+
+The script will:
+- Detect what changed
+- Prompt you for confirmation
+- Apply changes
+- Validate results
+- Update all necessary files
+
+### Dry Run Mode
+
+To see what would happen without making changes:
+
+```bash
+python master_config_workflow_script.py --dry-run
+```
+
+## Usage Scenarios
+
+### Scenario A: Add Metrics to Latest Arch (gfx950)
+
+**Method 1: Direct Edit**
+
+1. Edit `src/rocprof_compute_soc/analysis_configs/gfx950/0700_wavefront.yaml`
+2. Add your metric to the appropriate table
+3. Add description to `metrics_description` section
+4. Run: `python master_config_workflow_script.py`
+5. Answer prompts
+
+**Method 2: Using Delta**
+
+1. Create `src/rocprof_compute_soc/analysis_configs/gfx950/config_delta/gfx955_diff.yaml`:
+```yaml
+Addition:
+  - Panel Config:
+      id: 700
+      title: Wavefront
+    metric_tables:
+      - metric_table:
+          id: 701
+          title: Wavefront Launch Stats
+          metrics:
+            - New Metric:
+                avg: AVG(something)
+                unit: Units
+    metric_descriptions:
+      New Metric:
+        plain: Description text
+        rst: |- # Optional
+          Description with :ref:`RST markup <link>`
+
+Deletion:
+  []
+
+Modification:
+  []
+```
+
+2. Run: `python master_config_workflow_script.py`
+
+**What Happens:**
+- Changes applied to gfx950
+- Template updated
+- Deltas regenerated for all previous archs (gfx940, gfx941, etc.)
+- Metric descriptions synced to:
+  - `tools/per_arch_metric_definitions/gfx950_metrics_description.yaml`
+  - `docs/data/metrics_description.yaml`
+- All archs validated
+- Hashes updated
+
+### Scenario B: Modify Metrics in Older Arch (gfx940)
+
+**Method 1: Direct Edit**
+
+1. Edit `src/rocprof_compute_soc/analysis_configs/gfx940/0700_wavefront.yaml`
+2. Make your changes
+3. Run: `python master_config_workflow_script.py`
+
+**Method 2: Using Delta**
+
+1. Create `src/rocprof_compute_soc/analysis_configs/gfx940/config_delta/gfx950_diff.yaml`
+2. Run: `python master_config_workflow_script.py`
+
+**What Happens:**
+- Changes applied to gfx940 only
+- Validated against template (must still match structure)
+- Metric descriptions synced to `tools/per_arch_metric_definitions/gfx940_metrics_description.yaml`
+- Hashes updated for gfx940 only
+
+### Scenario C: Add New Architecture (gfx955)
+
+**Method 1: Create Directory with YAMLs**
+
+1. Create `src/rocprof_compute_soc/analysis_configs/gfx955/`
+2. Copy/create YAML files
+3. Run: `python master_config_workflow_script.py`
+4. Confirm this is the new latest arch
+
+**Method 2: Using Delta from Latest**
+
+1. Create delta showing differences from gfx950
+2. Place in `src/rocprof_compute_soc/analysis_configs/gfx955/config_delta/gfx955_diff.yaml`
+3. Run: `python master_config_workflow_script.py`
+4. Confirm this is the new latest arch
+
+**What Happens:**
+- gfx955 becomes new latest arch
+- Template updated with gfx955 as source
+- Deltas generated: gfx955 → gfx950, gfx955 → gfx940, etc.
+- All archs validated
+- Metric descriptions synced
+- Hashes updated
+
+### Scenario D: Update Metric Descriptions Only
+
+1. Edit description in config YAML:
+```yaml
+metrics_description:
+  Grid Size: "Updated description text"
+```
+
+2. Run: `python master_config_workflow_script.py`
+
+**What Happens:**
+- Same workflow as metric changes
+- Plain text stored in config YAMLs
+- RST version generated and stored in docs/tools files
+
+## Delta YAML Structure
+
+### Complete Example
+
+```yaml
+Addition:
+  - Panel Config:
+      id: 1100
+      title: Compute Units - Compute Pipeline
+    metric_tables:
+      - metric_table:
+          id: 1103
+          title: Arithmetic Operations
+          metrics:
+            - F8 OPs:
+                avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
+                min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
+                max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
+                unit: (OPs + $normUnit)
+    metric_descriptions:
+      F8 OPs:
+        plain: Number of 8-bit floating point operations
+        rst: |-
+          Number of 8-bit floating point operations per :ref:`normalization unit <normalization-units>`
+
+Deletion:
+  - Panel Config:
+      id: 1100
+      title: Compute Units - Compute Pipeline
+    metric_tables:
+      - metric_table:
+          id: 1103
+          title: Arithmetic Operations
+          metrics:
+            - Old Metric:
+                avg: AVG(something)
+    metric_descriptions:
+      Old Metric:
+        plain: "Old description"
+
+Modification:
+  - Panel Config:
+      id: 1100
+      title: Compute Units - Compute Pipeline
+    metric_tables:
+      - metric_table:
+          id: 1103
+          title: Arithmetic Operations
+          metrics:
+            - Existing Metric:
+                avg: AVG(new_formula)  # Changed field only
+    metric_descriptions:
+      Existing Metric:
+        plain: Updated description
+        rst: |-
+          Updated description with **RST**
+```
+
+### Rules for Deltas
+
+1. **Must have all three sections**: Addition, Deletion, Modification (can be empty lists)
+2. **Metric descriptions**:
+   - `plain` field is required
+   - `rst` field is optional (defaults to copy of plain)
+3. **Delta filename**: Must be `<target_arch>_diff.yaml`
+4. **Location**: `src/rocprof_compute_soc/analysis_configs/gfx<arch>/config_delta/`
+
+## Standalone Tool Usage
+
+### Hash Manager
+
+```bash
+# Compute hashes for all archs
+python hash_manager.py --compute-all src/rocprof_compute_soc/analysis_configs
+
+# Detect changes
+python hash_manager.py --detect-changes src/rocprof_compute_soc/analysis_configs
+
+# Update hashes for specific arch
+python hash_manager.py --update gfx950 src/rocprof_compute_soc/analysis_configs
+```
+
+### Metric Description Manager
+
+```bash
+# Sync descriptions for specific arch
+python metric_description_manager.py --sync-arch gfx950 src/rocprof_compute_soc/analysis_configs --latest-arch gfx950
+
+# Sync all archs
+python metric_description_manager.py --sync-all src/rocprof_compute_soc/analysis_configs --latest-arch gfx950
+
+# Validate descriptions
+python metric_description_manager.py --validate gfx950 src/rocprof_compute_soc/analysis_configs
+```
+
+### Parse Config Template
+
+```bash
+# Generate template with metadata
+python parse_config_template.py src/rocprof_compute_soc/analysis_configs/gfx950 \
+    tools/config_management/analysis_config_template.yaml \
+    --latest-arch gfx950
+```
+
+### Generate Delta
+
+```bash
+# Generate delta from current arch to previous arch
+python generate_config_deltas.py \
+    src/rocprof_compute_soc/analysis_configs/gfx950 \
+    src/rocprof_compute_soc/analysis_configs/gfx940
+```
+
+### Apply Delta
+
+```bash
+# Apply delta to base arch
+python apply_config_deltas.py \
+    src/rocprof_compute_soc/analysis_configs/gfx940 \
+    src/rocprof_compute_soc/analysis_configs/gfx940/config_delta/gfx950_diff.yaml \
+    output_dir
+```
+
+### Verify Against Template
+
+```bash
+# Validate all archs
+python verify_against_config_template.py \
+    src/rocprof_compute_soc/analysis_configs \
+    tools/config_management/analysis_config_template.yaml
+```
+
+## File Structure
+
+```
+.
+├── src/rocprof_compute_soc/analysis_configs/
+│   ├── gfx940/
+│   │   ├── 0700_wavefront.yaml           # Config with plain descriptions
+│   │   └── config_delta/
+│   │       └── gfx950_diff.yaml          # Delta to apply changes
+│   ├── gfx941/
+│   └── gfx950/                           # Latest arch
+│       ├── 0700_wavefront.yaml
+│       └── config_delta/
+│           └── gfx950_diff.yaml          # Optional delta for modifications
+│
+├── tools/
+│   ├── config_management/
+│   │   ├── .config_hashes.json           # Hash database (auto-generated)
+│   │   ├── analysis_config_template.yaml # Template with metadata
+│   │   ├── hash_manager.py
+│   │   ├── metric_description_manager.py
+│   │   ├── parse_config_template.py
+│   │   ├── generate_config_deltas.py
+│   │   ├── apply_config_deltas.py
+│   │   ├── verify_against_config_template.py
+│   │   ├── master_config_workflow_script.py
+│   │   └── config_workflow.yaml
+│   │
+│   └── per_arch_metric_definitions/
+│       ├── gfx940_metrics_description.yaml  # RST only
+│       ├── gfx941_metrics_description.yaml
+│       └── gfx950_metrics_description.yaml
+│
+├── docs/data/
+│   └── metrics_description.yaml          # RST only, latest arch only
+│
+└── .backups/                             # Auto-generated backups
+    └── 20250115_143022/                  # Timestamped backup
+```
+
+## Configuration
+
+Edit `config_workflow.yaml` to customize paths and behavior:
+
+```yaml
+paths:
+  template: tools/config_management/analysis_config_template.yaml
+  configs_root: src/rocprof_compute_soc/analysis_configs
+  backups: .backups
+  hashes: tools/config_management/.config_hashes.json
+  per_arch_metrics: tools/per_arch_metric_definitions
+  docs_metrics: docs/data/metrics_description.yaml
+
+validation:
+  strict_mode: true              # Fail on warnings
+  verify_after_changes: true     # Validate after operations
+
+behavior:
+  require_confirmation: true     # Prompt before changes
+```
+
+## Error Handling
+
+### Validation Failures
+
+If validation fails:
+1. All changes are automatically reverted
+2. Backup is restored
+3. Detailed error report is printed
+4. Fix the issue and run again
+
+### Hash Mismatches
+
+If hashes are out of sync:
+```bash
+# Recompute all hashes
+python hash_manager.py --compute-all src/rocprof_compute_soc/analysis_configs
+```
+
+### Description Validation Errors
+
+Common issues:
+- **Missing descriptions**: Warning only (won't fail)
+- **Invalid RST syntax**: Error (will fail and revert)
+- **Missing plain text**: Error (plain is required)
+
+## Best Practices
+
+1. **Always use master_config_workflow_script.py** - Don't run individual scripts manually unless debugging
+2. **Test with --dry-run first** - See what will happen before committing
+3. **Use deltas for complex changes** - Easier to review and version control
+4. **Keep descriptions updated** - Plain text in configs, RST in docs
+5. **One change at a time** - If multiple archs need updates, do them sequentially
+6. **Check validation output** - Review warnings even if they don't fail
+
+## Troubleshooting
+
+### "No changes detected"
+
+- Check that files were actually modified
+- Ensure you're in the correct directory
+- Verify hash database exists: `tools/config_management/.config_hashes.json`
+
+### "Validation failed"
+
+- Review the error output carefully
+- Check that new metrics match template structure
+- Ensure panel IDs are correct
+- Verify data source ordering
+
+### "Failed to sync metric descriptions"
+
+- Check RST syntax in descriptions
+- Ensure all metrics have descriptions
+- Verify section_panel_map includes your table ID
+
+### Changes not detected after manual edit
+
+```bash
+# Force recompute hashes
+python hash_manager.py --compute-all src/rocprof_compute_soc/analysis_configs
+
+# Then run workflow
+python master_config_workflow_script.py
+```
+
+## Development Notes
+
+### Adding New Architecture Support
+
+When adding a completely new architecture line:
+
+1. Ensure table IDs are in `metric_description_manager.py`'s `SECTION_PANEL_MAP`
+2. Follow existing naming conventions (gfxXXX)
+3. Create complete YAML set (don't start with partial configs)
+
+### Modifying the Workflow
+
+If you need to modify the workflow behavior:
+
+1. Edit `config_workflow.yaml` for path/behavior changes
+2. Edit `master_config_workflow_script.py` for workflow logic changes
+3. Test with `--dry-run` extensively
+4. Update this README
+
+
+# Pre-commit: Hash Consistency Check
+
+We ship a lightweight pre-commit hook that catches inconsistent hash updates across config YAMLs and deltas.
+
+## What it enforces (per arch)
+
+* Latest panels changed → latest delta must change (if there are older archs).
+* Latest delta changed → latest panels must change or a new arch must be added.
+* Older arch panels changed → that arch’s delta must change.
+* Older arch delta changed → either latest panels or that arch’s panels must have changed.
+
+## Setup
+
+Install and enable pre-commit:
+
+```bash
+pip install pre-commit
+pre-commit install
+```
+
+Our .pre-commit-config.yaml includes a local hook that runs the checker.
+
+```yaml
+- repo: local
+  hooks:
+    - id: hash-check
+      name: Hash consistency check
+      entry: bash -lc 'cd projects/rocprofiler-compute && python3 tools/config_management/hash_checker.py'
+      language: system
+      pass_filenames: false
+      stages: [pre-commit]
+```
+
+## Run manually
+
+```bash
+# from super-repo root
+pre-commit run --all-files
+
+# or directly in the subproject
+cd projects/rocprofiler-compute
+python3 tools/config_management/hash_checker.py
+```
diff --git a/projects/rocprofiler-compute/tools/config_management/__init__.py b/projects/rocprofiler-compute/tools/config_management/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/projects/rocprofiler-compute/tools/config_management/apply_config_deltas.py b/projects/rocprofiler-compute/tools/config_management/apply_config_deltas.py
new file mode 100644
index 0000000000..f96fba1650
--- /dev/null
+++ b/projects/rocprofiler-compute/tools/config_management/apply_config_deltas.py
@@ -0,0 +1,258 @@
+#!/usr/bin/env python3
+##############################################################################
+# MIT License
+#
+# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+##############################################################################
+
+"""
+Apply delta YAML to base architecture to produce target architecture.
+Usage: python apply_config_deltas.py <base_arch_dir> <delta_yaml> <output_dir>
+"""
+
+from __future__ import annotations
+
+import shutil
+import sys
+from pathlib import Path
+from typing import Any, Optional, Union
+
+try:
+    from . import utils as cm_utils
+except Exception:
+    repo_root = Path(__file__).resolve().parents[1]
+    if str(repo_root) not in sys.path:
+        sys.path.insert(0, str(repo_root))
+    try:
+        import config_management.utils as cm_utils  # type: ignore
+    except Exception:
+        import utils as cm_utils  # type: ignore
+
+AUTOGEN_TEXT = (
+    "# AUTOGENERATED FILE. Only edit for testing purposes, not for development. "
+    "Generated by tools/config_management/apply_config_deltas.py\n"
+)
+
+
+def find_table_in_config(config: dict, table_id: Any) -> Optional[dict]:
+    """Return the first metric table whose ``id`` equals table_id, else None."""
+    entries = config.get("Panel Config", {}).get("data source", [])
+    tables = (entry.get("metric_table") for entry in entries)
+    return next(
+        (t for t in tables if isinstance(t, dict) and t.get("id") == table_id), None
+    )
+
+
+def add_table(config: dict, metric_table: dict) -> None:
+    """Append metric_table to the panel's data sources as a new table entry."""
+    panel = config.setdefault("Panel Config", {})
+    sources = panel.setdefault("data source", [])
+    sources.append({"metric_table": metric_table})
+    print(f"Added table: {metric_table.get('id')} - {metric_table.get('title')}")
+
+
+def add_metrics(config: dict, table_id: Any, metrics: list[dict]) -> None:
+    """Insert or overwrite metrics in the table identified by table_id."""
+    target = find_table_in_config(config, table_id)
+    if not target:
+        print(f"WARNING: Table {table_id} not found for metric addition")
+        return
+
+    table_metrics = target.setdefault("metric", {})
+    for entry in metrics:
+        for name, fields in entry.items():
+            table_metrics[name] = fields
+            print(f"Added metric: {name} to table {table_id}")
+
+
+def delete_table(config: dict, table_id: Any) -> None:
+    """Drop the first data-source entry whose metric table has id table_id."""
+    sources = config.get("Panel Config", {}).get("data source", [])
+    for position, entry in enumerate(sources):
+        found = entry.get("metric_table")
+        if isinstance(found, dict) and found.get("id") == table_id:
+            del sources[position]
+            print(f"Deleted table: {table_id}")
+            return
+    print(f"WARNING: Table {table_id} not found for deletion")
+
+
+def delete_metrics(config: dict, table_id: Any, metrics: list[dict]) -> None:
+    """Drop the named metrics from the table identified by table_id."""
+    target = find_table_in_config(config, table_id)
+    if not target or "metric" not in target:
+        print(f"WARNING: Table {table_id} not found or has no metrics")
+        return
+
+    table_metrics = target["metric"]
+    for name in (key for entry in metrics for key in entry):
+        if name in table_metrics:
+            del table_metrics[name]
+            print(f"Deleted metric: {name} from table {table_id}")
+
+
+def modify_metrics(config: dict, table_id: Any, metrics: list[dict]) -> None:
+    """Overwrite individual fields of metrics that already exist in a table."""
+    target = find_table_in_config(config, table_id)
+    if not target or "metric" not in target:
+        print(f"WARNING: Table {table_id} not found or has no metrics")
+        return
+
+    for entry in metrics:
+        for name, changed_fields in entry.items():
+            if name in target["metric"]:
+                for field, value in changed_fields.items():
+                    target["metric"][name][field] = value
+                    print(f"Modified {name}.{field} in table {table_id}")
+            else:
+                print(f"WARNING: Metric '{name}' not found in table {table_id}")
+
+
+def add_descriptions(config: dict, descriptions: dict) -> None:
+    """Store each metric description under the panel's metrics_description map."""
+    panel = config.setdefault("Panel Config", {})
+    existing = panel.setdefault("metrics_description", {})
+
+    # Entries are stored verbatim, whether they are plain strings or
+    # mappings (for example with ``plain``/``rst`` keys).
+    for name, description in descriptions.items():
+        existing[name] = description
+        print(f"Added description: {name}")
+
+
+def delete_descriptions(config: dict, descriptions: dict) -> None:
+    """Drop the named metric descriptions from the panel, ignoring unknown names."""
+    existing = config.get("Panel Config", {}).get("metrics_description", {})
+    present = [name for name in descriptions if name in existing]
+    for name in present:
+        del existing[name]
+        print(f"Deleted description: {name}")
+
+
+def modify_descriptions(config: dict, descriptions: dict) -> None:
+    """Replace metric descriptions in the panel's metrics_description map.
+
+    Each entry in descriptions overwrites (or creates) the stored entry of
+    the same name; values are stored verbatim.
+    """
+    md = config.setdefault("Panel Config", {}).setdefault("metrics_description", {})
+    for metric_name, desc_data in descriptions.items():
+        md[metric_name] = desc_data
+        print(f"Modified description: {metric_name}")
+
+
+def apply_changes(config: dict, changes: list[dict], category: str) -> None:
+    """Apply every change of one delta category (tables, then descriptions)."""
+    for change in changes:
+        for wrapper in change.get("metric_tables", []):
+            # Entries may be wrapped as {"metric_table": {...}} or given bare.
+            table = wrapper.get("metric_table", wrapper)
+            table_id = table.get("id")
+            has_metric_list = "metrics" in table
+
+            if category == "Addition":
+                if has_metric_list:
+                    add_metrics(config, table_id, table["metrics"])
+                elif "metric" in table:
+                    add_table(config, table)
+            elif category == "Deletion":
+                if has_metric_list:
+                    delete_metrics(config, table_id, table["metrics"])
+                else:
+                    delete_table(config, table_id)
+            elif category == "Modification" and has_metric_list:
+                modify_metrics(config, table_id, table["metrics"])
+
+        descriptions = change.get("metric_descriptions", {})
+        if not descriptions:
+            continue
+        if category == "Addition":
+            add_descriptions(config, descriptions)
+        elif category == "Deletion":
+            delete_descriptions(config, descriptions)
+        elif category == "Modification":
+            modify_descriptions(config, descriptions)
+
+
+def apply_delta(
+    base_dir: Union[str, Path],
+    delta_file: Union[str, Path],
+    output_dir: Union[str, Path],
+) -> None:
+    """Apply a delta YAML to each panel YAML in base_dir, writing to output_dir.
+
+    Files whose ``Panel Config`` id appears in the delta are rewritten with the
+    changes applied; every other ``*.yaml`` file is copied unchanged.
+    """
+    # A delta that loads as None (e.g. an empty file) or a null section = no changes.
+    delta = cm_utils.load_yaml(delta_file) or {}
+    output_path = Path(output_dir)
+    output_path.mkdir(parents=True, exist_ok=True)
+
+    changes_by_panel: dict[Any, dict[str, list[dict]]] = {}
+    for category in ("Addition", "Deletion", "Modification"):
+        for change in delta.get(category) or []:
+            panel_id = change.get("Panel Config", {}).get("id")
+            bucket = changes_by_panel.setdefault(panel_id, {})
+            bucket.setdefault(category, []).append(change)
+
+    # Sorted so that processing order and log output are reproducible.
+    for yaml_file in sorted(Path(base_dir).glob("*.yaml")):
+        config = cm_utils.load_yaml(yaml_file)
+        panel_id = config.get("Panel Config", {}).get("id")
+        if panel_id not in changes_by_panel:
+            shutil.copy(yaml_file, output_path / yaml_file.name)
+            continue
+
+        print(f"\nApplying deltas to {yaml_file.name} (Panel ID: {panel_id})")
+        # Process in safe order: deletions -> modifications -> additions
+        for category in ("Deletion", "Modification", "Addition"):
+            if changes_by_panel[panel_id].get(category):
+                apply_changes(config, changes_by_panel[panel_id][category], category)
+        cm_utils.save_yaml(config, output_path / yaml_file.name, AUTOGEN_TEXT)
+        print(f"Saved: {yaml_file.name}")
+
+
+def main() -> None:
+    """Validate the command-line arguments, then apply the delta."""
+    if len(sys.argv) != 4:
+        print(
+            "Usage: python apply_config_deltas.py "
+            "<base_arch_dir> <delta_yaml> <output_dir>"
+        )
+        sys.exit(1)
+
+    base_dir, delta_file, output_dir = sys.argv[1:4]
+    problem = None
+    if not Path(base_dir).is_dir():
+        problem = f"Error: {base_dir} is not a directory"
+    elif not Path(delta_file).is_file():
+        problem = f"Error: {delta_file} is not a file"
+    if problem is not None:
+        print(problem)
+        sys.exit(1)
+    apply_delta(base_dir, delta_file, output_dir)
+    print("\nDelta application complete!")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/projects/rocprofiler-compute/tools/config_management/config_workflow.yaml b/projects/rocprofiler-compute/tools/config_management/config_workflow.yaml
new file mode 100644
index 0000000000..b6649e540d
--- /dev/null
+++ b/projects/rocprofiler-compute/tools/config_management/config_workflow.yaml
@@ -0,0 +1,31 @@
+# Configuration file for master workflow script
+
+paths:
+  # Path to analysis config template
+  template: tools/config_management/analysis_config_template.yaml
+
+  # Root directory containing architecture configs
+  configs_root: src/rocprof_compute_soc/analysis_configs
+
+  # Backup directory
+  backups: .backups
+
+  # Hash database file
+  hashes: tools/config_management/.config_hashes.json
+
+  # Per-arch metric definitions output
+  per_arch_metrics: tools/per_arch_metric_definitions
+
+  # Docs metrics description file
+  docs_metrics: docs/data/metrics_description.yaml
+
+validation:
+  # Fail on warnings (strict mode)
+  strict_mode: true
+
+  # Verify all archs after changes
+  verify_after_changes: true
+
+behavior:
+  # Require confirmation before destructive operations
+  require_confirmation: true
diff --git a/projects/rocprofiler-compute/tools/config_management/delta_template.yaml b/projects/rocprofiler-compute/tools/config_management/delta_template.yaml
new file mode 100644
index 0000000000..e4ebfb5015
--- /dev/null
+++ b/projects/rocprofiler-compute/tools/config_management/delta_template.yaml
@@ -0,0 +1,164 @@
+# Delta YAML Template
+# Use this template when creating delta files for architecture config changes
+#
+# Filename: <target_arch>_diff.yaml (e.g., gfx950_diff.yaml)
+# Location: src/rocprof_compute_soc/analysis_configs/gfx<arch>/config_delta/
+#
+# All three sections (Addition, Deletion, Modification) are REQUIRED
+# Use empty list [] if no changes in a section
+
+Addition:
+  # Add new metrics, tables, or descriptions
+  # Example: Adding a new metric to existing table
+  - Panel Config:
+      id: 700  # Panel ID (matches config YAML)
+      title: Wavefront  # Panel title
+    metric_tables:
+      - metric_table:
+          id: 701  # Table ID
+          title: Wavefront Launch Stats  # Table title
+          metrics:
+            - New Metric Name:  # Metric name
+                avg: AVG(formula)  # All metric fields
+                min: MIN(formula)
+                max: MAX(formula)
+                unit: Units
+    metric_descriptions:
+      New Metric Name:
+        plain: "Plain text description (REQUIRED)"
+        rst: "RST description with :ref:`markup <link>` (optional, defaults to plain)"
+
+  # Example: Adding entire new table
+  - Panel Config:
+      id: 1100
+      title: Compute Units - Compute Pipeline
+    metric_tables:
+      - metric_table:
+          id: 1104  # New table ID
+          title: New Table Name
+          metric:  # Use 'metric' (not 'metrics') for full table
+            Metric One:
+              avg: AVG(formula)
+              unit: Units
+            Metric Two:
+              avg: AVG(formula)
+              unit: Units
+    metric_descriptions:
+      Metric One:
+        plain: "Description for metric one"
+      Metric Two:
+        plain: "Description for metric two"
+
+Deletion:
+  # Remove metrics, tables, or descriptions
+  # Example: Deleting specific metrics from table
+  - Panel Config:
+      id: 700
+      title: Wavefront
+    metric_tables:
+      - metric_table:
+          id: 701
+          title: Wavefront Launch Stats
+          metrics:
+            - Old Metric:  # Just the metric name and any field
+                avg: AVG(old_formula)  # Fields don't matter for deletion
+    metric_descriptions:
+      Old Metric:
+        plain: "Old description"  # Description also deleted with metric
+
+  # Example: Deleting entire table
+  - Panel Config:
+      id: 1100
+      title: Compute Units - Compute Pipeline
+    metric_tables:
+      - metric_table:
+          id: 1104  # Table to delete
+          title: Table Name
+          metric:  # Use 'metric' for full table deletion
+            Metric One:
+              avg: AVG(formula)
+
+Modification:
+  # Modify existing metrics or descriptions
+  # Only include fields that are changing
+  - Panel Config:
+      id: 700
+      title: Wavefront
+    metric_tables:
+      - metric_table:
+          id: 701
+          title: Wavefront Launch Stats
+          metrics:
+            - Existing Metric:
+                avg: AVG(new_formula)  # Only changed fields
+                # Other fields (min, max, unit) remain unchanged
+    metric_descriptions:
+      Existing Metric:
+        plain: "Updated plain text description"
+        rst: "Updated RST with **formatting**"
+
+  # Example: Updating only description, not metric
+  - Panel Config:
+      id: 1100
+      title: Compute Units - Compute Pipeline
+    metric_descriptions:
+      Some Metric:
+        plain: "New description text"
+        # rst will default to copy of plain if not specified
+
+# ============================================================================
+# IMPORTANT NOTES
+# ============================================================================
+#
+# 1. Panel Config:
+#    - id: Panel ID from the config YAML (e.g., 700)
+#    - title: Panel title (must match)
+#
+# 2. Metric Tables:
+#    - Use "metrics:" (list) when adding/deleting/modifying specific metrics
+#    - Use "metric:" (dict) when adding/deleting entire table
+#    - id: Table ID within panel (e.g., 701)
+#
+# 3. Metric Descriptions:
+#    - plain: Required - stored in config YAMLs
+#    - rst: Optional - if omitted, RST will be same as plain
+#    - Used for both docs and per-arch description files
+#
+# 4. Units:
+#    - Units are part of metric definition, NOT descriptions
+#    - Change units by modifying metric fields
+#    - Units auto-sync to description files
+#
+# 5. Multi-line strings:
+#    - Use | for literal multi-line
+#    - Use > for folded multi-line
+#
+# Example multi-line:
+#   description:
+#     plain: |
+#       This is line one.
+#       This is line two.
+#       This is line three.
+#     rst: |
+#       This is line one with :ref:`reference <link>`.
+#       This is line two.
+#
+# 6. Delta Naming:
+#    - Must be named: <target_arch>_diff.yaml
+#    - For gfx950 changes: gfx950_diff.yaml
+#    - Located in: gfx<arch>/config_delta/
+#
+# 7. RST Syntax:
+#    - :ref:`text <link>` - Cross-reference
+#    - :doc:`text <document>` - Document link
+#    - **bold** - Bold text
+#    - *italic* - Italic text
+#    - ``code`` - Inline code
+#    - `external link <url>`_ - External link
+#
+# 8. Common Mistakes:
+#    - Forgetting to include all three sections
+#    - Using 'metric' instead of 'metrics' or vice versa
+#    - Forgetting plain description (it's required!)
+#    - Wrong panel/table IDs
+#    - Inconsistent indentation (use 2 spaces)
diff --git a/projects/rocprofiler-compute/tools/config_management/generate_config_deltas.py b/projects/rocprofiler-compute/tools/config_management/generate_config_deltas.py
new file mode 100644
index 0000000000..7fcaccc044
--- /dev/null
+++ b/projects/rocprofiler-compute/tools/config_management/generate_config_deltas.py
@@ -0,0 +1,360 @@
+#!/usr/bin/env python3
+##############################################################################
+# MIT License
+#
+# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+##############################################################################
+
+"""
+Analysis Config Differentiation Script
+Diffs the YAML files common to the curr and prev arch directories and writes
+what must change in prev arch to match curr arch to <prev>/config_delta/.
+"""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+try:
+    from . import utils as cm_utils
+except Exception:
+    repo_root = Path(__file__).resolve().parents[1]
+    if str(repo_root) not in sys.path:
+        sys.path.insert(0, str(repo_root))
+    try:
+        import config_management.utils as cm_utils  # type: ignore
+    except Exception:
+        import utils as cm_utils  # type: ignore
+
+AUTOGEN_TEXT = (  # banner line main() prepends to the generated delta YAML
+    "# AUTOGENERATED FILE. Only edit for testing purposes, not for development. "
+    "Generated by tools/config_management/generate_config_deltas.py\n"
+)
+
+
+def get_metric_tables(data: dict) -> list[dict]:
+    """Return each dict ``metric_table`` under "data source"; nulls are skipped."""
+    sources = (data.get("Panel Config") or {}).get("data source") or []
+    return [
+        src["metric_table"]
+        for src in sources
+        if isinstance(src, dict) and isinstance(src.get("metric_table"), dict)
+    ]
+
+
+def get_metric_descriptions(data: dict) -> dict:
+    """Return Panel Config's ``metrics_description`` dict, or {} if missing/null."""
+    return (data.get("Panel Config") or {}).get("metrics_description") or {}
+
+
+def compare_metrics(
+    prev_metrics: dict, curr_metrics: dict
+) -> tuple[list[dict], list[dict], list[dict]]:
+    """Diff two ``{metric name: fields}`` maps.
+
+    Returns ``(additions, deletions, modifications)``: lists of single-key
+    dicts ordered by metric name. A modification keeps only the fields whose
+    value differs, mapped to the current value (``None`` if the field is gone).
+    """
+    prev_keys, curr_keys = set(prev_metrics), set(curr_metrics)
+    additions = [{name: curr_metrics[name]} for name in sorted(curr_keys - prev_keys)]
+    deletions = [{name: prev_metrics[name]} for name in sorted(prev_keys - curr_keys)]
+    modifications: list[dict] = []
+    for name in sorted(prev_keys & curr_keys):
+        old, new = prev_metrics[name], curr_metrics[name]
+        if old == new:
+            continue
+        # Sorted: iteration order of a set of str keys differs between runs.
+        fields = sorted(set(old) | set(new))
+        changed = {f: new.get(f) for f in fields if old.get(f) != new.get(f)}
+        if changed:
+            modifications.append({name: changed})
+    return additions, deletions, modifications
+
+
+def compare_descriptions(
+    prev_descriptions: dict, curr_descriptions: dict
+) -> tuple[dict, dict, dict]:
+    """
+    Diff two ``{metric name: description}`` maps.
+
+    A description is either a bare value (normally a string) or a dict with
+    'plain' and 'rst'. Returns (additions, deletions, modifications), each a
+    dict keyed by metric name and filled in sorted name order:
+      - additions: names only in curr (bare values wrapped as plain/rst)
+      - deletions: names only in prev (bare values wrapped as plain/rst)
+      - modifications: names in both whose plain or rst text differs, as
+        {'plain': ..., 'rst': ...} taken from curr
+    """
+
+    def as_entry(desc):
+        """Pass a dict through; wrap anything else as a plain/rst pair."""
+        if isinstance(desc, dict):
+            return desc
+        return {"plain": desc, "rst": desc}
+
+    def texts(desc):
+        """Return (plain, rst); a str is both, a missing rst equals plain."""
+        if isinstance(desc, str):
+            return desc, desc
+        plain = desc.get("plain", "")
+        return plain, desc.get("rst", plain)
+
+    old_names = set(prev_descriptions)
+    new_names = set(curr_descriptions)
+
+    # Names found on one side only.
+    additions: dict = {
+        name: as_entry(curr_descriptions[name])
+        for name in sorted(new_names - old_names)
+    }
+    deletions: dict = {
+        name: as_entry(prev_descriptions[name])
+        for name in sorted(old_names - new_names)
+    }
+
+    # Names on both sides: compare the normalized (plain, rst) pairs and
+    # record the current pair when they differ.
+    modifications: dict = {}
+    for name in sorted(old_names & new_names):
+        before = texts(prev_descriptions[name])
+        after = texts(curr_descriptions[name])
+        if before != after:
+            plain, rst = after
+            modifications[name] = {"plain": plain, "rst": rst}
+
+    return additions, deletions, modifications
+
+
+def compare_tables(
+    prev_tables: list[dict], curr_tables: list[dict]
+) -> tuple[list[dict], list[dict], list[dict]]:
+    """Diff two lists of metric tables, matching tables by their ``id``.
+
+    Returns ``(additions, deletions, modifications)``. A table whose id is on
+    one side only is listed whole (curr's under additions, prev's under
+    deletions), in id order. For ids on both sides the tables' ``metric``
+    maps are diffed with compare_metrics and each non-empty result is
+    appended as ``{"id", "title", "metrics"}``; deletions carry prev's title,
+    additions and modifications carry curr's.
+    """
+    old_by_id = {table["id"]: table for table in prev_tables}
+    new_by_id = {table["id"]: table for table in curr_tables}
+    old_ids = set(old_by_id)
+    new_ids = set(new_by_id)
+
+    # Whole tables go in first; the per-metric entries for shared ids are
+    # appended after them by the loop below.
+    additions: list[dict] = [new_by_id[tid] for tid in sorted(new_ids - old_ids)]
+    deletions: list[dict] = [old_by_id[tid] for tid in sorted(old_ids - new_ids)]
+    modifications: list[dict] = []
+
+    for tid in sorted(old_ids & new_ids):
+        old_table, new_table = old_by_id[tid], new_by_id[tid]
+        metric_adds, metric_dels, metric_mods = compare_metrics(
+            old_table.get("metric", {}) or {},
+            new_table.get("metric", {}) or {},
+        )
+
+        # (output list, changed metrics, table that supplies the title)
+        for bucket, changed, titled in (
+            (additions, metric_adds, new_table),
+            (deletions, metric_dels, old_table),
+            (modifications, metric_mods, new_table),
+        ):
+            if not changed:
+                continue
+            bucket.append({
+                "id": tid,
+                "title": titled.get("title"),
+                "metrics": changed,
+            })
+
+    return additions, deletions, modifications
+
+
+def format_metric_fields(metric_data: dict) -> list[str]:
+    """Render a metric's fields as YAML lines indented for a metric entry.
+
+    Strings with a newline or over 80 chars become ``|`` literal blocks.
+    """
+    rendered: list[str] = []
+    for key, value in metric_data.items():
+        is_block = isinstance(value, str) and ("\n" in value or len(value) > 80)
+        if not is_block:
+            rendered.append(f"                {key}: {value}")
+            continue
+        rendered.append(f"                {key}: |")
+        rendered += [f"                  {row}" for row in value.split("\n")]
+    return rendered
+
+
+def format_description_fields(desc_data: dict) -> list[str]:
+    """Render description fields as YAML lines; long/multi-line text uses ``|``."""
+    rendered: list[str] = []
+    for key, text in desc_data.items():
+        if isinstance(text, str) and (len(text) > 80 or "\n" in text):
+            # Literal block: header line, then each source line indented.
+            block = [f"            {row}" for row in text.split("\n")]
+            rendered.append(f"          {key}: |")
+            rendered.extend(block)
+        else:
+            rendered.append(f"          {key}: {text}")
+    return rendered
+
+
+def format_output(combined_diff: dict) -> str:
+    """Serialize the combined diff as YAML text.
+
+    A missing or empty section (Addition/Deletion/Modification) becomes ``[]``.
+    """
+    out: list[str] = []
+    for section in ("Addition", "Deletion", "Modification"):
+        out.append(f"{section}:")
+        panels = combined_diff.get(section)
+        if not panels:
+            out += ["  []", ""]
+            continue
+        for panel in panels:
+            header = panel["panel_config"]
+            out.append("  - Panel Config:")
+            out.append(f"      id: {header['id']}")
+            out.append(f"      title: {header['title']}")
+            tables = panel.get("metric_tables")
+            if tables:
+                out.append("    metric_tables:")
+            for table in tables or ():
+                out.append("      - metric_table:")
+                out.append(f"          id: {table['id']}")
+                out.append(f"          title: {table['title']}")
+                out.append("          metrics:")
+                # A "metrics" list wins; else unroll the "metric" mapping.
+                entries = table.get("metrics") or [
+                    {key: val} for key, val in (table.get("metric") or {}).items()
+                ]
+                for entry in entries:
+                    for name, fields in entry.items():
+                        out.append(f"            - {name}:")
+                        out.extend(format_metric_fields(fields))
+
+            descriptions = panel.get("metric_descriptions")
+            if descriptions:
+                out.append("    metric_descriptions:")
+            for name, desc in (descriptions or {}).items():
+                out.append(f"      {name}:")
+                out.extend(format_description_fields(desc))
+
+        out.append("")
+    return "\n".join(out)
+
+
+def main() -> None:
+    """Diff two arch config directories and write the delta YAML.
+
+    Usage: generate_config_deltas.py <curr_arch_dir> <prev_arch_dir>
+
+    Compares the ``*.yaml`` files whose names exist in both directories,
+    prints the combined diff and writes it to
+    ``<prev_arch_dir>/config_delta/<curr_arch_dir name>_diff.yaml``.
+    Exits with status 1 on bad arguments or when no file names are shared.
+    """
+    if len(sys.argv) != 3:
+        print("Usage: python generate_config_deltas.py <curr_arch_dir> <prev_arch_dir>")
+        sys.exit(1)
+
+    curr_arch_dir = Path(sys.argv[1])
+    prev_arch_dir = Path(sys.argv[2])
+
+    if not curr_arch_dir.is_dir() or not prev_arch_dir.is_dir():
+        print("Error: Both arguments must be directories")
+        sys.exit(1)
+
+    # Only file names present in both directories are compared; a panel file
+    # that exists on one side only is not reported.
+    curr_files = {f.name for f in curr_arch_dir.glob("*.yaml")}
+    prev_files = {f.name for f in prev_arch_dir.glob("*.yaml")}
+    common_files = curr_files & prev_files
+
+    if not common_files:
+        print("Error: No common YAML files found")
+        sys.exit(1)
+
+    print(f"Comparing {len(common_files)} files...")
+
+    combined_diff = {"Addition": [], "Deletion": [], "Modification": []}
+
+    for filename in sorted(common_files):
+        curr_data = cm_utils.load_yaml(curr_arch_dir / filename) or {}
+        prev_data = cm_utils.load_yaml(prev_arch_dir / filename) or {}
+
+        curr_pc = curr_data.get("Panel Config", {}) or {}
+        prev_pc = prev_data.get("Panel Config", {}) or {}
+
+        curr_tables = get_metric_tables(curr_data)
+        prev_tables = get_metric_tables(prev_data)
+
+        curr_descriptions = get_metric_descriptions(curr_data)
+        prev_descriptions = get_metric_descriptions(prev_data)
+
+        table_adds, table_dels, table_mods = compare_tables(prev_tables, curr_tables)
+        desc_adds, desc_dels, desc_mods = compare_descriptions(
+            prev_descriptions, curr_descriptions
+        )
+
+        # Deletions are labelled with the previous arch's panel id/title;
+        # additions and modifications with the current arch's.
+        for category, panel, tables, descriptions in (
+            ("Addition", curr_pc, table_adds, desc_adds),
+            ("Deletion", prev_pc, table_dels, desc_dels),
+            ("Modification", curr_pc, table_mods, desc_mods),
+        ):
+            if not (tables or descriptions):
+                continue
+            entry = {
+                "panel_config": {"id": panel.get("id"), "title": panel.get("title")}
+            }
+            if tables:
+                entry["metric_tables"] = tables
+            if descriptions:
+                entry["metric_descriptions"] = descriptions
+            combined_diff[category].append(entry)
+
+    output = AUTOGEN_TEXT + format_output(combined_diff)
+
+    print("\n" + "=" * 80)
+    print("COMBINED DIFF OUTPUT:")
+    print("=" * 80)
+    print(output)
+
+    output_dir = prev_arch_dir / "config_delta"
+    output_dir.mkdir(exist_ok=True)
+    output_file = output_dir / f"{curr_arch_dir.name}_diff.yaml"
+    # Explicit UTF-8: the locale default encoding could fail on, or encode
+    # differently, any non-ASCII description text.
+    with open(output_file, "w", encoding="utf-8") as f:
+        f.write(output)
+
+    print(f"\nDiff written to: {output_file}")
+
+if __name__ == "__main__":
+    main()
diff --git a/projects/rocprofiler-compute/tools/config_management/gfx9_config_template.yaml b/projects/rocprofiler-compute/tools/config_management/gfx9_config_template.yaml
new file mode 100644
index 0000000000..877cecbc27
--- /dev/null
+++ b/projects/rocprofiler-compute/tools/config_management/gfx9_config_template.yaml
@@ -0,0 +1,260 @@
+latest_arch: gfx950
+panels:
+- file: top_stats.yaml
+  panel_id: 0
+  panel_title: Top Stats
+  panel_alias: topstats
+  data_sources:
+  - type: raw_csv_table
+    id: 1
+    title: Top Kernels
+  - type: raw_csv_table
+    id: 2
+    title: Dispatch List
+- file: system_info.yaml
+  panel_id: 1
+  panel_title: System Info
+  panel_alias: sysinfo
+  data_sources:
+  - type: raw_csv_table
+    id: 1
+    title: System Info
+- file: system_speed_of_light.yaml
+  panel_id: 2
+  panel_title: System Speed-of-Light
+  panel_alias: sol
+  data_sources:
+  - type: metric_table
+    id: 1
+    title: System Speed-of-Light
+- file: memory_chart.yaml
+  panel_id: 3
+  panel_title: Memory Chart
+  panel_alias: memchart
+  data_sources:
+  - type: metric_table
+    id: 1
+    title: Memory Chart
+- file: roofline.yaml
+  panel_id: 4
+  panel_title: Roofline
+  panel_alias: roof
+  data_sources:
+  - type: metric_table
+    id: 1
+    title: Roofline Performance Rates
+  - type: metric_table
+    id: 2
+    title: Roofline Plot Points
+- file: command_processor_cpc_cpf.yaml
+  panel_id: 5
+  panel_title: Command Processor (CPC/CPF)
+  panel_alias: cpc
+  data_sources:
+  - type: metric_table
+    id: 1
+    title: Command processor fetcher (CPF)
+  - type: metric_table
+    id: 2
+    title: Command processor packet processor (CPC)
+- file: workgroup_manager_spi.yaml
+  panel_id: 6
+  panel_title: Workgroup Manager (SPI)
+  panel_alias: spi
+  data_sources:
+  - type: metric_table
+    id: 1
+    title: Workgroup manager utilizations
+  - type: metric_table
+    id: 2
+    title: Workgroup Manager - Resource Allocation
+- file: wavefront.yaml
+  panel_id: 7
+  panel_title: Wavefront
+  panel_alias: wavefront
+  data_sources:
+  - type: metric_table
+    id: 1
+    title: Wavefront Launch Stats
+  - type: metric_table
+    id: 2
+    title: Wavefront Runtime Stats
+- file: compute_units_instruction_mix.yaml
+  panel_id: 10
+  panel_title: Compute Units - Instruction Mix
+  panel_alias: cu_ins
+  data_sources:
+  - type: metric_table
+    id: 1
+    title: Overall Instruction Mix
+  - type: metric_table
+    id: 2
+    title: VALU Arithmetic Instruction Mix
+  - type: metric_table
+    id: 3
+    title: VMEM Instruction Mix
+  - type: metric_table
+    id: 4
+    title: MFMA Arithmetic Instruction Mix
+- file: compute_units_compute_pipeline.yaml
+  panel_id: 11
+  panel_title: Compute Units - Compute Pipeline
+  panel_alias: cu_pipe
+  data_sources:
+  - type: metric_table
+    id: 1
+    title: Compute Speed-of-Light
+  - type: metric_table
+    id: 2
+    title: Pipeline Statistics
+  - type: metric_table
+    id: 3
+    title: Arithmetic Operations
+- file: local_data_share_lds.yaml
+  panel_id: 12
+  panel_title: Local Data Share (LDS)
+  panel_alias: lds
+  data_sources:
+  - type: metric_table
+    id: 1
+    title: LDS Speed-of-Light
+  - type: metric_table
+    id: 2
+    title: LDS Statistics
+- file: instruction_cache.yaml
+  panel_id: 13
+  panel_title: Instruction Cache
+  panel_alias: ins_cache
+  data_sources:
+  - type: metric_table
+    id: 1
+    title: L1I Speed-of-Light
+  - type: metric_table
+    id: 2
+    title: L1I cache accesses
+  - type: metric_table
+    id: 3
+    title: L1I <-> L2 interface
+- file: scalar_l1_data_cache.yaml
+  panel_id: 14
+  panel_title: Scalar L1 Data Cache
+  panel_alias: sl1d
+  data_sources:
+  - type: metric_table
+    id: 1
+    title: Scalar L1D Speed-of-Light
+  - type: metric_table
+    id: 2
+    title: Scalar L1D cache accesses
+  - type: metric_table
+    id: 3
+    title: Scalar L1D Cache - L2 Interface
+- file: address_processing_unit_and_data_return_path_ta_td.yaml
+  panel_id: 15
+  panel_title: Address Processing Unit and Data Return Path (TA/TD)
+  panel_alias: tatd
+  data_sources:
+  - type: metric_table
+    id: 1
+    title: Busy and stall metrics
+  - type: metric_table
+    id: 2
+    title: Instruction counts
+  - type: metric_table
+    id: 3
+    title: Spill and stack metrics
+  - type: metric_table
+    id: 4
+    title: Vector L1 data-return path or Texture Data (TD)
+- file: vector_l1_data_cache.yaml
+  panel_id: 16
+  panel_title: Vector L1 Data Cache
+  panel_alias: vl1d
+  data_sources:
+  - type: metric_table
+    id: 1
+    title: vL1D Speed-of-Light
+  - type: metric_table
+    id: 2
+    title: vL1D cache stall metrics
+  - type: metric_table
+    id: 3
+    title: vL1D cache access metrics
+  - type: metric_table
+    id: 4
+    title: L1D - L2 Transactions
+  - type: metric_table
+    id: 5
+    title: L1 Unified Translation Cache (UTCL1)
+  - type: metric_table
+    id: 6
+    title: L1D Addr Translation Stalls
+- file: l2_cache.yaml
+  panel_id: 17
+  panel_title: L2 Cache
+  panel_alias: l2
+  data_sources:
+  - type: metric_table
+    id: 1
+    title: L2 Speed-of-Light
+  - type: metric_table
+    id: 2
+    title: L2-Fabric interface metrics
+  - type: metric_table
+    id: 3
+    title: L2 Cache Accesses
+  - type: metric_table
+    id: 4
+    title: L2 Cache Stalls
+  - type: metric_table
+    id: 5
+    title: L2 - Fabric Interface stalls
+  - type: metric_table
+    id: 6
+    title: L2 - Fabric interface detailed metrics
+- file: l2_cache_per_channel.yaml
+  panel_id: 18
+  panel_title: L2 Cache (per Channel)
+  panel_alias: l2_per_channel
+  data_sources:
+  - type: metric_table
+    id: 1
+    title: Aggregate Stats (All channels)
+  - type: metric_table
+    id: 2
+    title: L2 Cache Hit Rate (pct)
+  - type: metric_table
+    id: 3
+    title: L2 Requests (per normUnit)
+  - type: metric_table
+    id: 4
+    title: L2 Requests (per normUnit)
+  - type: metric_table
+    id: 5
+    title: L2-Fabric Requests (per normUnit)
+  - type: metric_table
+    id: 6
+    title: L2-Fabric Read Latency (Cycles)
+  - type: metric_table
+    id: 7
+    title: L2-Fabric Write and Atomic Latency (Cycles)
+  - type: metric_table
+    id: 8
+    title: L2-Fabric Atomic Latency (Cycles)
+  - type: metric_table
+    id: 9
+    title: L2-Fabric Read Stall (Cycles per normUnit)
+  - type: metric_table
+    id: 10
+    title: L2-Fabric Write and Atomic Stall (Cycles per normUnit)
+  - type: metric_table
+    id: 12
+    title: L2-Fabric (128B read requests per normUnit)
+- file: pc_sampling.yaml
+  panel_id: 21
+  panel_title: PC Sampling
+  panel_alias: pc_sampling
+  data_sources:
+  - type: pc_sampling_table
+    id: 1
+    title: PC Sampling
diff --git a/projects/rocprofiler-compute/tools/config_management/hash_checker.py b/projects/rocprofiler-compute/tools/config_management/hash_checker.py
new file mode 100644
index 0000000000..14c5d17254
--- /dev/null
+++ b/projects/rocprofiler-compute/tools/config_management/hash_checker.py
@@ -0,0 +1,215 @@
+#!/usr/bin/env python3
+##############################################################################
+# MIT License
+#
+# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+##############################################################################
+
+"""
+Hash consistency guard for rocprofiler-compute.
+
+Errors (per arch):
+- If latest-arch panels changed but its delta did not (and there are older archs)
+- If latest-arch delta changed but its panels did not AND no new arch was added
+- If an older arch's panels changed but its delta did not
+- If an older arch's delta changed but neither latest panels nor this arch's
+  panels changed
+
+"""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+import yaml
+
+try:
+    from . import hash_manager  # type: ignore
+except Exception:
+    import importlib.util
+
+    _HERE = Path(__file__).resolve().parent
+    _SPEC = importlib.util.spec_from_file_location(
+        "hash_manager", str(_HERE / "hash_manager.py")
+    )
+    hash_manager = importlib.util.module_from_spec(_SPEC)  # type: ignore[assignment]
+    assert _SPEC and _SPEC.loader is not None
+    _SPEC.loader.exec_module(hash_manager)  # type: ignore[attr-defined]
+# ---------------------------------------------------------------------------
+
+# Subproject root: .../projects/rocprofiler-compute
+SUBROOT = Path(__file__).resolve().parents[2]
+
+CONFIGS_ROOT: Path = SUBROOT / "src" / "rocprof_compute_soc" / "analysis_configs"
+HASH_FILE: Path = SUBROOT / "tools" / "config_management" / ".config_hashes.json"
+TEMPLATE_FILE: Path = (
+    SUBROOT / "tools" / "config_management" / "gfx9_config_template.yaml"
+)  # template stored next to this script; its latest_arch key is read below
+
+
+# ---------- helpers ----------
+
+
+def _latest_arch(template_file: Path) -> str:
+    """Return the template's ``latest_arch`` as a string, "" if unavailable."""
+    if template_file.is_file():
+        with template_file.open("r", encoding="utf-8") as fh:
+            return str((yaml.safe_load(fh) or {}).get("latest_arch") or "")
+    return ""
+
+
+def _all_archs(cfg_root: Path) -> list[str]:
+    """List the ``gfx*`` subdirectory names of *cfg_root*, sorted."""
+    if not cfg_root.is_dir():
+        return []
+    children = (c for c in cfg_root.iterdir() if c.is_dir())
+    return sorted(c.name for c in children if c.name.startswith("gfx"))
+
+
+def _cur_panels_and_delta(arch_dir: Path) -> tuple[dict[str, str], str]:
+    """
+    Hash the arch directory as it is on disk via hash_manager.compute_arch_hashes.
+    Returns (copy of its "files" mapping, its "delta_hash" as str; "" if falsy).
+    """
+    snapshot = hash_manager.compute_arch_hashes(arch_dir)
+    file_hashes = snapshot.get("files") or {}
+    delta = snapshot.get("delta_hash") or ""
+    return dict(file_hashes), str(delta)
+
+
+def _prev_panels_and_delta(
+    hashes_path: Path, arch_name: str
+) -> tuple[dict[str, str], str]:
+    """
+    Look up the hashes recorded for *arch_name* in the hash DB at *hashes_path*.
+    Returns (copy of the "files" mapping, "delta_hash" as str); an arch with
+    no record yields ({}, "").
+    """
+    recorded = (hash_manager.load_hash_db(hashes_path).get("archs") or {}).get(
+        arch_name, {}
+    )
+    return dict(recorded.get("files") or {}), str(recorded.get("delta_hash") or "")
+
+
+def _changed_panel_files(cur: dict[str, str], prev: dict[str, str]) -> list[str]:
+    """
+    Return the panel filenames whose hash state differs between *cur* and *prev*.
+
+    Added/removed names (present on one side only) come first, followed by
+    names present on both sides whose hashes differ; each group is sorted.
+    """
+    structural = sorted(set(cur) ^ set(prev))
+    modified = sorted(k for k in cur.keys() & prev.keys() if cur[k] != prev[k])
+    return structural + modified
+
+
+# ---------- main ----------
+
+
+def main() -> int:
+    """Check on-disk hashes against the hash DB; print each rule violation.
+
+    Returns 0 if consistent, 1 on violations, 2 if CONFIGS_ROOT is missing.
+    """
+    if not CONFIGS_ROOT.is_dir():
+        print(f"ERROR: analysis_configs directory not found at: {CONFIGS_ROOT}")
+        return 2
+
+    latest = _latest_arch(TEMPLATE_FILE)
+    all_archs = _all_archs(CONFIGS_ROOT)
+    older_archs = [a for a in all_archs if a != latest]
+
+    changes: dict = hash_manager.detect_changes(CONFIGS_ROOT, HASH_FILE)
+    new_archs: list = changes.get("new_archs") or []
+    errors: list[str] = []
+
+    # Rule D below needs this flag for every older arch, and the sorted loop
+    # may reach older archs before the latest one, so resolve it up front.
+    latest_panels_changed = latest in all_archs and (
+        _cur_panels_and_delta(CONFIGS_ROOT / latest)[0]
+        != _prev_panels_and_delta(HASH_FILE, latest)[0]
+    )
+
+    for arch in all_archs:
+        arch_dir = CONFIGS_ROOT / arch
+        cur_panels, cur_delta = _cur_panels_and_delta(arch_dir)
+        prev_panels, prev_delta = _prev_panels_and_delta(HASH_FILE, arch)
+
+        panel_changed = cur_panels != prev_panels
+        delta_changed = cur_delta != prev_delta
+
+        if arch == latest:
+            # A) Latest panels changed but no delta changed (and there ARE older archs)
+            if panel_changed and not delta_changed and older_archs:
+                snippet = ", ".join(_changed_panel_files(cur_panels, prev_panels)[:5])
+                errors.append(
+                    f"Panels changed in latest arch '{latest}' "
+                    "but its delta file did not change.\n"
+                    f"Changed panels (sample): {snippet}\n"
+                    "Run the workflow to regenerate deltas for previous archs."
+                )
+
+            # B) Latest delta changed but panels did not AND no new arch was added
+            if delta_changed and not panel_changed and latest not in new_archs:
+                errors.append(
+                    "Delta file changed for latest, but panels "
+                    "didn't change and no new arch was added.\n"
+                    "This usually means deltas were edited/regenerated "
+                    "without corresponding latest updates."
+                )
+
+        else:
+            # C) Arch panels changed but its delta did not
+            if panel_changed and not delta_changed:
+                snippet = ", ".join(_changed_panel_files(cur_panels, prev_panels)[:5])
+                errors.append(
+                    f"Panels changed in arch '{arch}' "
+                    "but its delta file did not change.\n"
+                    f"Changed panels (sample): {snippet}\n"
+                    "Regenerate deltas for this arch (diff vs latest) "
+                    "and commit them."
+                )
+
+            # D) Older arch delta changed without either latest panels changing
+            #    OR this arch's panels changing -> error
+            #    (allow if latest panels changed: deltas can legitimately change then)
+            if delta_changed and not panel_changed and not latest_panels_changed:
+                errors.append(
+                    f"Delta file changed under older arch '{arch}' "
+                    "but neither latest nor this arch's panels changed.\n"
+                    "This suggests stray delta edits; "
+                    "verify latest panels or this arch's panels, or revert."
+                )
+
+    if errors:
+        print("\nHASH CONSISTENCY ERRORS:")
+        for e in errors:
+            print("  - " + e)
+        return 1
+
+    print("Hash consistency check passed.")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/projects/rocprofiler-compute/tools/config_management/hash_manager.py b/projects/rocprofiler-compute/tools/config_management/hash_manager.py
new file mode 100644
index 0000000000..5c93534986
--- /dev/null
+++ b/projects/rocprofiler-compute/tools/config_management/hash_manager.py
@@ -0,0 +1,279 @@
+#!/usr/bin/env python3
+##############################################################################
+# MIT License
+#
+# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+##############################################################################
+
+"""
+Hash manager for tracking configuration file changes.
+Can be used standalone or imported by the master workflow.
+
+Usage:
+    python hash_manager.py --compute-all <configs_dir> [hash_file]
+    python hash_manager.py --detect-changes <configs_dir> [hash_file]
+    python hash_manager.py --update <arch_name> <configs_dir> [hash_file]
+"""
+
+from __future__ import annotations
+
+import argparse
+import hashlib
+import json
+import sys
+from pathlib import Path
+from typing import Optional
+
+DEFAULT_HASH_DB = "tools/config_management/.config_hashes.json"
+
+
+def compute_file_hash(filepath: Path) -> str:
+    """Return the hex MD5 digest of *filepath*, read in 4 KiB chunks."""
+    digest = hashlib.md5()
+    with open(filepath, "rb") as stream:
+        while block := stream.read(4096):
+            digest.update(block)
+    return digest.hexdigest()
+
+
+def compute_arch_hashes(arch_dir: Path) -> dict:
+    """
+    Compute hashes for the top-level ``*.yaml`` files of an arch directory.
+    Returns dict: {"files": {filename: hash}, "delta_hash": <md5 or None>}
+    """
+    arch_path = Path(arch_dir)
+    if not arch_path.is_dir():
+        return {"files": {}, "delta_hash": None}
+
+    file_hashes: dict[str, str] = {}
+    for yaml_file in sorted(arch_path.glob("*.yaml")):
+        file_hashes[yaml_file.name] = compute_file_hash(yaml_file)
+
+    # Delta file: one *_diff.yaml is expected; sorted() keeps the pick stable
+    delta_dir = arch_path / "config_delta"
+    delta_hash: Optional[str] = None
+    if delta_dir.is_dir():
+        delta_files = sorted(delta_dir.glob("*_diff.yaml"))
+        if delta_files:
+            delta_hash = compute_file_hash(delta_files[0])
+
+    return {"files": file_hashes, "delta_hash": delta_hash}
+
+
+def load_hash_db(hash_file: Path) -> dict:
+    """Return the parsed hash database, or a fresh empty one if none exists."""
+    db_path = Path(hash_file)
+    if db_path.exists():
+        with open(db_path) as handle:
+            return json.load(handle)
+    return {"archs": {}}
+
+
+def save_hash_db(hash_file: Path, data: dict) -> None:
+    """Write *data* as sorted, indented JSON, creating parent dirs as needed."""
+    target = Path(hash_file)
+    target.parent.mkdir(parents=True, exist_ok=True)
+    with open(target, "w") as out:
+        json.dump(data, out, sort_keys=True, indent=2)
+
+
+def detect_changes(configs_dir: Path, hash_file: Path) -> dict:
+    """
+    Compare the gfx* directories under configs_dir with the stored hash database.
+    Returns dict with keys:
+        - new_archs: list[str]
+        - modified_archs: dict[str, list[str]]
+        - delta_files: dict[str, str]   # arch -> delta file path
+        - deleted_archs: list[str]
+    """
+    configs_path = Path(configs_dir)
+    hash_db = load_hash_db(hash_file)
+
+    current_archs = {
+        d.name
+        for d in configs_path.iterdir()
+        if d.is_dir() and d.name.startswith("gfx")
+    }
+    stored_archs = set(hash_db.get("archs", {}).keys())
+
+    changes = {
+        "new_archs": sorted(current_archs - stored_archs),
+        "modified_archs": {},
+        "delta_files": {},
+        "deleted_archs": sorted(stored_archs - current_archs),
+    }
+
+    # Compare archs present both on disk and in the database
+    for arch in sorted(current_archs & stored_archs):
+        arch_dir = configs_path / arch
+        current_hashes = compute_arch_hashes(arch_dir)
+        stored_hashes = hash_db["archs"].get(arch, {"files": {}, "delta_hash": None})
+
+        modified_files: list[str] = []
+
+        current_files = set(current_hashes["files"].keys())
+        stored_files = set(stored_hashes.get("files", {}).keys())
+
+        # New files
+        for f in sorted(current_files - stored_files):
+            modified_files.append(f)
+
+        # Modified files
+        for f in sorted(current_files & stored_files):
+            if current_hashes["files"][f] != stored_hashes["files"][f]:
+                modified_files.append(f)
+
+        # Deleted files (mark as "[DELETED] <name>")
+        for f in sorted(stored_files - current_files):
+            modified_files.append(f"[DELETED] {f}")
+
+        if modified_files:
+            changes["modified_archs"][arch] = modified_files
+
+        # Delta changes (sorted: glob order is arbitrary, keep the pick stable)
+        delta_dir = arch_dir / "config_delta"
+        if delta_dir.is_dir():
+            delta_files = sorted(delta_dir.glob("*_diff.yaml"))
+            if delta_files:
+                current_delta_hash = compute_file_hash(delta_files[0])
+                stored_delta_hash = stored_hashes.get("delta_hash")
+                if current_delta_hash != stored_delta_hash:
+                    changes["delta_files"][arch] = str(delta_files[0])
+
+    return changes
+
+
+def update_hashes(arch_name: str, configs_dir: Path, hash_file: Path) -> bool:
+    """Recompute one arch's hashes and persist them; False if its dir is missing."""
+    db = load_hash_db(hash_file)
+    target = Path(configs_dir) / arch_name
+
+    if target.is_dir():
+        archs = db.setdefault("archs", {})
+        archs[arch_name] = compute_arch_hashes(target)
+        save_hash_db(hash_file, db)
+        print(f"Updated hashes for {arch_name}")
+        return True
+    print(f"Error: {target} is not a directory")
+    return False
+
+
+def compute_all_hashes(configs_dir: Path, hash_file: Path) -> bool:
+    """Rebuild the hash database from every gfx* directory in configs_dir."""
+    root = Path(configs_dir)
+    if not root.is_dir():
+        print(f"Error: {configs_dir} is not a directory")
+        return False
+
+    per_arch: dict = {}
+    for entry in sorted(root.iterdir()):
+        if not (entry.is_dir() and entry.name.startswith("gfx")):
+            continue
+        per_arch[entry.name] = compute_arch_hashes(entry)
+        print(f"Computed hashes for {entry.name}")
+
+    save_hash_db(hash_file, {"archs": per_arch})
+    print(f"\nHash database saved to {hash_file}")
+    return True
+
+
+def _print_change_summary(changes: dict) -> None:
+    """Print a sectioned report of a detect_changes() result to stdout."""
+    print("Change Detection Results")
+    print("=" * 80)
+
+    if changes["new_archs"]:
+        print("\nNew Architectures")
+        print("\n".join(f"   • {name}" for name in changes["new_archs"]))
+
+    if changes["modified_archs"]:
+        print("\nModified Architectures")
+        for name, touched in changes["modified_archs"].items():
+            print(f"   • {name}")
+            for entry in touched:
+                print(f"      - {entry}")
+
+    if changes["delta_files"]:
+        print("\nDelta Files Detected")
+        for name, delta_path in changes["delta_files"].items():
+            print(f"   • {name}: {delta_path}")
+
+    if changes["deleted_archs"]:
+        print("\nDeleted Architectures")
+        print("\n".join(f"   • {name}" for name in changes["deleted_archs"]))
+
+    nothing_changed = not (
+        changes["new_archs"]
+        or changes["modified_archs"]
+        or changes["delta_files"]
+        or changes["deleted_archs"]
+    )
+    if nothing_changed:
+        print("\nNo changes detected")
+
+
+def main() -> int:
+    """CLI entry point: run the first requested action and return its exit code."""
+    parser = argparse.ArgumentParser(
+        description="Manage configuration file hashes for change detection"
+    )
+    parser.add_argument(
+        "--compute-all",
+        action="store_true",
+        help="Compute hashes for all architectures",
+    )
+    parser.add_argument(
+        "--detect-changes", action="store_true", help="Detect changes in configurations"
+    )
+    parser.add_argument(
+        "--update", metavar="ARCH", help="Update hashes for specific architecture"
+    )
+    parser.add_argument("configs_dir", help="Path to analysis_configs directory")
+    parser.add_argument(
+        "hash_file",
+        nargs="?",
+        default=DEFAULT_HASH_DB,
+        help="Path to hash database file",
+    )
+
+    opts = parser.parse_args()
+    configs_dir = Path(opts.configs_dir)
+    hash_file = Path(opts.hash_file)
+
+    # Actions are checked in a fixed priority order; only the first one runs.
+    if opts.compute_all:
+        return 0 if compute_all_hashes(configs_dir, hash_file) else 1
+
+    if opts.detect_changes:
+        _print_change_summary(detect_changes(configs_dir, hash_file))
+        return 0
+
+    if opts.update:
+        return 0 if update_hashes(opts.update, configs_dir, hash_file) else 1
+
+    # No action flag supplied: show usage and signal failure.
+    parser.print_help()
+    return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/projects/rocprofiler-compute/tools/config_management/master_config_workflow_script.py b/projects/rocprofiler-compute/tools/config_management/master_config_workflow_script.py
new file mode 100644
index 0000000000..50d61b7b48
--- /dev/null
+++ b/projects/rocprofiler-compute/tools/config_management/master_config_workflow_script.py
@@ -0,0 +1,1014 @@
+#!/usr/bin/env python3
+##############################################################################
+# MIT License
+#
+# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+##############################################################################
+
+"""
+Master workflow script for managing architecture configurations.
+- Detects changes
+- Handles direct edits and delta files
+- Supports promoting a NEW arch from:
+    (A) direct edits to latest, or
+    (B) a delta YAML targeting latest
+- Validates, syncs metric descriptions, and updates hashes
+
+"""
+
+from __future__ import annotations
+
+import argparse
+import shutil
+import subprocess
+import sys
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+
+try:
+    from . import hash_manager, metric_description_manager
+except Exception:
+    repo_root = Path(__file__).resolve().parents[1]  # tools/ (holds config_management)
+    if str(repo_root) not in sys.path:
+        sys.path.insert(0, str(repo_root))
+    import config_management.hash_manager as hash_manager  # type: ignore
+    import config_management.metric_description_manager as metric_description_manager  # type: ignore
+
+import yaml
+
+# =============================================================================
+# CONFIG
+# =============================================================================
+
+CONFIG_FILE = "config_workflow.yaml"
+# Built-in defaults; load_config() overlays values from CONFIG_FILE on these.
+DEFAULT_CONFIG: dict = {
+    "paths": {
+        "template": "tools/config_management/gfx9_config_template.yaml",
+        "configs_root": "src/rocprof_compute_soc/analysis_configs",
+        "backups": ".backups",
+        "hashes": "tools/config_management/.config_hashes.json",
+        "per_arch_metrics": "tools/per_arch_metric_definitions",
+        "docs_metrics": "docs/data/metrics_description.yaml",
+    },
+    "validation": {"strict_mode": True, "verify_after_changes": True},
+    "behavior": {"require_confirmation": True},
+}
+
+
+# =============================================================================
+# UTILITIES
+# =============================================================================
+
+
+def load_config() -> dict:
+    """Load CONFIG_FILE merged onto a private deep copy of DEFAULT_CONFIG."""
+    import copy  # local import: the result must never alias the module defaults
+
+    merged = copy.deepcopy(DEFAULT_CONFIG)
+    p = Path(CONFIG_FILE)
+    if not p.exists():
+        return merged
+    with open(p) as f:
+        user = yaml.safe_load(f) or {}
+    for k, v in user.items():
+        both = isinstance(v, dict) and isinstance(merged.get(k), dict)
+        merged[k] = {**merged[k], **v} if both else v
+    return merged
+
+
+def create_backup(source_paths: list[str], backup_dir: str) -> Path:
+    """Copy each existing source path into a new timestamped backup folder."""
+    backup_root = Path(backup_dir)
+    backup_root.mkdir(parents=True, exist_ok=True)
+
+    # Microsecond-resolution stamp; a numeric suffix covers the unlikely clash.
+    stamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
+    backup_path = backup_root / stamp
+    attempt = 0
+    while backup_path.exists():
+        attempt += 1
+        backup_path = backup_root / f"{stamp}_{attempt}"
+
+    print(f"Creating backup: {backup_path}")
+    for source in map(Path, source_paths):
+        destination = backup_path / source.name
+        if source.is_dir():
+            shutil.copytree(source, destination)
+            continue
+        if source.is_file():
+            destination.parent.mkdir(parents=True, exist_ok=True)
+            shutil.copy2(source, destination)
+    return backup_path
+
+
+def restore_backup(backup_path: Path, target_paths: list[str]) -> None:
+    """Replace each target with its same-named copy from *backup_path*."""
+    print(f"Restoring from backup: {backup_path}")
+
+    for target in map(Path, target_paths):
+        saved = backup_path / target.name
+        if saved.exists():
+            # Clear whatever currently sits at the target location.
+            if target.is_dir():
+                shutil.rmtree(target, ignore_errors=True)
+            elif target.exists():
+                target.unlink()
+            # Directories are copied recursively, single files via copy2.
+            restore = shutil.copytree if saved.is_dir() else shutil.copy2
+            restore(saved, target)
+
+    print("Backup restored")
+
+
+def cleanup_old_backups(backup_dir: str) -> None:
+    """Delete every backup folder except the last one in sorted name order."""
+    backup_root = Path(backup_dir)
+    if backup_root.exists():
+        snapshots = sorted(p for p in backup_root.iterdir() if p.is_dir())
+        # Everything before the final (last-sorted) entry is treated as stale.
+        for stale in snapshots[:-1]:
+            shutil.rmtree(stale, ignore_errors=True)
+            print(f"Removed old backup: {stale.name}")
+
+
+def prompt_yes_no(question: str, default: Optional[bool] = None) -> bool:
+    """Keep asking *question* until a yes/no answer (or a usable default) is given."""
+    if default is None:
+        suffix = "(y/n)"
+    else:
+        suffix = "[Y/n]" if default else "[y/N]"
+    prompt = f"{question} {suffix}: "
+
+    while True:
+        reply = input(prompt).strip().lower()
+        if reply in ("y", "yes"):
+            return True
+        if reply in ("n", "no"):
+            return False
+        if not reply and default is not None:
+            return default
+        print("Please answer 'y' or 'n'.")
+
+
+def run_script(
+    script_name: str, args: list[str], capture_output: bool = True
+) -> subprocess.CompletedProcess:
+    """Run *script_name* under sys.executable and return the CompletedProcess."""
+    command = [sys.executable, script_name] + args
+    completed = subprocess.run(command, capture_output=capture_output, text=True)
+    return completed
+
+
+def get_all_archs(configs_dir: str) -> list[str]:
+    """List the names of the gfx* sub-directories of *configs_dir*, sorted."""
+    # Plain files and directories without the gfx prefix are left out.
+    entries = Path(configs_dir).iterdir()
+    names = (e.name for e in entries if e.is_dir() and e.name.startswith("gfx"))
+    return sorted(names)
+
+
+def get_latest_arch(template_file: str) -> Optional[str]:
+    """Return the template's 'latest_arch' value, or None if absent/no such file."""
+    template_path = Path(template_file)
+    if template_path.is_file():
+        with open(template_path) as handle:
+            parsed = yaml.safe_load(handle) or {}
+        return parsed.get("latest_arch")
+    return None
+
+
+def validate_delta_structure(delta_file: str) -> tuple[bool, str]:
+    """Check that the delta YAML is a mapping with all three section keys."""
+    with open(delta_file) as handle:
+        parsed = yaml.safe_load(handle) or {}
+    sections = {"Addition", "Deletion", "Modification"}
+    if isinstance(parsed, dict) and sections.issubset(parsed):
+        return True, ""
+    return False, "Delta must have Addition, Deletion, Modification keys"
+
+
+# =============================================================================
+# VALIDATION / SYNC
+# =============================================================================
+
+
+def validate_all_archs(config: dict) -> tuple[bool, str]:
+    """Run the template verifier over configs_root and report (ok, message)."""
+    print("Validating all architectures against template...")
+    result = run_script(
+        "tools/config_management/verify_against_config_template.py",
+        [config["paths"]["configs_root"], config["paths"]["template"]],
+        capture_output=True,
+    )
+    if result.stdout:
+        print(result.stdout)
+    if result.returncode == 0:
+        return True, "Validation passed"
+    if result.stderr:
+        print(result.stderr)
+    return False, "Validation failed"
+
+
+def validate_arch_against_template(arch_name: str, config: dict) -> tuple[bool, str]:
+    """Validate one arch; also fail when the verifier dies without any output."""
+    print(f"Validating {arch_name} against template...")
+    res = run_script(
+        "tools/config_management/verify_against_config_template.py",
+        [config["paths"]["configs_root"], config["paths"]["template"]],
+        capture_output=True,
+    )
+    if res.returncode != 0 and (arch_name in (res.stdout or "") or not res.stdout):
+        print(res.stdout or res.stderr or "")
+        return False, f"Validation failed for {arch_name}"
+    return True, f"Validation passed for {arch_name}"
+
+
+# =============================================================================
+# CHANGE DETECTION
+# =============================================================================
+
+
+def detect_changes(config: dict) -> dict:
+    """Announce the scan, then delegate to hash_manager.detect_changes()."""
+    print("Detecting changes...")
+    paths = config["paths"]
+    return hash_manager.detect_changes(paths["configs_root"], paths["hashes"])
+
+
+def display_change_summary(changes: dict) -> bool:
+    """Print the change report and return True when any category is non-empty."""
+    rule = "=" * 80
+    print("\n" + rule)
+    print("CHANGE SUMMARY")
+    print(rule)
+
+    new_archs = changes.get("new_archs")
+    modified = changes.get("modified_archs")
+    deltas = changes.get("delta_files")
+    deleted = changes.get("deleted_archs")
+
+    if new_archs:
+        print("\nNew Architecture Directories:")
+        for name in new_archs:
+            print(f"   • {name}")
+
+    if modified:
+        print("\nModified Architectures:")
+        for name, files in modified.items():
+            print(f"   • {name}:")
+            for shown in files[:5]:
+                print(f"      - {shown}")
+            if len(files) > 5:
+                print(f"      ... and {len(files) - 5} more files")
+
+    if deltas:
+        print("\nDelta Files Detected:")
+        for name, delta in deltas.items():
+            print(f"   • {name}: {Path(delta).name}")
+
+    if deleted:
+        print("\nDeleted Architectures:")
+        for name in deleted:
+            print(f"   • {name}")
+
+    has_changes = bool(new_archs or modified or deltas or deleted)
+    if not has_changes:
+        print("\nNo changes detected")
+
+    print(rule + "\n")
+    return has_changes
+
+
+# =============================================================================
+# CORE WORKFLOW OPS
+# =============================================================================
+
+
+def promote_to_latest(
+    new_arch: str, config: dict, reuse_backup: Optional[Path] = None
+) -> bool:
+    """
+    Promote an already-populated new_arch directory to latest (legacy entry point).
+    Rolls configs_root and the template back from backup if any step fails.
+    """
+    print(f"\nPROMOTING {new_arch} TO LATEST ARCHITECTURE...")
+    backup_paths = [config["paths"]["configs_root"], config["paths"]["template"]]
+    backup_path = reuse_backup or create_backup(
+        backup_paths, config["paths"]["backups"]
+    )
+
+    try:
+        root = Path(config["paths"]["configs_root"])
+        new_dir = root / new_arch
+        if not new_dir.is_dir():
+            raise Exception(f"New arch directory not found: {new_dir}")
+
+        all_archs = get_all_archs(config["paths"]["configs_root"])
+        prev_archs = [a for a in all_archs if a != new_arch]
+
+        print(f"\n1. Updating template with new latest arch: {new_arch}")
+        res = run_script(
+            "tools/config_management/parse_config_template.py",
+            [str(new_dir), config["paths"]["template"], "--latest-arch", new_arch],
+            capture_output=True,
+        )
+        if res.returncode != 0:
+            raise Exception(f"Failed to update template: {res.stderr}")
+
+        print(f"\n2. Generating deltas for {len(prev_archs)} previous architectures")
+        for p in prev_archs:
+            prev_dir = root / p
+            gen = run_script(
+                "tools/config_management/generate_config_deltas.py",
+                [str(new_dir), str(prev_dir)],
+                capture_output=True,
+            )
+            if gen.returncode != 0:
+                raise Exception(f"Failed to generate delta for {p}: {gen.stderr}")
+
+        print("\n3. Validating all architectures")
+        ok, msg = validate_all_archs(config)
+        if not ok:
+            raise Exception(msg)
+
+        print("\n4. Syncing metric descriptions")
+        ok = metric_description_manager.sync_arch(
+            new_arch,
+            config["paths"]["configs_root"],
+            config["paths"]["per_arch_metrics"],
+            config["paths"]["docs_metrics"],
+            is_latest=True,
+        )
+        if not ok:
+            raise Exception("Failed to sync metric descriptions")
+
+        # The hash DB is not in backup_paths: write it only after all checks passed.
+        print("\n5. Updating hashes")
+        print("\n\tUpdating hashes for previous architectures (delta files)")
+        for p in prev_archs:
+            hash_manager.update_hashes(
+                p, config["paths"]["configs_root"], config["paths"]["hashes"]
+            )
+        hash_manager.update_hashes(
+            new_arch, config["paths"]["configs_root"], config["paths"]["hashes"]
+        )
+
+        print(f"\nSuccessfully promoted {new_arch} to latest architecture!")
+        return True
+
+    except Exception as e:
+        print(f"\nERROR: {e}\nRestoring from backup...")
+        restore_backup(backup_path, backup_paths)
+        return False
+
+
+def update_latest_arch_from_delta(
+    delta_file: str, arch_name: str, config: dict
+) -> bool:
+    """Apply a delta in place to the latest arch; roll back from backup on failure."""
+    print(f"\nUPDATING LATEST ARCH {arch_name} FROM DELTA...")
+    backup_paths = [config["paths"]["configs_root"], config["paths"]["template"]]
+    backup_path = create_backup(backup_paths, config["paths"]["backups"])
+
+    try:
+        root = Path(config["paths"]["configs_root"])
+        arch_dir = root / arch_name
+        tmp = root / f"{arch_name}_tmp"
+
+        print(f"\n1. Applying delta to {arch_name}")
+        res = run_script(
+            "tools/config_management/apply_config_deltas.py",
+            [str(arch_dir), delta_file, str(tmp)],
+            capture_output=True,
+        )
+        if res.returncode != 0:
+            raise Exception(f"Failed to apply delta: {res.stderr}")
+
+        shutil.rmtree(arch_dir)
+        shutil.move(str(tmp), str(arch_dir))
+
+        print("\n2. Updating template")
+        res = run_script(
+            "tools/config_management/parse_config_template.py",
+            [str(arch_dir), config["paths"]["template"], "--latest-arch", arch_name],
+            capture_output=True,
+        )
+        if res.returncode != 0:
+            raise Exception(f"Failed to update template: {res.stderr}")
+
+        print("\n3. Regenerating deltas for previous architectures")
+        all_archs = get_all_archs(config["paths"]["configs_root"])
+        for prev in [a for a in all_archs if a != arch_name]:
+            prev_dir = root / prev
+            gen = run_script(
+                "tools/config_management/generate_config_deltas.py",
+                [str(arch_dir), str(prev_dir)],
+                capture_output=True,
+            )
+            if gen.returncode != 0:
+                raise Exception(f"Failed to generate delta for {prev}: {gen.stderr}")
+
+        print("\n4. Validating all architectures")
+        ok, msg = validate_all_archs(config)
+        if not ok:
+            raise Exception(msg)
+
+        print("\n5. Syncing metric descriptions")
+        ok = metric_description_manager.sync_arch(
+            arch_name,
+            config["paths"]["configs_root"],
+            config["paths"]["per_arch_metrics"],
+            config["paths"]["docs_metrics"],
+            is_latest=True,
+        )
+        if not ok:
+            raise Exception("Failed to sync metric descriptions")
+
+        # The hash DB is not in backup_paths: write it only after all checks passed.
+        print("\n6. Updating hashes")
+        for prev in [a for a in all_archs if a != arch_name]:
+            hash_manager.update_hashes(
+                prev, config["paths"]["configs_root"], config["paths"]["hashes"]
+            )
+        hash_manager.update_hashes(
+            arch_name, config["paths"]["configs_root"], config["paths"]["hashes"]
+        )
+
+        print(f"\nSuccessfully updated latest arch {arch_name}!")
+        return True
+
+    except Exception as e:
+        print(f"\nERROR: {e}\nRestoring from backup...")
+        restore_backup(backup_path, backup_paths)
+        return False
+
+
+def update_older_arch_from_delta(delta_file: str, arch_name: str, config: dict) -> bool:
+    """Apply a delta in-place to an older arch; clean up and roll back on failure."""
+    print(f"\nUPDATING OLDER ARCH {arch_name} FROM DELTA...")
+    root = Path(config["paths"]["configs_root"])
+    arch_dir = root / arch_name
+    tmp = root / f"{arch_name}_tmp"
+    backup_path = create_backup([str(arch_dir)], config["paths"]["backups"])
+
+    try:
+        print(f"\n1. Applying delta to {arch_name}")
+        res = run_script(
+            "tools/config_management/apply_config_deltas.py",
+            [str(arch_dir), delta_file, str(tmp)],
+            capture_output=True,
+        )
+        if res.returncode != 0:
+            raise Exception(f"Failed to apply delta: {res.stderr}")
+
+        shutil.rmtree(arch_dir)
+        shutil.move(str(tmp), str(arch_dir))
+
+        print("\n2. Validating against template")
+        ok, msg = validate_arch_against_template(arch_name, config)
+        if not ok:
+            raise Exception(msg)
+
+        print("\n3. Syncing metric descriptions")
+        ok = metric_description_manager.sync_arch(
+            arch_name,
+            config["paths"]["configs_root"],
+            config["paths"]["per_arch_metrics"],
+            config["paths"]["docs_metrics"],
+            is_latest=False,
+        )
+        if not ok:
+            raise Exception("Failed to sync metric descriptions")
+
+        print("\n4. Updating hashes")
+        hash_manager.update_hashes(
+            arch_name, config["paths"]["configs_root"], config["paths"]["hashes"]
+        )
+
+        print(f"\nSuccessfully updated older arch {arch_name}!")
+        return True
+
+    except Exception as e:
+        print(f"\nERROR: {e}\nRestoring from backup...")
+        shutil.rmtree(tmp, ignore_errors=True)  # else it is later listed as an arch
+        restore_backup(backup_path, [str(arch_dir)])
+        return False
+
+
+def update_latest_arch_from_edits(arch_name: str, config: dict) -> bool:
+    """Re-derive template/deltas from direct edits to latest (legacy in-place)."""
+    print(f"\nUPDATING LATEST ARCH {arch_name} FROM DIRECT EDITS...")
+    backup_paths = [config["paths"]["configs_root"], config["paths"]["template"]]
+    backup_path = create_backup(backup_paths, config["paths"]["backups"])
+
+    try:
+        root = Path(config["paths"]["configs_root"])
+        arch_dir = root / arch_name
+
+        print("\n1. Updating template")
+        res = run_script(
+            "tools/config_management/parse_config_template.py",
+            [str(arch_dir), config["paths"]["template"], "--latest-arch", arch_name],
+            capture_output=True,
+        )
+        if res.returncode != 0:
+            raise Exception(f"Failed to update template: {res.stderr}")
+
+        print("\n2. Regenerating deltas for previous architectures")
+        all_archs = get_all_archs(config["paths"]["configs_root"])
+        prev_archs = [a for a in all_archs if a != arch_name]
+        for prev in prev_archs:
+            prev_dir = root / prev
+            gen = run_script(
+                "tools/config_management/generate_config_deltas.py",
+                [str(arch_dir), str(prev_dir)],
+                capture_output=True,
+            )
+            if gen.returncode != 0:
+                raise Exception(f"Failed to generate delta for {prev}: {gen.stderr}")
+
+        print("\n3. Validating all architectures")
+        ok, msg = validate_all_archs(config)
+        if not ok:
+            raise Exception(msg)
+
+        print("\n4. Syncing metric descriptions")
+        ok = metric_description_manager.sync_arch(
+            arch_name,
+            config["paths"]["configs_root"],
+            config["paths"]["per_arch_metrics"],
+            config["paths"]["docs_metrics"],
+            is_latest=True,
+        )
+        if not ok:
+            raise Exception("Failed to sync metric descriptions")
+
+        # The hash DB is not in backup_paths, so a rollback cannot undo writes
+        # to it. Record hashes only here, once validation and sync have both
+        # succeeded.
+        print("\n5. Updating hashes")
+        for prev in prev_archs:
+            hash_manager.update_hashes(
+                prev, config["paths"]["configs_root"], config["paths"]["hashes"]
+            )
+        hash_manager.update_hashes(
+            arch_name, config["paths"]["configs_root"], config["paths"]["hashes"]
+        )
+
+        print(f"\nSuccessfully updated latest arch {arch_name}!")
+        return True
+
+    except Exception as e:
+        print(f"\nERROR: {e}\nRestoring from backup...")
+        restore_backup(backup_path, backup_paths)
+        return False
+
+
+def update_older_arch_from_edits(arch_name: str, config: dict) -> bool:
+    """Validate, sync and re-hash an older arch whose files were edited by hand."""
+    print(f"\nUPDATING OLDER ARCH {arch_name} FROM DIRECT EDITS...")
+    paths = config["paths"]
+    arch_dir = Path(paths["configs_root"]) / arch_name
+    # Only this arch's directory is snapshotted and restored on failure.
+    backup_path = create_backup([str(arch_dir)], paths["backups"])
+
+    try:
+        print("\n1. Validating against template")
+        valid, reason = validate_arch_against_template(arch_name, config)
+        if not valid:
+            raise Exception(reason)
+
+        print("\n2. Syncing metric descriptions")
+        synced = metric_description_manager.sync_arch(
+            arch_name,
+            paths["configs_root"],
+            paths["per_arch_metrics"],
+            paths["docs_metrics"],
+            is_latest=False,
+        )
+        if not synced:
+            raise Exception("Failed to sync metric descriptions")
+
+        print("\n3. Updating hashes")
+        hash_manager.update_hashes(arch_name, paths["configs_root"], paths["hashes"])
+
+        print(f"\nSuccessfully updated older arch {arch_name}!")
+        return True
+
+    except Exception as err:
+        # Any failure above, including a raised validation message, lands here.
+        print(f"\nERROR: {err}\nRestoring from backup...")
+        restore_backup(backup_path, [str(arch_dir)])
+        return False
+
+
+# =============================================================================
+# NEW: PROMOTE NEW ARCH FROM (A) EDITS or (B) DELTA
+# =============================================================================
+
+
+def _git_restore_pristine(path: Path) -> None:
+    """Restore tracked files under ``path`` to HEAD, running Git from that directory.
+
+    No-op outside a work tree; raises on failure. Naming HEAD also resets staged edits.
+    """
+    target = Path(path).resolve()
+    cwd = target if target.is_dir() else target.parent
+    opts = {"cwd": cwd, "capture_output": True, "text": True}
+    chk = subprocess.run(["git", "rev-parse", "--is-inside-work-tree"], **opts)
+    if chk.returncode != 0 or chk.stdout.strip() != "true":
+        return
+    res = subprocess.run(["git", "checkout", "HEAD", "--", str(target)], **opts)
+    if res.returncode != 0:
+        err = res.stderr.strip()
+        raise Exception(f"Failed to restore pristine state from Git for {path}: {err}")
+
+
+def promote_new_arch_from_latest_edits(
+    latest_arch: str, new_arch: str, config: dict
+) -> bool:
+    """
+    Flow (A): latest arch was edited directly. On error: restore backup, return False.
+    1) Refuse if the new arch dir already exists; snapshot edited latest to temp
+    2) Restore pristine latest (via Git)
+    3) Copy pristine latest → new arch
+    4) Generate delta (edited_tmp vs pristine_latest) → write under latest/config_delta/
+    5) Apply delta to new arch
+    6) Update template latest=new_arch, regen deltas, validate, sync, hash
+    """
+    print(f"\nPROMOTING {new_arch} FROM EDITS IN {latest_arch}...")
+    root = Path(config["paths"]["configs_root"])
+    latest_dir = root / latest_arch
+    new_dir = root / new_arch
+    edited_tmp = root / f"_{latest_arch}_edited_tmp"
+    new_tmp = root / f"_{new_arch}_tmp"
+
+    backup_paths = [config["paths"]["configs_root"], config["paths"]["template"]]
+    backup_path = create_backup(backup_paths, config["paths"]["backups"])
+
+    try:
+        # 1) Fail fast while the tree is still untouched, then snapshot edited latest
+        if new_dir.exists():
+            raise Exception(f"Target new arch directory already exists: {new_dir}")
+        if edited_tmp.exists():
+            shutil.rmtree(edited_tmp)
+        shutil.copytree(latest_dir, edited_tmp)
+
+        # 2) Restore pristine latest
+        _git_restore_pristine(latest_dir)
+
+        # 3) Copy pristine latest → new arch
+        shutil.copytree(latest_dir, new_dir)
+
+        # 4) Generate delta: edited (curr) vs pristine latest (prev)
+        print("\nGenerating delta (edited latest → pristine latest)")
+        gen = run_script(
+            "tools/config_management/generate_config_deltas.py",
+            [str(edited_tmp), str(latest_dir)],
+            capture_output=True,
+        )
+        if gen.returncode != 0:
+            raise Exception(f"Failed to generate delta: {gen.stderr}")
+
+        delta_dir = latest_dir / "config_delta"
+        # Prefer the file named for edited_tmp; else the alphabetically last *_diff.yaml
+        candidates = sorted(delta_dir.glob(f"{edited_tmp.name}_diff.yaml")) or sorted(
+            delta_dir.glob("*_diff.yaml")
+        )
+        if not candidates:
+            raise Exception("Delta file not found after generation.")
+        delta_file = candidates[-1]
+
+        # 5) Apply delta onto new arch
+        if new_tmp.exists():
+            shutil.rmtree(new_tmp)
+        print(f"\nApplying delta to {new_arch}: {delta_file.name}")
+        app = run_script(
+            "tools/config_management/apply_config_deltas.py",
+            [str(new_dir), str(delta_file), str(new_tmp)],
+            capture_output=True,
+        )
+        if app.returncode != 0:
+            raise Exception(f"Failed to apply delta: {app.stderr}")
+        shutil.rmtree(new_dir)
+        shutil.move(str(new_tmp), str(new_dir))
+
+        # 6) Promote to latest, regen deltas, validate, sync, hash
+        return promote_to_latest(new_arch, config, reuse_backup=backup_path)
+
+    except Exception as e:
+        print(f"\nERROR: {e}\nRestoring from backup...")
+        restore_backup(backup_path, backup_paths)
+        return False
+    finally:
+        if edited_tmp.exists():
+            shutil.rmtree(edited_tmp, ignore_errors=True)
+        if new_tmp.exists():
+            shutil.rmtree(new_tmp, ignore_errors=True)
+
+
+def promote_new_arch_from_delta(
+    latest_arch: str, new_arch: str, delta_file: str, config: dict
+) -> bool:
+    """
+    Flow (B): Developer added a delta YAML targeting the latest arch.
+    1) Copy pristine latest → new arch
+    2) Apply the provided delta to new arch
+    3) Promote to latest, regen deltas, validate, sync, hash (its result is returned)
+    """
+    print(f"\nPROMOTING {new_arch} FROM DELTA ON {latest_arch}...")
+    root = Path(config["paths"]["configs_root"])
+    latest_dir = root / latest_arch
+    new_dir = root / new_arch
+    new_tmp = root / f"_{new_arch}_tmp"
+
+    backup_paths = [config["paths"]["configs_root"], config["paths"]["template"]]
+    backup_path = create_backup(backup_paths, config["paths"]["backups"])
+
+    try:
+        if not Path(delta_file).is_file():
+            raise Exception(f"Delta file does not exist: {delta_file}")
+        if not latest_dir.is_dir():
+            raise Exception(f"Latest arch not found: {latest_dir}")
+        if new_dir.exists():
+            raise Exception(f"Target new arch directory already exists: {new_dir}")
+
+        # Start from pristine latest (inside a Git repo this drops local edits there)
+        _git_restore_pristine(latest_dir)
+
+        # 1) Copy pristine latest → new arch
+        shutil.copytree(latest_dir, new_dir)
+
+        # 2) Apply delta onto the new arch
+        if new_tmp.exists():
+            shutil.rmtree(new_tmp)
+        print(f"\nApplying delta to {new_arch}: {Path(delta_file).name}")
+        app = run_script(
+            "tools/config_management/apply_config_deltas.py",
+            [str(new_dir), str(delta_file), str(new_tmp)],
+            capture_output=True,
+        )
+        if app.returncode != 0:
+            raise Exception(f"Failed to apply delta: {app.stderr}")
+        shutil.rmtree(new_dir)
+        shutil.move(str(new_tmp), str(new_dir))  # new_tmp replaces new_dir
+
+        # 3) Promote to latest, regen deltas, validate, sync, hash
+        return promote_to_latest(new_arch, config, reuse_backup=backup_path)
+
+    except Exception as e:  # any failure above: report, roll back, return False
+        print(f"\nERROR: {e}\nRestoring from backup...")
+        restore_backup(backup_path, backup_paths)
+        return False
+    finally:  # runs on success and failure alike
+        if new_tmp.exists():
+            shutil.rmtree(new_tmp, ignore_errors=True)
+
+
+# =============================================================================
+# USER-FACING SCENARIO HANDLERS
+# =============================================================================
+
+
+def handle_new_arch(arch_name: str, config: dict, dry_run: bool = False) -> bool:
+    print(f"\n{'=' * 80}\nNEW ARCHITECTURE DETECTED: {arch_name}\n{'=' * 80}")
+    if not prompt_yes_no(f"Is {arch_name} the new latest architecture?"):
+        print(
+            "ERROR: New arch detected but not marked as latest.\n   "
+            "Only the latest arch should be added as a new directory."
+        )
+        return False  # a new directory is only accepted as the new latest arch
+    if dry_run:  # checked after the prompt, so a dry run still asks the question
+        print(f"[DRY RUN] Would promote {arch_name} to latest")
+        return True
+    return promote_to_latest(arch_name, config)
+
+
+def handle_delta_file(
+    delta_file: str, arch_name: str, config: dict, dry_run: bool = False
+) -> bool:  # False if the delta is invalid or the user declines; else handler result
+    print(
+        f"\n{'=' * 80}\nDELTA FILE DETECTED: {Path(delta_file).name}\n   "
+        f"Target architecture: {arch_name}\n{'=' * 80}"
+    )
+
+    valid, err = validate_delta_structure(delta_file)
+    if not valid:
+        print(f"ERROR: Invalid delta structure - {err}")
+        return False
+
+    latest = (  # template's latest, else last entry of get_all_archs (or None)
+        get_latest_arch(config["paths"]["template"])
+        or (get_all_archs(config["paths"]["configs_root"]) or [None])[-1]
+    )
+
+    if arch_name == latest:
+        print(f"\nDelta targets the current latest arch: {latest}")
+        print("Choose how to apply this delta:")
+        print("  1. Update the existing latest arch in-place")
+        print(
+            "  2. Create a NEW architecture from latest and apply "
+            "the delta there (promote to latest)"
+        )
+
+        while True:  # re-prompt until "1" or "2" leads to a return
+            choice = input("Enter choice (1 or 2): ").strip()
+            if choice == "1":
+                if dry_run:
+                    print(f"[DRY RUN] Would update latest arch {latest} from delta")
+                    return True
+                return update_latest_arch_from_delta(delta_file, latest, config)
+            if choice == "2":
+                new_arch_name = input(
+                    "Enter new architecture name (e.g., gfx955): "
+                ).strip()
+                if not new_arch_name:
+                    print("New architecture name cannot be empty.")
+                    continue  # back to the 1-or-2 prompt
+                if not prompt_yes_no(
+                    f"Promote {new_arch_name} to new latest architecture?"
+                ):
+                    print("Operation cancelled.")
+                    return False
+                if dry_run:
+                    print(
+                        "[DRY RUN] Would create "
+                        f"{new_arch_name} from {latest} and apply delta"
+                    )
+                    return True
+                return promote_new_arch_from_delta(
+                    latest, new_arch_name, delta_file, config
+                )
+            print("Invalid choice. Please enter 1 or 2.")
+    else:  # delta targets an arch other than the latest
+        if not prompt_yes_no(f"Apply delta to older arch ({arch_name}) in-place?"):
+            return False
+        if dry_run:
+            print(f"[DRY RUN] Would update older arch {arch_name} from delta")
+            return True
+        return update_older_arch_from_delta(delta_file, arch_name, config)
+
+
+def handle_direct_edits(
+    arch_name: str, modified_files: list[str], config: dict, dry_run: bool = False
+) -> bool:  # False if the user declines; otherwise the chosen handler's result
+    print(f"\n{'=' * 80}\nDIRECT EDITS DETECTED: {arch_name}\n{'=' * 80}")
+    print("Modified files:")
+    for f in modified_files:
+        print(f"   • {f}")
+
+    latest = (  # template's latest, else last entry of get_all_archs (or None)
+        get_latest_arch(config["paths"]["template"])
+        or (get_all_archs(config["paths"]["configs_root"]) or [None])[-1]
+    )
+
+    if arch_name == latest:
+        print(f"\nThis is the current latest architecture ({latest}).")
+        print("Are you:")
+        print("  1. Updating the existing latest arch")
+        print("  2. Creating a new architecture (this will become the new latest)")
+
+        while True:  # re-prompt until "1" or "2" leads to a return
+            choice = input("Enter choice (1 or 2): ").strip()
+            if choice == "1":
+                if dry_run:
+                    print(
+                        f"[DRY RUN] Would update latest arch {latest} from direct edits"
+                    )
+                    return True
+                return update_latest_arch_from_edits(arch_name, config)
+            if choice == "2":
+                new_arch_name = (
+                    input(
+                        "Enter new architecture name "
+                        f"(currently detected as {arch_name}): "
+                    ).strip()
+                    or arch_name  # NOTE(review): this default is the existing latest dir,
+                )  # so the promote step rejects it ("already exists") — confirm intent
+                if not prompt_yes_no(
+                    f"Promote {new_arch_name} to new latest architecture?"
+                ):
+                    print("Operation cancelled.")
+                    return False
+                if dry_run:
+                    print(
+                        "[DRY RUN] Would promote "
+                        f"{new_arch_name} from edits in {arch_name}"
+                    )
+                    return True
+                return promote_new_arch_from_latest_edits(
+                    arch_name, new_arch_name, config
+                )
+            print("Invalid choice. Please enter 1 or 2.")
+    else:  # edits are in an arch other than the latest
+        if not prompt_yes_no(
+            f"These are edits to older arch ({arch_name}). Continue (in-place)?"
+        ):
+            return False
+        if dry_run:
+            print(f"[DRY RUN] Would update older arch {arch_name} from direct edits")
+            return True
+        return update_older_arch_from_edits(arch_name, config)
+
+
+# =============================================================================
+# MAIN
+# =============================================================================
+
+
+def main() -> int:  # exit status: 0 = done or nothing to do, 1 = a handler failed
+    parser = argparse.ArgumentParser(
+        description="Master workflow for managing architecture configurations"
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Show what would be done without making changes",
+    )
+    args = parser.parse_args()
+
+    print("=" * 80)
+    print("ARCHITECTURE CONFIG WORKFLOW")
+    print("=" * 80)
+
+    config = load_config()
+
+    if args.dry_run:
+        print("\nDRY RUN MODE - No changes will be made\n")
+
+    changes = detect_changes(config)
+    has_changes = display_change_summary(changes)
+    if not has_changes:
+        return 0
+
+    latest_arch = (  # template's latest, else last entry of get_all_archs (or None)
+        get_latest_arch(config["paths"]["template"])
+        or (get_all_archs(config["paths"]["configs_root"]) or [None])[-1]
+    )
+    latest_has_edits = latest_arch in (changes.get("modified_archs") or {})
+
+    # New arch directories that appeared on disk (handled first; first failure exits)
+    for new_arch in changes.get("new_archs", []):
+        if not handle_new_arch(new_arch, config, args.dry_run):
+            return 1
+
+    # If latest was directly edited, prioritize resolving that path
+    # (user will choose in-place vs new arch); no delta file is processed in that case
+    if latest_has_edits:
+        if not handle_direct_edits(
+            latest_arch, changes["modified_archs"][latest_arch], config, args.dry_run
+        ):
+            return 1
+        print("\nNote: Delta files for older archs will be regenerated automatically.")
+        print("Skipping delta file processing for older architectures.\n")
+    else:
+        # Process delta files (reached only when latest has no direct edits)
+        for arch, delta_file in changes.get("delta_files", {}).items():
+            if not handle_delta_file(delta_file, arch, config, args.dry_run):
+                return 1
+
+    # Remaining direct edits (excluding latest if already processed)
+    for arch, files in (changes.get("modified_archs") or {}).items():
+        if arch == latest_arch and latest_has_edits:
+            continue
+        if arch in (changes.get("delta_files") or {}):
+            continue  # an arch with a delta file is never handled as a direct edit
+        if not handle_direct_edits(arch, files, config, args.dry_run):
+            return 1
+
+    if not args.dry_run:  # backups are pruned only after every handler succeeded
+        cleanup_old_backups(config["paths"]["backups"])
+        print("\n" + "=" * 80)
+        print("ALL OPERATIONS COMPLETED SUCCESSFULLY!")
+        print("=" * 80)
+    else:
+        print("\n" + "=" * 80)
+        print("DRY RUN COMPLETE")
+        print("=" * 80)
+
+    return 0
+
+
+if __name__ == "__main__":  # run as a script: main()'s value is the exit status
+    sys.exit(main())
diff --git a/projects/rocprofiler-compute/tools/config_management/metric_description_manager.py b/projects/rocprofiler-compute/tools/config_management/metric_description_manager.py
new file mode 100644
index 0000000000..6c197c89d5
--- /dev/null
+++ b/projects/rocprofiler-compute/tools/config_management/metric_description_manager.py
@@ -0,0 +1,515 @@
+#!/usr/bin/env python3
+##############################################################################
+# MIT License
+#
+# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+##############################################################################
+
+"""
+Metric description manager.
+Syncs metric descriptions between config YAMLs and documentation files.
+
+Usage:
+    python metric_description_manager.py --sync-arch <arch_name> <configs_dir>
+    python metric_description_manager.py --sync-all <configs_dir>
+    python metric_description_manager.py --validate <arch_name> <configs_dir>
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+from pathlib import Path
+from typing import Union
+
+import yaml
+
+try:
+    from . import utils as cm_utils
+except Exception:
+    repo_root = Path(__file__).resolve().parents[1]
+    if str(repo_root) not in sys.path:
+        sys.path.insert(0, str(repo_root))
+    try:
+        import config_management.utils as cm_utils  # type: ignore
+    except Exception:
+        import utils as cm_utils  # type: ignore
+
+AUTOGEN_TEXT = (
+    "# AUTOGENERATED FILE. Only edit for testing purposes, not for development. "
+    "Generated by tools/config_management/metric_description_manager.py\n"
+)  # passed to cm_utils.save_yaml with every file this module writes
+
+# Section title -> data-source table id; inverted below to find a section by table id
+SECTION_PANEL_MAP: dict[str, int] = {
+    "Wavefront launch stats": 701,
+    "Wavefront runtime stats": 702,
+    "Overall instruction mix": 1001,
+    "VALU arithmetic instruction mix": 1002,
+    "MFMA instruction mix": 1004,
+    "Compute Speed-of-Light": 1101,
+    "Pipeline statistics": 1102,
+    "Arithmetic operations": 1103,
+    "LDS Speed-of-Light": 1201,
+    "LDS Statistics": 1202,
+    "vL1D Speed-of-Light": 1601,
+    "Busy / stall metrics": 1501,
+    "Instruction counts": 1502,
+    "Spill / stack metrics": 1503,
+    "L1 Unified Translation Cache (UTCL1)": 1605,
+    "vL1D cache stall metrics": 1602,
+    "vL1D cache access metrics": 1603,
+    "Vector L1 data-return path or Texture Data (TD)": 1504,
+    "L2 Speed-of-Light": 1701,
+    "L2 cache accesses": 1703,
+    "L2-Fabric interface metrics": 1702,
+    "L2 - Fabric interface detailed metrics": 1706,
+    "L2 - Fabric Interface stalls": 1705,
+    "Scalar L1D Speed-of-Light": 1401,
+    "Scalar L1D cache accesses": 1402,
+    "Scalar L1D Cache - L2 Interface": 1403,
+    "L1I Speed-of-Light": 1301,
+    "L1I cache accesses": 1302,
+    "L1I <-> L2 interface": 1303,
+    "Workgroup manager utilizations": 601,
+    "Workgroup Manager - Resource Allocation": 602,
+    "Command processor fetcher (CPF)": 501,
+    "Command processor packet processor (CPC)": 502,
+    "System Speed-of-Light": 201,
+}
+
+PANEL_ID_TO_SECTION: dict[int, str] = {v: k for k, v in SECTION_PANEL_MAP.items()}
+
+
+def merge_docs_rst_as_default(descs: dict, docs_file: Path) -> dict:
+    """
+    For each metric whose 'rst' is missing or empty in the panel-derived descs,
+    copy 'rst' from the same section/metric of docs_file when it has one there.
+    Mutates descs in place and returns it; a missing docs_file changes nothing.
+    """
+    docs: dict = {}
+    if docs_file.exists():
+        with open(docs_file, "r", encoding="utf-8") as f:
+            docs = yaml.safe_load(f) or {}  # an empty YAML file loads as None
+
+    for section, metrics in descs.items():
+        docs_section = docs.get(section) or {}
+        for metric_name, d in metrics.items():
+            # Falsy 'rst' (absent, None or "") counts as "panel gave no rst"
+            if not d.get("rst"):
+                doc_entry = docs_section.get(metric_name) or {}
+                if doc_entry.get("rst"):
+                    d["rst"] = doc_entry["rst"]
+    return descs
+
+
+def merge_units_as_default(descs: dict, docs_file: Path, per_arch_file: Path) -> dict:
+    """
+    Fill 'unit' ONLY where descs has none (key absent or None), in place:
+      1) take it from the existing per-arch file if present there,
+      2) else from the docs file,
+      3) else leave the entry without a unit. Returns descs.
+    """
+    per_arch: dict = {}
+    if per_arch_file.exists():
+        with open(per_arch_file, "r", encoding="utf-8") as f:
+            per_arch = yaml.safe_load(f) or {}  # an empty YAML file loads as None
+
+    docs: dict = {}
+    if docs_file.exists():
+        with open(docs_file, "r", encoding="utf-8") as f:
+            docs = yaml.safe_load(f) or {}
+
+    for section, metrics in descs.items():
+        psec = per_arch.get(section) or {}
+        dsec = docs.get(section) or {}
+        for metric, data in metrics.items():
+            # Only fill if panel did NOT explicitly set unit
+            if "unit" not in data or data["unit"] is None:
+                unit = None
+                if metric in psec and isinstance(psec[metric], dict):
+                    unit = psec[metric].get("unit")  # per-arch file wins over docs
+                if unit is None and metric in dsec and isinstance(dsec[metric], dict):
+                    unit = dsec[metric].get("unit")
+                if unit is not None:
+                    data["unit"] = unit
+    return descs
+
+
+def panel_rst_override_keys(descs: dict) -> set:
+    """
+    Collect the (section, metric) pairs whose panel entry has a non-empty 'rst'.
+    Entries with a missing, empty or otherwise falsy 'rst' are left out.
+    """
+    return {
+        (section, metric_name)
+        for section, metrics in descs.items()
+        for metric_name, entry in metrics.items()
+        if entry.get("rst")
+    }
+
+
+def panel_unit_override_keys(descs: dict) -> set[tuple[str, str]]:
+    """Return the (section, metric) pairs whose entry has a 'unit' that is not None."""
+    return {
+        (section, metric)
+        for section, metrics in descs.items()
+        for metric, entry in metrics.items() if entry.get("unit") is not None
+    }
+
+
+def validate_rst_syntax(text: str) -> tuple[bool, str]:
+    """Run cheap backtick-balance checks on an RST snippet.
+
+    Returns ``(True, "")`` for empty text or when nothing is flagged, otherwise
+    ``(False, msg)`` with the problems joined by "; ". Flags an odd number of
+    backticks, and ``:ref:`` / ``:doc:`` roles that lack a closing backtick.
+    """
+    if not text:
+        return True, ""
+
+    errors: list[str] = []
+
+    # ``x`` literals add backticks in pairs, so parity alone decides this check
+    # (the former "after accounting for code literals" test always agreed with it).
+    if text.count("`") % 2 != 0:
+        errors.append("Unmatched single backticks")
+
+    for role in ("ref", "doc"):
+        opener = f":{role}:`"
+        first = text.find(opener)
+        if first == -1:
+            continue
+        # Each role needs its opening backtick plus one closing backtick. Counting
+        # from the first opener, fewer than two per role means one was left open
+        # (the old "openers > backticks" comparison could never be true).
+        if text[first:].count("`") < 2 * text.count(opener):
+            errors.append(f"Unclosed :{role}: directive")
+
+    if errors:
+        return False, "; ".join(errors)
+    return True, ""
+
+
+def extract_descriptions_from_arch(
+    arch_dir: Union[str, Path],
+) -> dict[str, dict[str, dict]]:
+    """
+    Collect 'metrics_description' entries from every *.yaml panel in arch_dir.
+    Returns {section: {metric: {'plain', 'rst'[, 'unit']}}}; unmapped -> 'General'.
+    """
+    arch_path = Path(arch_dir)
+    descriptions_by_section: dict[str, dict[str, dict]] = {}
+
+    for yaml_file in sorted(arch_path.glob("*.yaml")):
+        data = cm_utils.load_yaml(yaml_file)
+
+        panel_config = data.get("Panel Config")
+        if not isinstance(panel_config, dict):
+            continue  # not a panel file
+
+        panel_descriptions: dict = panel_config.get("metrics_description", {})
+
+        metrics_with_units: dict[str, dict[str, str]] = {}  # rebuilt for each file
+        for ds in panel_config.get("data source", []):
+            for key, value in ds.items():
+                if isinstance(value, dict) and "metric" in value:
+                    table_id = value.get("id")
+                    section_name = PANEL_ID_TO_SECTION.get(table_id)
+                    if not section_name:
+                        continue  # table id has no entry in SECTION_PANEL_MAP
+                    for metric_name, metric_data in value["metric"].items():
+                        unit = metric_data.get("unit")
+                        if unit:  # metrics without a unit get no section mapping
+                            metrics_with_units[metric_name] = {
+                                "section": section_name,
+                                "unit": unit,
+                            }
+
+        for metric_name, description in panel_descriptions.items():
+            section_name = (
+                metrics_with_units[metric_name]["section"]
+                if metric_name in metrics_with_units
+                else "General"
+            )
+
+            if isinstance(description, dict):
+                plain = description.get("plain", "")
+                rst = description.get("rst", "")
+                unit = description.get("unit", None)
+            else:  # non-dict description: the value itself is used as 'plain'
+                plain = description
+                rst = ""
+                unit = None
+
+            desc_data = {"plain": plain, "rst": rst}
+            if unit is not None:  # unit comes from the description, not the table
+                desc_data["unit"] = unit
+
+            descriptions_by_section.setdefault(section_name, {})
+            descriptions_by_section[section_name][metric_name] = desc_data
+
+    return descriptions_by_section
+
+
+def update_per_arch_metrics_file(
+    arch_name: str, descriptions: dict, output_dir: Union[str, Path]
+) -> None:
+    """Write <arch>_metrics_description.yaml: per-metric rst, plus unit when set."""
+    output_path = Path(output_dir) / f"{arch_name}_metrics_description.yaml"
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    rst_descriptions: dict[str, dict[str, dict]] = {}
+    for section, metrics in descriptions.items():
+        rst_descriptions[section] = {}
+        for metric_name, desc_data in metrics.items():
+            entry = {"rst": desc_data["rst"]}  # 'plain' is not written to this file
+            if "unit" in desc_data:
+                entry["unit"] = desc_data["unit"]
+            rst_descriptions[section][metric_name] = entry
+
+    cm_utils.save_yaml(rst_descriptions, output_path, AUTOGEN_TEXT)
+    print(f"Updated: {output_path}")
+
+
+def update_docs_metrics_file(
+    descriptions: dict,
+    docs_file: str,
+    panel_rst_overrides: set,
+    panel_unit_overrides: set,
+) -> bool:  # merges panel overrides into docs_file, rewrites it, always returns True
+    docs_path = Path(docs_file)
+    existing: dict = {}
+    if docs_path.exists():
+        with open(docs_path, "r", encoding="utf-8") as f:
+            existing = yaml.safe_load(f) or {}
+
+    for section, metrics in descriptions.items():
+        existing.setdefault(section, {})
+        for metric_name, desc_data in metrics.items():
+            existing[section].setdefault(metric_name, {})  # adds an entry even if empty
+            # Only overwrite rst if panel provided an explicit override
+            if (section, metric_name) in panel_rst_overrides and desc_data.get("rst"):
+                existing[section][metric_name]["rst"] = desc_data["rst"]
+            # Likewise, write unit only when the panel set it explicitly
+            if (section, metric_name) in panel_unit_overrides and "unit" in desc_data:
+                existing[section][metric_name]["unit"] = desc_data["unit"]
+
+    docs_path.parent.mkdir(parents=True, exist_ok=True)
+
+    cm_utils.save_yaml(existing, docs_path, AUTOGEN_TEXT)
+    return True
+
+
+def validate_descriptions(
+    arch_dir: Union[str, Path],
+) -> tuple[bool, list[str], list[str]]:
+    """Validate panel YAMLs in arch_dir; returns (no_errors, warnings, errors)."""
+    arch_path = Path(arch_dir)
+    warnings: list[str] = []  # metrics that have no description entry
+    errors: list[str] = []  # descriptions whose RST fails validate_rst_syntax
+
+    for yaml_file in sorted(arch_path.glob("*.yaml")):
+        with open(yaml_file, "r", encoding="utf-8") as f:  # not the locale default
+            data = yaml.safe_load(f) or {}
+
+        panel_config = data.get("Panel Config")
+        if not isinstance(panel_config, dict):
+            continue
+        # "or" instead of a .get default: a key left empty in YAML loads as None
+        panel_descriptions: dict = panel_config.get("metrics_description") or {}
+        all_metrics: set[str] = set()
+
+        for ds in panel_config.get("data source") or []:
+            for _, value in ds.items():
+                if isinstance(value, dict) and "metric" in value:
+                    all_metrics.update(value["metric"] or {})
+
+        missing = sorted(all_metrics - set(panel_descriptions.keys()))
+        if missing:
+            warnings.append(
+                f"{yaml_file.name}: Missing descriptions "
+                f"for metrics: {', '.join(missing)}"
+            )
+
+        for metric_name, description in panel_descriptions.items():
+            rst_text = (
+                description.get("rst", "")
+                if isinstance(description, dict)
+                else description
+            )
+            ok, err = validate_rst_syntax(rst_text)
+            if not ok:
+                errors.append(
+                    f"{yaml_file.name}: Metric '{metric_name}' has invalid RST: {err}"
+                )
+
+    return len(errors) == 0, warnings, errors
+
+
+def sync_arch(
+    arch_name: str,
+    configs_dir: str,
+    per_arch_metrics_dir: str,
+    docs_metrics_file: str,
+    is_latest: bool,
+) -> bool:
+    """Regenerate arch_name's per-arch file and, if is_latest, the docs file."""
+    arch_dir = Path(configs_dir) / arch_name
+    docs_file = Path(docs_metrics_file)
+    per_arch_file = Path(per_arch_metrics_dir) / f"{arch_name}_metrics_description.yaml"
+
+    if not arch_dir.is_dir():
+        print(f"Error: {arch_dir} is not a directory")
+        return False
+
+    print(f"Syncing descriptions for {arch_name}...")
+    is_valid, warnings, errors = validate_descriptions(arch_dir)  # NOTE(review): unused
+
+    # 1) Extract descriptions from panel YAMLs (source for 'plain', optional 'rst')
+    descriptions = extract_descriptions_from_arch(arch_dir)
+    if not descriptions:
+        print(f"No descriptions found in {arch_name}")
+        return True  # nothing to write counts as success
+
+    # 2) Capture which metrics had explicit panel RST/unit (BEFORE merging defaults)
+    panel_rst_overrides = panel_rst_override_keys(descriptions)
+    panel_unit_overrides = panel_unit_override_keys(descriptions)
+
+    # 3) Merge docs' RST as the default (unless panel overrides), then default units
+    descriptions = merge_docs_rst_as_default(descriptions, docs_file)
+    descriptions = merge_units_as_default(descriptions, docs_file, per_arch_file)
+
+    # 4) Write per-arch file (rst = panel override or docs default; unit if known)
+    update_per_arch_metrics_file(arch_name, descriptions, per_arch_metrics_dir)
+
+    # 5) Only when latest: update docs, but overwrite 'rst' only for overrides
+    if is_latest:
+        if not update_docs_metrics_file(
+            descriptions,
+            docs_metrics_file,
+            panel_rst_overrides,
+            panel_unit_overrides,
+        ):
+            return False
+
+    return True
+
+
+def main() -> int:  # exit status: 0 on success, 1 on failure or when no mode is given
+    parser = argparse.ArgumentParser(description="Manage metric descriptions")
+    parser.add_argument(
+        "--sync-arch",
+        metavar="ARCH",
+        help="Sync descriptions for specific architecture",
+    )
+    parser.add_argument(
+        "--sync-all",
+        action="store_true",
+        help="Sync descriptions for all architectures",
+    )
+    parser.add_argument(
+        "--validate",
+        metavar="ARCH",
+        help="Validate descriptions for specific architecture",
+    )
+    parser.add_argument(
+        "--latest-arch", help="Specify which arch is latest (for docs update)"
+    )
+    parser.add_argument("configs_dir", help="Path to analysis_configs directory")
+    parser.add_argument(
+        "--per-arch-output",
+        default="tools/per_arch_metric_definitions",
+        help="Output directory for per-arch files",
+    )
+    parser.add_argument(
+        "--docs-file",
+        default="docs/data/metrics_description.yaml",
+        help="Path to docs metrics description file",
+    )
+
+    args = parser.parse_args()
+
+    if args.sync_arch:  # docs file is updated only if --latest-arch names this arch
+        is_latest = (args.latest_arch == args.sync_arch) if args.latest_arch else False
+        ok = sync_arch(
+            args.sync_arch,
+            args.configs_dir,
+            args.per_arch_output,
+            args.docs_file,
+            is_latest,
+        )
+        return 0 if ok else 1
+
+    if args.sync_all:  # without --latest-arch, the last sorted dir counts as latest
+        configs_path = Path(args.configs_dir)
+        archs = sorted([  # gfx* directories, in plain string order
+            d.name
+            for d in configs_path.iterdir()
+            if d.is_dir() and d.name.startswith("gfx")
+        ])
+        if not archs:
+            print("No architecture directories found")
+            return 1
+        latest_arch = args.latest_arch if args.latest_arch else archs[-1]
+        for arch in archs:  # stops at the first arch that fails to sync
+            ok = sync_arch(
+                arch,
+                args.configs_dir,
+                args.per_arch_output,
+                args.docs_file,
+                arch == latest_arch,
+            )
+            if not ok:
+                return 1
+        return 0
+
+    if args.validate:  # report only; exit 1 when errors (not warnings) exist
+        arch_dir = Path(args.configs_dir) / args.validate
+        if not arch_dir.is_dir():
+            print(f"Error: {arch_dir} is not a directory")
+            return 1
+
+        is_valid, warnings, errors = validate_descriptions(arch_dir)
+        print(f"Validation results for {args.validate}:\n{'=' * 80}")
+
+        if warnings:
+            print("\nWarnings:")
+            for w in warnings:
+                print(f"   {w}")
+
+        if errors:
+            print("\nErrors:")
+            for e in errors:
+                print(f"   {e}")
+
+        if is_valid and not warnings:
+            print("\nAll validations passed")
+
+        return 0 if is_valid else 1
+
+    parser.print_help()  # none of --sync-arch / --sync-all / --validate was given
+    return 1
+
+
+if __name__ == "__main__":  # run as a script: main()'s value is the exit status
+    sys.exit(main())
diff --git a/projects/rocprofiler-compute/tools/config_management/parse_config_template.py b/projects/rocprofiler-compute/tools/config_management/parse_config_template.py
new file mode 100644
index 0000000000..15ca94699a
--- /dev/null
+++ b/projects/rocprofiler-compute/tools/config_management/parse_config_template.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+"""
+Parse panel configuration based on YAML files for an architecture.
+Usage:
+    python parse_config_template.py <dir_path> [output_file.yaml] [--latest-arch ARCH]
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+from pathlib import Path
+from typing import Any, Optional
+
+try:
+    from . import utils as cm_utils
+except Exception:
+    repo_root = Path(__file__).resolve().parents[1]
+    if str(repo_root) not in sys.path:
+        sys.path.insert(0, str(repo_root))
+    try:
+        import config_management.utils as cm_utils  # type: ignore
+    except Exception:
+        import utils as cm_utils  # type: ignore
+
+AUTOGEN_TEXT = (
+    "# AUTOGENERATED FILE. Only edit for testing purposes, not for development. "
+    "Generated by tools/config_management/parse_config_template.py\n"
+)
+
+
+def parse_panel_config(yaml_file: Path) -> Optional[dict]:
+    """Return panel and data-source info for one YAML file (None if not a panel)."""
+    data = cm_utils.load_yaml(yaml_file)
+    panel_config = data.get("Panel Config") if isinstance(data, dict) else None
+    if not isinstance(panel_config, dict):
+        return None  # list-shaped or panel-less documents are skipped, not fatal
+
+    filename = (  # text after the first "_" in the file name, if there is one
+        yaml_file.name.split("_", 1)[1] if "_" in yaml_file.name else yaml_file.name
+    )
+
+    panel_id = panel_config.get("id")
+    if panel_id and panel_id >= 100:
+        panel_id = panel_id // 100  # e.g. 1400 -> 14
+
+    data_sources = []
+    for ds in panel_config.get("data source") or []:  # tolerate an explicit null
+        for key, value in ds.items():
+            if isinstance(value, dict) and "id" in value and "title" in value:
+                data_sources.append({
+                    "type": key,
+                    "id": value["id"] % 100,  # keep only the last two digits
+                    "title": value["title"],
+                })
+
+    return {
+        "file": filename,
+        "panel_id": panel_id,
+        "panel_title": panel_config.get("title"),
+        "panel_alias": panel_config.get("alias"),
+        "data_sources": data_sources,
+    }
+
+
+def main() -> None:  # CLI entry: summarize panel YAMLs, optionally save as YAML
+    parser = argparse.ArgumentParser(
+        description="Parse panel configuration from YAML files"
+    )
+    parser.add_argument("directory", help="Directory containing YAML files")
+    parser.add_argument("output", nargs="?", help="Output YAML file (optional)")
+    parser.add_argument(
+        "--latest-arch",
+        help="Specify this architecture as latest (adds metadata to output)",
+    )
+    args = parser.parse_args()
+
+    directory = Path(args.directory)
+    if not directory.is_dir():
+        print(f"Error: '{args.directory}' is not a valid directory")
+        sys.exit(1)
+
+    results = []  # one summary dict per file that parse_panel_config accepts
+    for yaml_file in sorted(directory.glob("*.yaml")):  # top level only, by name
+        parsed = parse_panel_config(yaml_file)
+        if parsed:
+            results.append(parsed)
+
+    if not results:
+        print("No valid panel configurations found.")
+        sys.exit(1)
+
+    for panel in results:  # human-readable report on stdout
+        print(f"\n{'=' * 80}")
+        print(f"File: {panel['file']}")
+        print(f"Panel ID: {panel['panel_id']}")
+        print(f"Panel Title: {panel['panel_title']}")
+        if panel.get("panel_alias"):
+            print(f"Panel Alias: {panel['panel_alias']}")
+        print(f"\nData Sources ({len(panel['data_sources'])}):")
+        for ds in panel["data_sources"]:
+            print(f"  - {ds['type']}: {ds['id']} - {ds['title']}")
+
+    if args.output:
+        output_data: Any = results  # default shape: a bare list of panel summaries
+        if args.latest_arch:  # wrap the list so the arch name is stored with it
+            output_data = {"latest_arch": args.latest_arch, "panels": results}
+        cm_utils.save_yaml(output_data, args.output, AUTOGEN_TEXT)
+        print(f"\nResults saved to: {args.output}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/projects/rocprofiler-compute/tools/config_management/tests/test_config_workflow.py b/projects/rocprofiler-compute/tools/config_management/tests/test_config_workflow.py
new file mode 100644
index 0000000000..d7caf08f59
--- /dev/null
+++ b/projects/rocprofiler-compute/tools/config_management/tests/test_config_workflow.py
@@ -0,0 +1,417 @@
+##############################################################################
+# MIT License
+#
+# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+##############################################################################
+
+import shutil
+import subprocess
+import sys
+import tempfile
+import unittest
+from pathlib import Path
+from unittest.mock import patch
+
+import yaml
+
+REPO_ROOT = Path(__file__).resolve().parents[1]
+if str(REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(REPO_ROOT))
+
+import master_config_workflow_script as mws  # noqa
+
+
+def build_panel_dict(
+    panel_id: int, title: str, tables: tuple, descriptions: dict = None
+) -> dict:
+    """
+    Build a panel dict; tables is a tuple of (table_id, table_title, metrics_dict).
+    metrics_dict example (descriptions, if given, go under "metrics_description"):
+      {
+        "Metric A": {"avg": "AVG(A)", "min": "MIN(A)", "max": "MAX(A)", "unit": "pct"},
+        ...
+      }
+    """
+    data_sources = []
+    for tid, ttitle, metrics in tables:
+        data_sources.append({
+            "metric_table": {
+                "id": tid,
+                "title": ttitle,
+                "header": {
+                    "metric": "Metric",
+                    "avg": "Avg",
+                    "min": "Min",
+                    "max": "Max",
+                    "unit": "Unit",
+                },
+                "metric": metrics or {},  # a falsy metrics value becomes {}
+            }
+        })
+
+    panel = {
+        "Panel Config": {
+            "id": panel_id,
+            "title": title,
+            "data source": data_sources,
+        }
+    }
+    if descriptions:  # the key is omitted entirely when there are no descriptions
+        panel["Panel Config"]["metrics_description"] = descriptions
+    return panel
+
+
+def write_yaml(path: Path, obj: dict) -> None:  # test helper: dump obj to path
+    path.write_text(
+        yaml.safe_dump(obj, sort_keys=False, allow_unicode=True),
+        encoding="utf-8",  # allow_unicode output may be non-ASCII, so pin UTF-8
+    )
+
+
+class TestUserFlows(unittest.TestCase):
+    """
+    These tests assert the interactive user flows are wired correctly:
+      - When a new delta is detected (for latest), we ask 1) in-place vs 2) create-new
+      - When a YAML is edited (for latest), we ask 1) in-place vs 2) create-new
+      - For older-arch edits, we do NOT ask to create-new; only confirm in-place
+    We patch the master script's helpers to avoid running subprocesses.
+    """
+
+    @patch.object(mws, "validate_delta_structure", return_value=(True, ""))
+    @patch.object(mws, "update_latest_arch_from_delta", return_value=True)
+    @patch.object(mws, "promote_new_arch_from_delta", return_value=True)
+    @patch.object(mws, "get_latest_arch", return_value="gfx950")
+    @patch.object(mws, "prompt_yes_no", return_value=True)
+    def test_delta_on_latest_branching(
+        self, _yesno, _get_latest, promo_new_from_delta, update_latest_delta, _validate
+    ):
+        # choice 1: in-place
+        with patch("builtins.input", side_effect=["1"]):
+            ok = mws.handle_delta_file(
+                "/tmp/fake.yaml",
+                "gfx950",
+                {"paths": {"template": "T", "configs_root": "C"}},
+                dry_run=False,
+            )
+            self.assertTrue(ok)  # unittest assertion: not stripped under "python -O"
+            update_latest_delta.assert_called_once()
+            promo_new_from_delta.assert_not_called()
+
+        update_latest_delta.reset_mock()
+        promo_new_from_delta.reset_mock()
+
+        # choice 2: create new arch
+        with patch("builtins.input", side_effect=["2", "gfx955"]):
+            ok = mws.handle_delta_file(
+                "/tmp/fake.yaml",
+                "gfx950",
+                {"paths": {"template": "T", "configs_root": "C"}},
+                dry_run=False,
+            )
+            self.assertTrue(ok)
+            promo_new_from_delta.assert_called_once_with(
+                "gfx950",
+                "gfx955",
+                "/tmp/fake.yaml",
+                {"paths": {"template": "T", "configs_root": "C"}},
+            )
+            update_latest_delta.assert_not_called()
+
+    @patch.object(mws, "update_latest_arch_from_edits", return_value=True)
+    @patch.object(mws, "promote_new_arch_from_latest_edits", return_value=True)
+    @patch.object(mws, "get_latest_arch", return_value="gfx950")
+    @patch.object(mws, "prompt_yes_no", return_value=True)
+    def test_direct_edits_on_latest_branching(
+        self, _yesno, _get_latest, promo_from_edits, update_latest_edits
+    ):
+        # choice 1: in-place update latest
+        with patch("builtins.input", side_effect=["1"]):
+            ok = mws.handle_direct_edits(
+                "gfx950",
+                ["file.yaml"],
+                {"paths": {"template": "T", "configs_root": "C"}},
+                dry_run=False,
+            )
+            self.assertTrue(ok)
+            update_latest_edits.assert_called_once_with(
+                "gfx950", {"paths": {"template": "T", "configs_root": "C"}}
+            )
+            promo_from_edits.assert_not_called()
+
+        update_latest_edits.reset_mock()
+        promo_from_edits.reset_mock()
+
+        # choice 2: promote a new arch from edits
+        with patch("builtins.input", side_effect=["2", "gfx955"]):
+            ok = mws.handle_direct_edits(
+                "gfx950",
+                ["file.yaml"],
+                {"paths": {"template": "T", "configs_root": "C"}},
+                dry_run=False,
+            )
+            self.assertTrue(ok)
+            promo_from_edits.assert_called_once_with(
+                "gfx950", "gfx955", {"paths": {"template": "T", "configs_root": "C"}}
+            )
+            update_latest_edits.assert_not_called()
+
+    @patch.object(mws, "get_latest_arch", return_value="gfx950")
+    @patch.object(mws, "prompt_yes_no", return_value=True)
+    def test_edits_on_older_arch_no_create_new_prompt(self, _yesno, _get_latest):
+        # For older arch (e.g., gfx940), we should NOT prompt for 1/2 input branch.
+        with (
+            patch("builtins.input") as mock_input,
+            patch.object(mws, "update_older_arch_from_edits", return_value=True) as upd,
+        ):
+            ok = mws.handle_direct_edits(
+                "gfx940",
+                ["file.yaml"],
+                {"paths": {"template": "T", "configs_root": "C"}},
+                dry_run=False,
+            )
+            self.assertTrue(ok)
+            upd.assert_called_once()
+            mock_input.assert_not_called()
+
+    @patch.object(mws, "validate_delta_structure", return_value=(True, ""))
+    @patch.object(mws, "get_latest_arch", return_value="gfx950")
+    @patch.object(mws, "prompt_yes_no", return_value=True)
+    def test_delta_on_older_arch_in_place_only(self, _yesno, _get_latest, _valid):
+        with (
+            patch("builtins.input") as mock_input,
+            patch.object(mws, "update_older_arch_from_delta", return_value=True) as upd,
+        ):
+            ok = mws.handle_delta_file(
+                "/tmp/old_delta.yaml",
+                "gfx940",
+                {"paths": {"template": "T", "configs_root": "C"}},
+                dry_run=False,
+            )
+            self.assertTrue(ok)
+            upd.assert_called_once()
+            mock_input.assert_not_called()
+
+
+class TestDeltaAndEditsSemantics(unittest.TestCase):
+    """
+    End-to-end tests for:
+      - generating a delta (add/del/mod of metrics + descriptions) and applying it
+      - detecting edits (add/del/mod) via the delta generator
+    These use the generate_config_deltas.py and apply_config_deltas.py scripts directly
+    with a temporary file layout.
+    """
+
+    def setUp(self):
+        self.tmpdir = Path(tempfile.mkdtemp(prefix="cfgwf_"))
+        # Create minimal directory layout
+        self.configs_root = (
+            self.tmpdir / "src" / "rocprof_compute_soc" / "analysis_configs"
+        )
+        self.configs_root.mkdir(parents=True, exist_ok=True)
+
+        self.gfx_prev = self.configs_root / "gfx950"
+        self.gfx_curr = self.configs_root / "gfx955"
+        self.gfx_prev.mkdir()
+        self.gfx_curr.mkdir()
+
+        # One shared yaml filename in both dirs (panel 1400)
+        self.file_name = "1400_scalar_l1_data_cache.yaml"
+
+        # Previous (baseline) YAML: has metric A and description
+        prev_obj = build_panel_dict(
+            1401,
+            "Scalar L1D Speed-of-Light",
+            tables=(
+                (
+                    1401,
+                    "Scalar L1D SoL Table",
+                    {
+                        "Metric A": {
+                            "avg": "AVG(A)",
+                            "min": "MIN(A)",
+                            "max": "MAX(A)",
+                            "unit": "pct",
+                        },
+                    },
+                ),
+            ),
+            descriptions={"Metric A": {"plain": "A plain", "rst": "A rst"}},
+        )
+        write_yaml(self.gfx_prev / self.file_name, prev_obj)
+
+        curr_obj = build_panel_dict(
+            1401,
+            "Scalar L1D Speed-of-Light",
+            tables=(
+                (
+                    1401,
+                    "Scalar L1D SoL Table",
+                    {
+                        "Metric A": {
+                            "avg": "AVG(A_new)",
+                            "min": "MIN(A)",
+                            "max": "MAX(A)",
+                            "unit": "pct",
+                        },  # MOD
+                        "Metric B": {
+                            "avg": "AVG(B)",
+                            "min": "MIN(B)",
+                            "max": "MAX(B)",
+                            "unit": "cycles",
+                        },  # ADD
+                    },
+                ),
+            ),
+            descriptions={
+                "Metric A": {"plain": "A plain (new)", "rst": "A rst (new)"},  # MOD
+                "Metric B": {"plain": "B plain", "rst": "B rst"},  # ADD
+            },
+        )
+        write_yaml(self.gfx_curr / self.file_name, curr_obj)
+
+    def tearDown(self):
+        shutil.rmtree(self.tmpdir, ignore_errors=True)
+
+    def test_generate_delta_and_apply_roundtrip(self):
+        """
+        1) Generate delta: curr (gfx955) vs prev (gfx950) -> stored in prev/config_delta
+        2) Apply delta onto a copy of prev -> must equal curr
+        """
+        # Run generator
+        # (call the script's main via subprocess to mimic actual behavior)
+        cmd = [
+            sys.executable,
+            str(REPO_ROOT / "generate_config_deltas.py"),
+            str(self.gfx_curr),
+            str(self.gfx_prev),
+        ]
+        res = subprocess.run(cmd, capture_output=True, text=True)
+        self.assertEqual(res.returncode, 0, msg=res.stderr)
+
+        # Find delta file
+        delta_dir = self.gfx_prev / "config_delta"
+        self.assertTrue(delta_dir.is_dir(), "config_delta directory not created")
+        deltas = sorted(delta_dir.glob("*_diff.yaml"))
+        # Check the glob result before indexing it, so that a missing delta is
+        # reported as "No delta file created" instead of an IndexError.
+        self.assertTrue(deltas, "No delta file created")
+        delta_file = deltas[-1]
+
+        # The generated delta must carry the modified "avg" expression.
+        delta_text = delta_file.read_text(encoding="utf-8")
+        self.assertIn(
+            "AVG(A_new)",
+            delta_text,
+            f"Delta is missing the expected modification:\n{delta_text}",
+        )
+
+        # Apply delta to a clone of prev -> expect to match curr
+        out_clone = self.tmpdir / "out_clone"
+        out_clone.mkdir()
+        # Use apply_config_deltas.py exactly as workflow does
+        cmd2 = [
+            sys.executable,
+            str(REPO_ROOT / "apply_config_deltas.py"),
+            str(self.gfx_prev),
+            str(delta_file),
+            str(out_clone),
+        ]
+        res2 = subprocess.run(cmd2, capture_output=True, text=True)
+        self.assertEqual(res2.returncode, 0, msg=res2.stderr)
+
+        # Compare resulting YAML with curr YAML
+        produced = (out_clone / self.file_name).read_text(encoding="utf-8")
+        expected = (self.gfx_curr / self.file_name).read_text(encoding="utf-8")
+
+        self.assertEqual(yaml.safe_load(produced), yaml.safe_load(expected))
+
+    def test_delta_semantics_add_del_mod(self):
+        """
+        Read the generated delta and ensure categories capture:
+          - Modification of Metric A (avg + description)
+          - Addition of Metric B (metric + description)
+        """
+        # Generate delta
+        cmd = [
+            sys.executable,
+            str(REPO_ROOT / "generate_config_deltas.py"),
+            str(self.gfx_curr),
+            str(self.gfx_prev),
+        ]
+        res = subprocess.run(cmd, capture_output=True, text=True)
+        self.assertEqual(res.returncode, 0, msg=res.stderr)
+
+        delta_dir = self.gfx_prev / "config_delta"
+        delta_file = sorted(delta_dir.glob("*_diff.yaml"))[-1]
+        delta_text = delta_file.read_text(encoding="utf-8")
+
+        # Basic sanity: categories present
+        self.assertIn("Addition:", delta_text)
+        self.assertIn("Deletion:", delta_text)
+        self.assertIn("Modification:", delta_text)
+
+        # Additions should include Metric B and its description
+        self.assertIn("Metric B:", delta_text)
+        self.assertIn("B plain", delta_text)
+        self.assertIn("B rst", delta_text)
+
+        # Modifications should include Metric A changes
+        self.assertIn("Metric A:", delta_text)
+        self.assertIn("AVG(A_new)", delta_text)
+        self.assertIn("A plain (new)", delta_text)
+        self.assertIn("A rst (new)", delta_text)
+
+        # No deletions expected for this setup
+        # (still check Deletion section exists but may render as [])
+        # Ensure there is no "Metric C" ghost, etc.
+        self.assertNotIn("Metric C:", delta_text)
+
+    def test_edit_detection_via_delta_generator(self):
+        """
+        Using the same prev/curr pair, ensure the generator correctly
+        identifies additions and modifications (no deletions in this case).
+        (This stands in for 'edited existing config yaml'.)
+        """
+        # Generate delta from prev -> curr (same as edits applied)
+        cmd = [
+            sys.executable,
+            str(REPO_ROOT / "generate_config_deltas.py"),
+            str(self.gfx_curr),
+            str(self.gfx_prev),
+        ]
+        res = subprocess.run(cmd, capture_output=True, text=True)
+        self.assertEqual(res.returncode, 0, msg=res.stderr)
+
+        # Validate expected markers
+        delta_dir = self.gfx_prev / "config_delta"
+        delta_file = sorted(delta_dir.glob("*_diff.yaml"))[-1]
+        txt = delta_file.read_text(encoding="utf-8")
+
+        # Additions: Metric B, Descriptions for B
+        self.assertIn("Metric B:", txt)
+        self.assertIn("B plain", txt)
+        self.assertIn("B rst", txt)
+
+        # Modifications: Metric A avg and descriptions
+        self.assertIn("AVG(A_new)", txt)
+        self.assertIn("A plain (new)", txt)
+        self.assertIn("A rst (new)", txt)
diff --git a/projects/rocprofiler-compute/tools/config_management/utils.py b/projects/rocprofiler-compute/tools/config_management/utils.py
new file mode 100644
index 0000000000..0af6e5aceb
--- /dev/null
+++ b/projects/rocprofiler-compute/tools/config_management/utils.py
@@ -0,0 +1,52 @@
+##############################################################################
+# MIT License
+#
+# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+##############################################################################
+
+from pathlib import Path
+from typing import Optional, Union
+
+import yaml
+
+
+def str_representer(dumper, data):  # PyYAML representer for str values
+    if "\n" in data:  # multi-line text is requested in literal block style ("|")
+        return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
+    return dumper.represent_scalar("tag:yaml.org,2002:str", data)
+
+
+yaml.add_representer(str, str_representer)  # import-time hook for yaml.dump()
+
+
+def load_yaml(filepath: Union[str, Path]) -> dict:  # empty document -> {}
+    with open(filepath, encoding="utf-8") as f:  # do not depend on the locale
+        return yaml.safe_load(f) or {}
+
+
+def save_yaml(
+    data: dict, filepath: Union[str, Path], header: Optional[str] = None
+) -> None:
+    with open(filepath, "w", encoding="utf-8") as f:  # explicit, not locale-based
+        if header:
+            f.write(header)  # written verbatim; callers supply the trailing newline
+        yaml.dump(data, f, sort_keys=False)  # keep the caller's key order
diff --git a/projects/rocprofiler-compute/tools/config_management/verify_against_config_template.py b/projects/rocprofiler-compute/tools/config_management/verify_against_config_template.py
new file mode 100644
index 0000000000..32b9044edb
--- /dev/null
+++ b/projects/rocprofiler-compute/tools/config_management/verify_against_config_template.py
@@ -0,0 +1,227 @@
+#!/usr/bin/env python3
+##############################################################################
+# MIT License
+#
+# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+##############################################################################
+"""
+Validate panel YAML files against base template ordering.
+Checks that panel configs match expected structure, IDs, titles, and data source order.
+
+Usage:
+    python verify_against_config_template.py <analysis_configs_dir> <template_yaml>
+"""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+from typing import Optional
+
+import yaml
+
+
+def normalize_panel_id(panel_id: int) -> int:
+    """Return panel_id // 100 for IDs >= 100; smaller or falsy IDs pass through."""
+    return panel_id // 100 if panel_id and panel_id >= 100 else panel_id
+
+
+def normalize_table_id(table_id: int) -> Optional[int]:
+    """Return table_id % 100, or None when table_id is falsy (None or 0)."""
+    return table_id % 100 if table_id else None
+
+
+def load_template(template_file: Path) -> dict[int, dict]:
+    """Load template (mapping with "panels", or a bare list) keyed by panel ID."""
+    with open(template_file, encoding="utf-8") as f:
+        data = yaml.safe_load(f) or {}
+
+    panels = data if isinstance(data, list) else (data.get("panels") or [])
+    lookup: dict[int, dict] = {}
+    for panel in panels:
+        pid = normalize_panel_id(panel["panel_id"])
+        lookup[pid] = {
+            "panel_title": panel["panel_title"],
+            "panel_alias": panel.get("panel_alias"),
+            "data_sources": [
+                {"type": ds["type"], "id": ds["id"], "title": ds["title"]}
+                for ds in panel.get("data_sources", [])
+            ],
+        }
+    return lookup
+
+
+def extract_panel_info(yaml_file: Path) -> Optional[dict]:
+    """Return normalized panel id, title and data sources, or None if not a panel."""
+    with open(yaml_file, encoding="utf-8") as f:
+        data = yaml.safe_load(f) or {}
+    panel_config = data.get("Panel Config") if isinstance(data, dict) else None
+    if not isinstance(panel_config, dict):
+        return None  # missing or non-mapping "Panel Config": not a usable panel
+
+    data_sources = []
+    for ds in panel_config.get("data source") or []:  # tolerate an explicit null
+        for key, value in ds.items():
+            if isinstance(value, dict) and "id" in value and "title" in value:
+                data_sources.append({
+                    "type": key,
+                    "id": normalize_table_id(value["id"]),
+                    "title": value["title"],
+                })
+
+    return {
+        "panel_id": normalize_panel_id(panel_config.get("id")),
+        "panel_title": panel_config.get("title"),
+        "data_sources": data_sources,
+    }
+
+
+def validate_panel(
+    yaml_file: Path, panel_info: dict, template: dict[int, dict], stats: dict
+) -> None:
+    """Compare one panel's title and data sources to the template; update stats."""
+    panel_id = panel_info["panel_id"]
+    file_path = f"{yaml_file.parent.name}/{yaml_file.name}"
+
+    if panel_id not in template:
+        print(f"WARNING [{file_path}]: Panel ID {panel_id} not found in template")
+        stats["warnings"] += 1
+        return  # file is counted in neither passed_files nor failed_files
+
+    expected = template[panel_id]
+    errors: list[str] = []
+    warnings: list[str] = []
+
+    if panel_info["panel_title"] != expected["panel_title"]:
+        errors.append(
+            f"Panel title mismatch: expected '{expected['panel_title']}', "
+            f"got '{panel_info['panel_title']}'"
+        )
+
+    if len(panel_info["data_sources"]) != len(expected["data_sources"]):
+        errors.append(
+            f"Data source count mismatch: expected {len(expected['data_sources'])}, "
+            f"got {len(panel_info['data_sources'])}"
+        )
+
+    for i, actual_ds in enumerate(panel_info["data_sources"]):
+        matching_idx = next(  # first template entry equal on id, title and type
+            (
+                j
+                for j, exp_ds in enumerate(expected["data_sources"])
+                if actual_ds["id"] == exp_ds["id"]
+                and actual_ds["title"] == exp_ds["title"]
+                and actual_ds["type"] == exp_ds["type"]
+            ),
+            None,
+        )
+        if matching_idx is None:
+            errors.append(
+                f"Data source {i + 1}: No matching entry in template for "
+                f"{actual_ds['type']} id={actual_ds['id']} title='{actual_ds['title']}'"
+            )
+        elif matching_idx != i:  # present in the template, but at another position
+            warnings.append(
+                f"Data source {i + 1}: Order mismatch - appears at position {i + 1} "
+                f"but expected at position {matching_idx + 1}"
+            )
+
+    if errors:
+        print(f"ERROR [{file_path}]:")
+        for error in errors:
+            print(f"  - {error}")
+        stats["errors"] += len(errors)
+        stats["failed_files"] += 1
+    elif warnings:  # NOTE: warnings are only shown/counted when there are no errors
+        print(f"WARNING [{file_path}]:")
+        for warning in warnings:
+            print(f"  - {warning}")
+        stats["warnings"] += len(warnings)
+        stats["passed_files"] += 1
+    else:
+        print(f"PASS [{file_path}]")
+        stats["passed_files"] += 1
+
+
+def main() -> None:  # CLI entry: validate every <arch>/*.yaml against the template
+    if len(sys.argv) != 3:
+        print(
+            "Usage: python verify_against_config_template.py "
+            "<analysis_configs_dir> <template_yaml>"
+        )
+        sys.exit(1)
+
+    configs_dir = Path(sys.argv[1])
+    template_file = Path(sys.argv[2])
+
+    if not configs_dir.is_dir():
+        print(f"Error: {configs_dir} is not a directory")
+        sys.exit(1)
+    if not template_file.is_file():
+        print(f"Error: {template_file} is not a file")
+        sys.exit(1)
+
+    print(f"Loading template from {template_file}")
+    template = load_template(template_file)
+    print(f"Template loaded: {len(template)} panels\n")
+
+    stats = {  # running totals; validate_panel() updates this dict in place
+        "total_files": 0,
+        "passed_files": 0,
+        "failed_files": 0,
+        "errors": 0,
+        "warnings": 0,
+    }
+
+    for arch_dir in sorted(configs_dir.iterdir()):  # every subdirectory is checked
+        if not arch_dir.is_dir():
+            continue
+        print(f"{'=' * 80}\nValidating architecture: {arch_dir.name}\n{'=' * 80}")
+        for yaml_file in sorted(arch_dir.glob("*.yaml")):  # non-recursive
+            stats["total_files"] += 1
+            panel_info = extract_panel_info(yaml_file)
+            if panel_info:
+                validate_panel(yaml_file, panel_info, template, stats)
+            else:
+                print(f"ERROR [{arch_dir.name}/{yaml_file.name}]: Invalid panel config")
+                stats["errors"] += 1
+                stats["failed_files"] += 1
+        print()
+
+    print(f"{'=' * 80}\nVALIDATION SUMMARY\n{'=' * 80}")
+    print(f"Total files checked: {stats['total_files']}")
+    print(f"Passed: {stats['passed_files']}")
+    print(f"Failed: {stats['failed_files']}")
+    print(f"Total errors: {stats['errors']}")
+    print(f"Total warnings: {stats['warnings']}")
+
+    if stats["failed_files"] > 0:  # only failed files change the exit status
+        print("\nValidation FAILED")
+        sys.exit(1)
+    elif stats["warnings"] > 0:
+        print("\nValidation PASSED with warnings")
+    else:
+        print("\nValidation PASSED")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx908_metrics_description.yaml b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx908_metrics_description.yaml
new file mode 100644
index 0000000000..0fd8a4b262
--- /dev/null
+++ b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx908_metrics_description.yaml
@@ -0,0 +1,1791 @@
+System Speed-of-Light:
+  VALU FLOPs:
+    rst: |-
+      The total floating-point operations executed per second on the :ref:`VALU
+      <desc-valu>`. This is also presented as a percent of the peak theoretical
+      FLOPs achievable on the specific accelerator. Note: this does not include
+      any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GFLOPs
+  VALU IOPs:
+    rst: |-
+      The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
+      This is also presented as a percent of the peak theoretical IOPs achievable
+      on the specific accelerator. Note: this does not include any integer operations
+      from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GIOPs
+  MFMA FLOPs (BF16):
+    rst: |-
+      The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
+      operations executed per second. Note: this does not include any 16-bit brain
+      floating point operations from :ref:`VALU <desc-valu>` instructions. This
+      is also presented as a percent of the peak theoretical BF16 MFMA operations
+      achievable on the specific accelerator.
+    unit: GFLOPs
+  MFMA FLOPs (F16):
+    rst: |-
+      The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 16-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F16 MFMA operations achievable on the
+      specific accelerator.
+    unit: GFLOPs
+  MFMA FLOPs (F32):
+    rst: |-
+      The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 32-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F32 MFMA operations achievable on the
+      specific accelerator.
+    unit: GFLOPs
+  MFMA FLOPs (F64):
+    rst: |-
+      The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 64-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F64 MFMA operations achievable on the
+      specific accelerator.
+    unit: GFLOPs
+  MFMA IOPs (Int8):
+    rst: |-
+      The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
+      per second. Note: this does not include any 8-bit integer operations from
+      :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
+      of the peak theoretical INT8 MFMA operations achievable on the specific accelerator.
+    unit: GIOPs
+  Active CUs:
+    rst: Total number of active compute units (CUs) on the accelerator during the
+      kernel execution.
+    unit: Number
+  SALU Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`SALU <desc-salu>`
+      was busy executing instructions. Computed as the ratio of the total number of
+      cycles spent by the :ref:`scheduler <desc-scheduler>` issuing SALU / :ref:`SMEM
+      <desc-smem>` instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VALU Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`VALU <desc-valu>`
+      was busy executing instructions. Does not include :ref:`VMEM <desc-vmem>` operations.
+      Computed as the ratio of the total number of cycles spent by the :ref:`scheduler
+      <desc-scheduler>` issuing VALU instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  MFMA Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`MFMA <desc-mfma>`
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles the :ref:`MFMA <desc-mfma>` unit was busy over the :ref:`total
+      CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VMEM Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`VMEM <desc-vmem>`
+      unit was busy executing instructions, including both global/generic and spill/scratch
+      operations (see the :ref:`VMEM instruction count metrics <ta-instruction-counts>`
+      for more detail). Does not include :ref:`VALU <desc-valu>` operations. Computed
+      as the ratio of the total number of cycles spent by the :ref:`scheduler <desc-scheduler>`
+      issuing VMEM instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  Branch Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`branch <desc-branch>`
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles spent by the :ref:`scheduler <desc-scheduler>` issuing branch instructions
+      over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VALU Active Threads:
+    rst: Indicates the average level of :ref:`divergence <desc-divergence>` within
+      a wavefront over the lifetime of the kernel. The number of work-items that were
+      active in a wavefront during execution of each :ref:`VALU <desc-valu>` instruction,
+      time-averaged over all VALU instructions run on all wavefronts in the kernel.
+    unit: Work-items
+  IPC:
+    rst: The ratio of the total number of instructions executed on the :doc:`CU <compute-unit>`
+      over the :ref:`total active CU cycles <total-active-cu-cycles>`.
+    unit: Instructions per-cycle
+  Wavefront Occupancy:
+    rst: |-
+      The time-averaged number of wavefronts resident on the accelerator over
+      the lifetime of the kernel. Note: this metric may be inaccurate for short-running
+      kernels (less than 1ms). This is also presented as a percent of the peak theoretical
+      occupancy achievable on the specific accelerator.
+    unit: Wavefronts
+  Theoretical LDS Bandwidth:
+    rst: Indicates the maximum amount of bytes that could have been loaded from, stored
+      to, or atomically updated in the LDS per unit time (see :ref:`LDS Bandwidth
+      <lds-bandwidth>` example for more detail). This is also presented as a percent
+      of the peak theoretical bandwidth achievable on the specific accelerator.
+    unit: GB/s
+  LDS Bank Conflicts/Access:
+    rst: The ratio of the number of cycles spent in the :doc:`LDS scheduler <local-data-share>`
+      due to bank conflicts (as determined by the conflict resolution hardware) to
+      the base number of cycles that would be spent in the LDS scheduler in a completely
+      uncontended case. This is also presented in normalized form (i.e., the Bank
+      Conflict Rate).
+    unit: Conflicts/Access
+  vL1D Cache Hit Rate:
+    rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache
+      over the total number of cache line requests to the :ref:`vL1D cache RAM <desc-tc>`.
+    unit: Percent
+  vL1D Cache BW:
+    rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
+      <desc-vmem>` instructions per unit time. The number of bytes is calculated as
+      the number of cache lines requested multiplied by the cache line size. This
+      value does not consider partial requests, so e.g., if only a single value is
+      requested in a cache line, the data movement will still be counted as a full
+      cache line. This is also presented as a percent of the peak theoretical bandwidth
+      achievable on the specific accelerator.
+    unit: GB/s
+  L2 Cache Hit Rate:
+    rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
+      over the total number of incoming cache line requests to the L2 cache.
+    unit: Percent
+  L2 Cache BW:
+    rst: The number of bytes looked up in the L2 cache per unit time. The number of
+      bytes is calculated as the number of cache lines requested multiplied by the
+      cache line size. This value does not consider partial requests, so e.g., if
+      only a single value is requested in a cache line, the data movement will still
+      be counted as a full cache line. This is also presented as a percent of the
+      peak theoretical bandwidth achievable on the specific accelerator.
+    unit: GB/s
+  L2-Fabric Read BW:
+    rst: |-
+      The number of bytes read by the L2 over the :ref:`Infinity Fabric™
+      interface <l2-fabric>` per unit time. This is also presented as a percent
+      of the peak theoretical bandwidth achievable on the specific accelerator.
+    unit: GB/s
+  L2-Fabric Write BW:
+    rst: The number of bytes sent by the L2 over the :ref:`Infinity Fabric interface
+      <l2-fabric>` by write and atomic operations per unit time. This is also presented
+      as a percent of the peak theoretical bandwidth achievable on the specific accelerator.
+    unit: GB/s
+  L2-Fabric Read Latency:
+    rst: The time-averaged number of cycles read requests spent in Infinity Fabric
+      before data was returned to the L2.
+    unit: Cycles
+  L2-Fabric Write Latency:
+    rst: The time-averaged number of cycles write requests spent in Infinity Fabric
+      before a completion acknowledgement was returned to the L2.
+    unit: Cycles
+  sL1D Cache Hit Rate:
+    rst: The percent of sL1D requests that hit on a previously loaded line in the cache.
+      Calculated as the ratio of the number of sL1D requests that hit over the number
+      of all sL1D requests.
+    unit: Percent
+  sL1D Cache BW:
+    rst: The number of bytes looked up in the sL1D cache per unit time. This is also
+      presented as a percent of the peak theoretical bandwidth achievable on the specific
+      accelerator.
+    unit: GB/s
+  L1I Hit Rate:
+    rst: The percent of L1I requests that hit on a previously loaded line in the cache.
+      Calculated as the ratio of the number of L1I requests that hit over the number
+      of all L1I requests.
+    unit: Percent
+  L1I BW:
+    rst: The number of bytes looked up in the L1I cache per unit time. This is also
+      presented as a percent of the peak theoretical bandwidth achievable on the specific
+      accelerator.
+    unit: GB/s
+  L1I Fetch Latency:
+    rst: The average number of cycles spent to fetch instructions to a :doc:`CU <compute-unit>`.
+    unit: Cycles
+Memory Chart:
+  Wavefront Occupancy:
+    rst: Wavefronts per active CU.
+    unit: Wavefronts
+  Wave Life:
+    rst: Average number of cycles executing a wave.
+    unit: Cycles per wave
+  SALU:
+    rst: Total Number of SALU (Scalar ALU) instructions issued per normalization unit.
+    unit: Instructions per normalization unit
+  SMEM:
+    rst: Total number of SMEM (Scalar Memory Read) instructions issued per normalization
+      unit.
+    unit: Instructions per normalization unit
+  VALU:
+    rst: The number of VALU (Vector ALU) instructions issued per normalization unit.
+    unit: Instructions per normalization unit
+  MFMA:
+    rst: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued per
+      normalization unit.
+    unit: Instructions per normalization unit
+  VMEM:
+    rst: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch
+      memory) per normalization unit.
+    unit: Instructions per normalization unit
+  LDS:
+    rst: The total number of LDS instructions (including, but not limited to, read/write/atomics
+      and HIP's __shfl instructions) executed per normalization unit.
+    unit: Instructions per normalization unit
+  GWS:
+    rst: Total number of GDS (global data sync) instructions issued per normalization
+      unit.
+    unit: Instructions per normalization unit
+  BR:
+    rst: Total number of BRANCH instructions issued per normalization unit.
+    unit: Instructions per normalization unit
+  Active CUs:
+    rst: Total number of active compute units (CUs) on the accelerator during the
+      kernel execution.
+    unit: CUs
+  Num CUs:
+    rst: Total number of compute units (CUs) on the accelerator.
+    unit: CUs
+  VGPR:
+    rst: |-
+      The number of architected vector general-purpose registers allocated for the
+      kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the
+      number of VGPRs requested by the compiler due to allocation granularity.
+    unit: VGPRs
+  SGPR:
+    rst: |-
+      The number of scalar general-purpose registers allocated for the kernel, see
+      :ref:`SALU <desc-salu>`. Note: this may not exactly match the number of
+      SGPRs requested by the compiler due to allocation granularity.
+    unit: SGPRs
+  LDS Allocation:
+    rst: |-
+      The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory)
+      allocated for this kernel. Note: This may also be larger than what was requested
+      at compile time due to both allocation granularity and dynamic per-dispatch
+      LDS allocations.
+    unit: Bytes per workgroup
+  Scratch Allocation:
+    rst: The number of bytes of :ref:`scratch memory <memory-spaces>` requested per
+      work-item for this kernel. Scratch memory is used for stack memory on the accelerator,
+      as well as for register spills and restores.
+    unit: Bytes per workgroup
+  Wavefronts:
+    rst: The total number of wavefronts, summed over all workgroups, forming this
+      kernel launch.
+    unit: Wavefronts
+  Workgroups:
+    rst: The total number of workgroups forming this kernel launch.
+    unit: Workgroups
+  LDS Req:
+    rst: The total number of LDS instructions (including, but not limited to, read/write/atomics
+      and HIP's ``__shfl`` instructions) executed per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  LDS Util:
+    rst: Indicates what percent of the kernel's duration the :ref:`LDS <desc-lds>`
+      was actively executing instructions (including, but not limited to, load, store,
+      atomic and HIP's ``__shfl`` operations). Calculated as the ratio of the total
+      number of cycles LDS was active over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  LDS Latency:
+    rst: The average number of round-trip cycles (i.e., from issue to data-return
+      / acknowledgment) required for an LDS instruction to complete.
+    unit: Cycles
+  VL1 Rd:
+    rst: The total number of incoming read requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  VL1 Wr:
+    rst: The total number of incoming write requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  VL1 Atomic:
+    rst: The total number of incoming atomic requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  VL1 Hit:
+    rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache
+      over the total number of cache line requests to the :ref:`vL1D Cache RAM <desc-tc>`.
+    unit: Percent
+  VL1 Lat:
+    rst: Calculated as the average number of cycles that a vL1D cache line request
+      spent in the vL1D cache pipeline.
+    unit: Cycles
+  VL1 Coalesce:
+    rst: Indicates how well memory instructions were coalesced by the :ref:`address
+      processing unit <desc-ta>`, ranging from uncoalesced (25%) to fully coalesced
+      (100%). Calculated as the average number of :ref:`thread-requests <thread-requests>`
+      generated per instruction divided by the ideal number of thread-requests per
+      instruction.
+    unit: Percent
+  VL1 Stall:
+    rst: The ratio of the number of cycles where the vL1D is stalled waiting to issue
+      a request for data to the :doc:`L2 cache <l2-cache>` divided by the number of
+      cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  VL1_L2 Rd:
+    rst: The number of read requests for a vL1D cache line that were not satisfied
+      by the vL1D and must be retrieved from the :doc:`L2 Cache <l2-cache>`
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  VL1_L2 Wr:
+    rst: The number of write requests to a vL1D cache line that were sent through
+      the vL1D to the :doc:`L2 cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  VL1_L2 Atomic:
+    rst: The number of atomic requests that are sent through the vL1D to the :doc:`L2
+      cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`. This
+      includes requests for atomics with, and without return.
+    unit: Requests per normalization unit
+  sL1D Rd:
+    rst: The total number of requests, of any size or type, made to the sL1D per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Requests per normalization unit
+  sL1D Hit:
+    rst: The total number of sL1D requests that hit on a previously loaded cache line,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  sL1D Lat:
+    rst: ''
+    unit: Unknown
+  sL1D_L2 Rd:
+    rst: The total number of read requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  sL1D_L2 Wr:
+    rst: The total number of write requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`. Typically unused on current
+      CDNA accelerators.
+    unit: Requests per normalization unit
+  sL1D_L2 Atomic:
+    rst: The total number of atomic requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`. Typically unused on current
+      CDNA accelerators.
+    unit: Requests per normalization unit
+  IL1 Fetch:
+    rst: The total number of requests made to the L1I per :ref:`normalization-unit
+      <normalization-units>`.
+    unit: Requests per normalization unit
+  IL1 Hit:
+    rst: The total number of L1I requests that hit on a previously loaded cache line,
+      per :ref:`normalization-unit <normalization-units>`.
+    unit: Percent
+  IL1 Lat:
+    rst: The average number of cycles spent to fetch instructions to a :doc:`CU <compute-unit>`.
+    unit: Cycles
+  IL1_L2 Rd:
+    rst: The total number of requests across the L1I - L2 interface per normalization-unit.
+    unit: Requests per normalization unit
+  L2 Rd:
+    rst: The total number of read requests to the L2 from all clients.
+    unit: Requests per normalization unit
+  L2 Wr:
+    rst: The total number of write requests to the L2 from all clients.
+    unit: Requests per normalization unit
+  L2 Atomic:
+    rst: The total number of atomic requests (with and without return) to the L2 from
+      all clients.
+    unit: Requests per normalization unit
+  L2 Hit:
+    rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
+      over the total number of incoming cache line requests to the L2 cache.
+    unit: Percent
+  L2 Rd Lat:
+    rst: Calculated as the average number of cycles that the vL1D cache took to issue
+      and receive read requests from the :doc:`L2 Cache <l2-cache>`. This number also
+      includes requests for atomics with return values.
+    unit: Cycles
+  L2 Wr Lat:
+    rst: Calculated as the average number of cycles that the vL1D cache took to issue
+      and receive acknowledgement of a write request to the :doc:`L2 Cache <l2-cache>`.
+      This number also includes requests for atomics without return values.
+    unit: Cycles
+  Fabric_L2 Rd:
+    rst: Number of L2 cache - Infinity Fabric read requests (either 32-byte or 64-byte)
+      summed over TCC instances per normalization unit.
+    unit: Requests per normalization unit
+  Fabric_L2 Wr:
+    rst: Number of L2 cache - Infinity Fabric write requests (either 32-byte or 64-byte)
+      summed over TCC instances per normalization unit.
+    unit: Requests per normalization unit
+  Fabric_L2 Atomic:
+    rst: Number of L2 cache - Infinity Fabric write requests (either 32-byte or 64-byte)
+      that are actually atomic requests summed over TCC instances per normalization
+      unit.
+    unit: Requests per normalization unit
+  Fabric Rd Lat:
+    rst: The time-averaged number of cycles read requests spent in Infinity Fabric
+      before data was returned to the L2.
+    unit: Cycles
+  Fabric Wr Lat:
+    rst: The time-averaged number of cycles write requests spent in Infinity Fabric
+      before a completion acknowledgement was returned to the L2.
+    unit: Cycles
+  Fabric Atomic Lat:
+    rst: The time-averaged number of cycles atomic requests spent in Infinity Fabric
+      before a completion acknowledgement (atomic without return value) or data (atomic
+      with return value) was returned to the L2.
+    unit: Cycles
+  HBM Rd:
+    rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of
+      data from the accelerator's local HBM, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  HBM Wr:
+    rst: The total number of L2 requests to Infinity Fabric to write 32B or 64B of
+      data to the accelerator's local HBM, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+Roofline Performance Rates:
+  VALU FLOPs (F16):
+    rst: |-
+      The total 16-bit floating-point operations executed per second on the :ref:`VALU
+      <desc-valu>`. This is presented with the value of the peak empirical F16 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F16 operations
+      from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GFLOPs
+  VALU FLOPs (F32):
+    rst: |-
+      The total 32-bit floating-point operations executed per second on the :ref:`VALU
+      <desc-valu>`. This is presented with the value of the peak empirical F32 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F32 operations
+      from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GFLOPs
+  VALU FLOPs (F64):
+    rst: |-
+      The total 64-bit floating-point operations executed per second on the :ref:`VALU
+      <desc-valu>`. This is presented with the value of the peak empirical F64 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F64 operations
+      from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GFLOPs
+  MFMA FLOPs (F64):
+    rst: |-
+      The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 64-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
+      measured F64 MFMA operations achievable on the specific accelerator is
+      displayed alongside for comparison.
+    unit: GFLOPs
+  MFMA FLOPs (F32):
+    rst: |-
+      The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 32-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
+      measured F32 MFMA operations achievable on the specific accelerator is
+      displayed alongside for comparison.
+    unit: GFLOPs
+  MFMA FLOPs (F16):
+    rst: |-
+      The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 16-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
+      measured F16 MFMA operations achievable on the specific accelerator is
+      displayed alongside for comparison.
+    unit: GFLOPs
+  MFMA FLOPs (BF16):
+    rst: |-
+      The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
+      operations executed per second. Note: this does not include any 16-bit brain
+      floating point operations from :ref:`VALU <desc-valu>` instructions. The
+      peak empirically measured BF16 MFMA operations achievable on the specific
+      accelerator is displayed alongside for comparison.
+    unit: GFLOPs
+  MFMA IOPs (Int8):
+    rst: |-
+      The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
+      per second. Note: this does not include any 8-bit integer operations from
+      :ref:`VALU <desc-valu>` instructions. The peak empirically measured INT8 MFMA
+      operations achievable on the specific accelerator is displayed alongside
+      for comparison.
+    unit: GIOPs
+  HBM Bandwidth:
+    rst: |-
+      The total number of bytes read from and written to High-Bandwidth
+      Memory (HBM) per second. The peak empirically measured bandwidth achievable
+      on the specific accelerator is displayed alongside for comparison.
+    unit: GB/s
+  L2 Cache Bandwidth:
+    rst: The number of bytes looked up in the L2 cache per unit time. The number of
+      bytes is calculated as the number of cache lines requested multiplied by the
+      cache line size. This value does not consider partial requests, so e.g., if
+      only a single value is requested in a cache line, the data movement will still
+      be counted as a full cache line. The peak empirically measured bandwidth achievable
+      on the specific accelerator is displayed alongside for comparison.
+    unit: GB/s
+  L1 Cache Bandwidth:
+    rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
+      <desc-vmem>` instructions per unit time. The number of bytes is calculated as
+      the number of cache lines requested multiplied by the cache line size. This
+      value does not consider partial requests, so e.g., if only a single value is
+      requested in a cache line, the data movement will still be counted as a full
+      cache line. The peak empirically measured bandwidth achievable on the specific
+      accelerator is displayed alongside for comparison.
+    unit: GB/s
+  LDS Bandwidth:
+    rst: Indicates the maximum amount of bytes that could have been loaded from, stored
+      to, or atomically updated in the LDS per unit time (see :ref:`LDS Bandwidth
+      <lds-bandwidth>` example for more detail). The peak empirically measured LDS
+      bandwidth achievable on the specific accelerator is displayed alongside for
+      comparison.
+    unit: GB/s
+Roofline Plot Points:
+  AI HBM:
+    rst: |-
+      The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
+      It is the ratio of total floating-point operations (FLOPs) to total bytes
+      transferred between HBM and the L2 cache. This value is used as the x-coordinate
+      for the HBM roofline.
+    unit: FLOPs/Byte
+  AI L2:
+    rst: |-
+      The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
+      of total floating-point operations (FLOPs) to total bytes transferred between
+      the L2 cache and the L1 cache. This value is used as the x-coordinate for
+      the L2 roofline.
+    unit: FLOPs/Byte
+  AI L1:
+    rst: |-
+      The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
+      of total floating-point operations (FLOPs) to total bytes transferred between
+      the L1 cache and the processing units. This value is used as the x-coordinate
+      for the L1 roofline.
+    unit: FLOPs/Byte
+  Performance (GFLOPs):
+    rst: |-
+      The overall achieved performance, measured in GigaFLOPs
+      per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
+      operations divided by the total execution time. This value is used as the y-coordinate
+      for the kernel's point on the Roofline plot.
+    unit: GFLOP/s
+Command processor fetcher (CPF):
+  CPF Utilization:
+    rst: Percent of total cycles where the CPF was busy actively doing any work. The
+      ratio of CPF busy cycles over total cycles counted by the CPF.
+    unit: Percent
+  CPF Stall:
+    rst: Percent of CPF busy cycles where the CPF was stalled for any reason.
+    unit: Percent
+  CPF-L2 Utilization:
+    rst: Percent of total cycles counted by the CPF-:doc:`L2 <l2-cache>` interface
+      where the CPF-L2 interface was active doing any work. The ratio of CPF-L2 busy
+      cycles over total cycles counted by the CPF-L2.
+    unit: Percent
+  CPF-L2 Stall:
+    rst: Percent of CPF-:doc:`L2 <l2-cache>` busy cycles where the CPF-L2 interface
+      was stalled for any reason.
+    unit: Percent
+  CPF-UTCL1 Stall:
+    rst: Percent of CPF busy cycles where the CPF was stalled by address translation.
+    unit: Percent
+Command processor packet processor (CPC):
+  CPC Utilization:
+    rst: Percent of total cycles where the CPC was busy actively doing any work. The
+      ratio of CPC busy cycles over total cycles counted by the CPC.
+    unit: Percent
+  CPC Stall Rate:
+    rst: Percent of CPC busy cycles where the CPC was stalled for any reason.
+    unit: Percent
+  CPC Packet Decoding Utilization:
+    rst: Percent of CPC busy cycles spent decoding commands for processing.
+    unit: Percent
+  CPC-Workgroup Manager Utilization:
+    rst: Percent of CPC busy cycles spent dispatching workgroups to the :ref:`workgroup
+      manager <desc-spi>`.
+    unit: Percent
+  CPC-L2 Utilization:
+    rst: Percent of total cycles counted by the CPC-:doc:`L2 <l2-cache>` interface
+      where the CPC-L2 interface was active doing any work.
+    unit: Percent
+  CPC-UTCL1 Stall:
+    rst: Percent of CPC busy cycles where the CPC was stalled by address translation.
+    unit: Percent
+  CPC-UTCL2 Utilization:
+    rst: Percent of total cycles counted by the CPC's :doc:`L2 <l2-cache>` address
+      translation interface where the CPC was busy doing address translation work.
+    unit: Percent
+Workgroup manager utilizations:
+  Accelerator Utilization:
+    rst: The percent of cycles in the kernel where the accelerator was actively doing
+      any work.
+    unit: Percent
+  Scheduler-Pipe Utilization:
+    rst: |-
+      The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
+      in the kernel where the scheduler-pipes were actively doing any work. Note: this
+      value is expected to range between 0% and 25%. See :ref:`desc-spi`.
+    unit: Percent
+  Workgroup Manager Utilization:
+    rst: The percent of cycles in the kernel where the workgroup manager was actively
+      doing any work.
+    unit: Percent
+  Shader Engine Utilization:
+    rst: The percent of :ref:`total shader engine cycles <total-se-cycles>` in the
+      kernel where any CU in a shader-engine was actively doing any work, normalized
+      over all shader-engines. Low values (e.g., << 100%) indicate that the accelerator
+      was not fully saturated by the kernel, or a potential load-imbalance issue.
+    unit: Percent
+  SIMD Utilization:
+    rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel
+      where any :ref:`SIMD <desc-valu>` on a CU was actively doing any work, summed
+      over all CUs. Low values (less than 100%) indicate that the accelerator was
+      not fully saturated by the kernel, or a potential load-imbalance issue.
+    unit: Percent
+  Dispatched Workgroups:
+    rst: The total number of workgroups forming this kernel launch.
+    unit: Workgroups
+  Dispatched Wavefronts:
+    rst: The total number of wavefronts, summed over all workgroups, forming this
+      kernel launch.
+    unit: Wavefronts
+  VGPR Writes:
+    rst: The average number of cycles spent initializing :ref:`VGPRs <desc-valu>`
+      at wave creation.
+    unit: Cycles/wave
+  SGPR Writes:
+    rst: The average number of cycles spent initializing :ref:`SGPRs <desc-salu>`
+      at wave creation.
+    unit: Cycles/wave
+Workgroup Manager - Resource Allocation:
+  Not-scheduled Rate (Workgroup Manager):
+    rst: |-
+      The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
+      in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
+      due to a bottleneck within the workgroup manager rather than a lack of a
+      CU or :ref:`SIMD <desc-valu>` with sufficient resources. Note: this value
+      is expected to range between 0-25%. See note in :ref:`workgroup manager <desc-spi>`
+      description.
+    unit: Percent
+  Not-scheduled Rate (Scheduler-Pipe):
+    rst: |-
+      The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
+      in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
+      due to a bottleneck within the scheduler-pipes rather than a lack of a CU
+      or :ref:`SIMD <desc-valu>` with sufficient resources. Note: this value is
+      expected to range between 0-25%, see note in :ref:`workgroup manager <desc-spi>`
+      description.
+    unit: Percent
+  Scheduler-Pipe Stall Rate:
+    rst: |-
+      The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
+      in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
+      due to occupancy limitations (like a lack of a CU or :ref:`SIMD <desc-valu>`
+      with sufficient resources). Note: this value is expected to range between
+      0-25%, see note in :ref:`workgroup manager <desc-spi>` description.
+    unit: Percent
+  Scratch Stall Rate:
+    rst: The percent of :ref:`total shader-engine cycles <total-se-cycles>` in the
+      kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
+      due to lack of :ref:`private (a.k.a., scratch) memory <memory-type>` slots.
+      While this can reach up to 100%, note that the actual occupancy limitations
+      on a kernel using private memory are typically quite small (for example, less
+      than 1% of the total number of waves that can be scheduled to an accelerator).
+    unit: Percent
+  Insufficient SIMD Waveslots:
+    rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel
+      where a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to
+      lack of available :ref:`waveslots <desc-valu>`.
+    unit: Percent
+  Insufficient SIMD VGPRs:
+    rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel
+      where a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to
+      lack of available :ref:`VGPRs <desc-valu>`.
+    unit: Percent
+  Insufficient SIMD SGPRs:
+    rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel
+      where a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to
+      lack of available :ref:`SGPRs <desc-salu>`.
+    unit: Percent
+  Insufficient CU LDS:
+    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
+      a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to lack
+      of available :doc:`LDS <local-data-share>`.
+    unit: Percent
+  Insufficient CU Barriers:
+    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
+      a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to lack
+      of available :ref:`barriers <desc-barrier>`.
+    unit: Percent
+  Reached CU Workgroup Limit:
+    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
+      a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to limits
+      within the workgroup manager. This is expected to always be zero on CDNA2
+      or newer accelerators (and small for previous accelerators).
+    unit: Percent
+  Reached CU Wavefront Limit:
+    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
+      a wavefront could not be scheduled to a :doc:`CU <compute-unit>` due to limits
+      within the workgroup manager. This is expected to always be zero on CDNA2
+      or newer accelerators (and small for previous accelerators).
+    unit: Percent
+Wavefront Launch Stats:
+  Grid Size:
+    rst: The total number of work-items (or, threads) launched as a part of the kernel
+      dispatch. In HIP, this is equivalent to the total grid size multiplied by the
+      total workgroup (or, block) size.
+    unit: Work-Items
+  Workgroup Size:
+    rst: The total number of work-items (or, threads) in each workgroup (or, block)
+      launched as part of the kernel dispatch. In HIP, this is equivalent to the total
+      block size.
+    unit: Work-Items
+  Total Wavefronts:
+    rst: |-
+      The total number of wavefronts launched as part of the kernel dispatch.
+      On AMD Instinct™ CDNA™ accelerators and GCN™ GPUs, the wavefront
+      size is always 64 work-items. Thus, the total number of wavefronts should
+      be equivalent to the ceiling of grid size divided by 64.
+    unit: Wavefronts
+  Saved Wavefronts:
+    rst: The total number of wavefronts saved at a context-save. See `cwsr_enable
+      <https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr>`_.
+    unit: Wavefronts
+  Restored Wavefronts:
+    rst: The total number of wavefronts restored from a context-save. See `cwsr_enable
+      <https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr>`_.
+    unit: Wavefronts
+  VGPRs:
+    rst: |-
+      The number of architected vector general-purpose registers allocated for the
+      kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the
+      number of VGPRs requested by the compiler due to allocation granularity.
+    unit: VGPRs
+  AGPRs:
+    rst: |-
+      The number of accumulation vector general-purpose registers allocated
+      for the kernel, see :ref:`AGPRs <desc-agprs>`. Note: this may not exactly match
+      the number of AGPRs requested by the compiler due to allocation granularity.
+    unit: AGPRs
+  SGPRs:
+    rst: |-
+      The number of scalar general-purpose registers allocated for the kernel, see
+      :ref:`SALU <desc-salu>`. Note: this may not exactly match the number of
+      SGPRs requested by the compiler due to allocation granularity.
+    unit: SGPRs
+  LDS Allocation:
+    rst: |-
+      The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory)
+      allocated for this kernel. Note: This may also be larger than what was requested
+      at compile time due to both allocation granularity and dynamic per-dispatch
+      LDS allocations.
+    unit: Bytes per workgroup
+  Scratch Allocation:
+    rst: The number of bytes of :ref:`scratch memory <memory-spaces>` requested per
+      work-item for this kernel. Scratch memory is used for stack memory on the accelerator,
+      as well as for register spills and restores.
+    unit: Bytes per work-item
+Wavefront Runtime Stats:
+  Kernel Time:
+    rst: The total duration of the executed kernel.
+    unit: Nanoseconds
+  Kernel Time (Cycles):
+    rst: The total duration of the executed kernel in cycles.
+    unit: Cycles
+  Instructions per wavefront:
+    rst: The average number of instructions (of all types) executed per wavefront.
+      This is averaged over all wavefronts in a kernel dispatch.
+    unit: Instructions per wavefront
+  Wave Cycles:
+    rst: |-
+      The number of cycles a wavefront in the kernel dispatch spent resident
+      on a compute unit per :ref:`normalization unit <normalization-units>`. This is
+      averaged over all wavefronts in a kernel dispatch. Note: this should not
+      be directly compared to the kernel cycles above.
+    unit: Cycles per normalization unit
+  Dependency Wait Cycles:
+    rst: The number of cycles a wavefront in the kernel dispatch stalled waiting on
+      memory of any kind (e.g., instruction fetch, vector or scalar memory, etc.)
+      per :ref:`normalization unit <normalization-units>`. This counter is incremented
+      at every cycle by *all* wavefronts on a CU stalled at a memory operation. As
+      such, it is most useful to get a sense of how waves were spending their time,
+      rather than identification of a precise limiter because another wave could be
+      actively executing while a wave is stalled. The sum of this metric, Issue Wait
+      Cycles and Active Cycles should be equal to the total Wave Cycles metric.
+    unit: Cycles per normalization unit
+  Issue Wait Cycles:
+    rst: The number of cycles a wavefront in the kernel dispatch was unable to issue
+      an instruction for any reason (e.g., execution pipe back-pressure, arbitration
+      loss, etc.) per :ref:`normalization unit <normalization-units>`. This counter
+      is incremented at every cycle by *all* wavefronts on a CU unable to issue an
+      instruction. As such, it is most useful to get a sense of how waves were spending
+      their time, rather than identification of a precise limiter because another
+      wave could be actively executing while a wave is issue stalled. The sum of this
+      metric, Dependency Wait Cycles and Active Cycles should be equal to the total
+      Wave Cycles metric.
+    unit: Cycles per normalization unit
+  Active Cycles:
+    rst: The average number of cycles a wavefront in the kernel dispatch was actively
+      executing instructions per :ref:`normalization unit <normalization-units>`.
+      This measurement is made on a per-wavefront basis, and may include cycles that
+      another wavefront spent actively executing (on another execution unit, for example)
+      or was stalled. As such, it is most useful to get a sense of how waves were
+      spending their time, rather than identification of a precise limiter. The sum
+      of this metric, Issue Wait Cycles and Dependency Wait Cycles should be equal to
+      the total Wave Cycles metric.
+    unit: Cycles per normalization unit
+  Wavefront Occupancy:
+    rst: |-
+      The time-averaged number of wavefronts resident on the accelerator over the
+      lifetime of the kernel. Note: this metric may be inaccurate for short-running
+      kernels (less than 1ms).
+    unit: Wavefronts
+Overall Instruction Mix:
+  LDS:
+    rst: The total number of LDS (also known as shared memory) operations issued.
+      These include loads, stores, atomics, and HIP's ``__shfl`` operations.
+    unit: Instructions
+  SALU:
+    rst: The total number of scalar arithmetic logic unit (SALU) operations issued.
+      Typically these are used for address calculations, literal constants, and other
+      operations that are provably uniform across a wavefront. Although scalar memory
+      (SMEM) operations are issued by the SALU, they are counted separately in this
+      section.
+    unit: Instructions
+  SMEM:
+    rst: The total number of scalar memory (SMEM) operations issued. These are typically
+      used for loading kernel arguments, base-pointers and loads from HIP's ``__constant__``
+      memory.
+    unit: Instructions
+  Branch:
+    rst: The total number of branch operations issued. These typically consist of
+      jump or branch operations and are used to implement control flow.
+    unit: Instructions
+VALU Arithmetic Instruction Mix: {}
+VMEM Instruction Mix:
+  Global/Generic Instr:
+    rst: The total number of global & generic memory instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Read:
+    rst: The total number of global & generic memory read instructions executed on
+      all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Write:
+    rst: The total number of global & generic memory write instructions executed on
+      all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Atomic:
+    rst: The total number of global & generic memory atomic (with and without return)
+      instructions executed on all :doc:`compute units <compute-unit>` on the accelerator,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Instr:
+    rst: The total number of spill/stack memory instructions executed on all :doc:`compute
+      units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Read:
+    rst: The total number of spill/stack memory read instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Write:
+    rst: The total number of spill/stack memory write instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Atomic:
+    rst: The total number of spill/stack memory atomic (with and without return) instructions
+      executed on all :doc:`compute units <compute-unit>` on the accelerator, per
+      :ref:`normalization unit <normalization-units>`. Typically unused as these memory
+      operations are typically used to implement thread-local storage.
+    unit: Instructions per normalization unit
+MFMA Arithmetic Instruction Mix: {}
+Compute Speed-of-Light: {}
+Pipeline Statistics:
+  IPC:
+    rst: The ratio of the total number of instructions executed on the :doc:`CU <compute-unit>`
+      over the :ref:`total active CU cycles <total-active-cu-cycles>`.
+    unit: Instructions per cycle
+  IPC (Issued):
+    rst: The ratio of the total number of (non-:ref:`internal <ipc-internal-instructions>`)
+      instructions issued over the number of cycles where the :ref:`scheduler <desc-scheduler>`
+      was actively working on issuing instructions. Refer to the :ref:`Issued IPC
+      <issued-ipc>` example for further detail.
+    unit: Instructions per cycle
+  SALU Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`SALU <desc-salu>`
+      was busy executing instructions. Computed as the ratio of the total number of
+      cycles spent by the :ref:`scheduler <desc-scheduler>` issuing SALU / :ref:`SMEM
+      <desc-smem>` instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VALU Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`VALU <desc-valu>`
+      was busy executing instructions. Does not include :ref:`VMEM <desc-vmem>` operations.
+      Computed as the ratio of the total number of cycles spent by the :ref:`scheduler
+      <desc-scheduler>` issuing VALU instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VALU Active Threads:
+    rst: Indicates the average level of :ref:`divergence <desc-divergence>` within
+      a wavefront over the lifetime of the kernel. The number of work-items that were
+      active in a wavefront during execution of each :ref:`VALU <desc-valu>` instruction,
+      time-averaged over all VALU instructions run on all wavefronts in the kernel.
+    unit: Work-items
+Arithmetic Operations: {}
+LDS Speed-of-Light:
+  Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`LDS <desc-lds>`
+      was actively executing instructions (including, but not limited to, load, store,
+      atomic and HIP's ``__shfl`` operations). Calculated as the ratio of the total
+      number of cycles LDS was active over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  Access Rate:
+    rst: Indicates the percentage of SIMDs in the :ref:`VALU <desc-valu>` [#lds-workload]_
+      actively issuing LDS instructions, averaged over the lifetime of the kernel.
+      Calculated as the ratio of the total number of cycles spent by the :ref:`scheduler
+      <desc-scheduler>` issuing :ref:`LDS <desc-lds>` instructions over the :ref:`total
+      CU cycles <total-cu-cycles>`.
+    unit: Percent
+  Theoretical Bandwidth Utilization:
+    rst: Indicates the maximum amount of bytes that could have been loaded from, stored
+      to, or atomically updated in the LDS, expressed as a percentage of the theoretical peak.
+      Does *not* take into account the execution mask of the wavefront when the instruction
+      was executed. See the :ref:`LDS bandwidth example <lds-bandwidth>` for more
+      detail.
+    unit: Percent
+  Bank Conflict Rate:
+    rst: Indicates the percentage of active LDS cycles that were spent servicing bank
+      conflicts. Calculated as the ratio of LDS cycles spent servicing bank conflicts
+      over the number of LDS cycles that would have been required to move the same
+      amount of data in an uncontended access. [#lds-bank-conflict]_
+    unit: Percent
+LDS Statistics:
+  LDS Instructions:
+    rst: The total number of LDS instructions (including, but not limited to, read/write/atomics
+      and HIP's ``__shfl`` instructions) executed per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Theoretical Bandwidth:
+    rst: Indicates the maximum amount of bytes that could have been loaded from, stored
+      to, or atomically updated in the LDS divided by total duration. Does *not* take
+      into account the execution mask of the wavefront when the instruction was executed.
+      See the :ref:`LDS bandwidth example <lds-bandwidth>` for more detail.
+    unit: Gbps
+  LDS Latency:
+    rst: The average number of round-trip cycles (i.e., from issue to data-return
+      acknowledgment) required for an LDS instruction to complete.
+    unit: Cycles
+  Bank Conflicts/Access:
+    rst: The ratio of the number of cycles spent in the :ref:`LDS scheduler <desc-lds>`
+      due to bank conflicts (as determined by the conflict resolution hardware) to
+      the base number of cycles that would be spent in the LDS scheduler in a completely
+      uncontended case. This is the unnormalized form of the Bank Conflict Rate.
+    unit: Conflicts per Access
+  Index Accesses:
+    rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` over
+      all operations per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Atomic Return Cycles:
+    rst: The total number of cycles spent on LDS atomics with return per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Bank Conflict:
+    rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due
+      to bank conflicts (as determined by the conflict resolution hardware) per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Addr Conflict:
+    rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due
+      to address conflicts (as determined by the conflict resolution hardware) per
+      :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Unaligned Stall:
+    rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due
+      to stalls from non-dword aligned addresses per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Mem Violations:
+    rst: |-
+      The total number of out-of-bounds accesses made to the LDS, per :ref:`normalization
+      unit <normalization-units>`. This is unused and expected to be zero in
+      most configurations for modern CDNA™ accelerators.
+    unit: Accesses per normalization unit
+L1I Speed-of-Light:
+  Bandwidth Utilization:
+    rst: The number of bytes looked up in the L1I cache, as a percent of the peak
+      theoretical bandwidth. Calculated as the ratio of L1I requests over the :ref:`total
+      L1I cycles <total-l1i-cycles>`.
+    unit: Percent
+  Cache Hit Rate:
+    rst: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded
+      line in the cache. Calculated as the ratio of the number of L1I requests that hit
+      over the number of all L1I requests.
+    unit: Percent
+  L1I-L2 Bandwidth Utilization:
+    rst: |-
+      The percent of the peak theoretical L1I → L2 cache request bandwidth
+      achieved. Calculated as the ratio of the total number of requests from
+      the L1I to the L2 cache over the :ref:`total L1I-L2 interface cycles <total-l1i-cycles>`.
+    unit: Percent
+L1I cache accesses:
+  Req:
+    rst: The total number of requests made to the L1I per normalization-unit
+    unit: Requests per normalization unit
+  Hits:
+    rst: The total number of L1I requests that hit on a previously loaded cache line,
+      per :ref:`normalization-unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Misses - Non Duplicated:
+    rst: The total number of L1I requests that missed on a cache line that *were not*
+      already pending due to another request, per :ref:`normalization-unit <normalization-units>`.
+      See note in :ref:`desc-l1i-sol` for more detail.
+    unit: Requests per normalization unit
+  Misses - Duplicated:
+    rst: The total number of L1I requests that missed on a cache line that *were*
+      already pending due to another request, per :ref:`normalization-unit <normalization-units>`.
+      See note in :ref:`desc-l1i-sol` for more detail.
+    unit: Requests per normalization unit
+  Cache Hit Rate:
+    rst: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded
+      line in the cache. Calculated as the ratio of the number of L1I requests that hit
+      over the number of all L1I requests.
+    unit: Percent
+  Instruction Fetch Latency:
+    rst: The average number of cycles spent to fetch instructions to a :doc:`CU <compute-unit>`.
+    unit: Cycles
+L1I <-> L2 interface:
+  L1I-L2 Bandwidth:
+    rst: Total number of bytes transferred across L1I - L2 interface divided by total
+      duration.
+    unit: Gbps
+Scalar L1D Speed-of-Light:
+  Bandwidth Utilization:
+    rst: The number of bytes looked up in the sL1D cache, as a percent of the peak
+      theoretical bandwidth. Calculated as the ratio of sL1D requests over the :ref:`total
+      sL1D cycles <total-sl1d-cycles>`.
+    unit: Percent
+  Cache Hit Rate:
+    rst: Indicates the percent of sL1D requests that hit on a previously loaded line
+      in the cache. The ratio of the number of sL1D requests that hit [#sl1d-cache]_
+      over the number of all sL1D requests.
+    unit: Percent
+  sL1D-L2 BW Utilization:
+    rst: The percentage of the peak theoretical sL1D - L2 interface bandwidth achieved.
+      Calculated as total number of bytes read from, written to, or atomically updated
+      across the sL1D - L2 interface.
+    unit: Percent
+Scalar L1D cache accesses:
+  Req:
+    rst: The total number of requests, of any size or type, made to the sL1D per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Hits:
+    rst: The total number of sL1D requests that hit on a previously loaded cache line,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Misses - Non Duplicated:
+    rst: The total number of sL1D requests that missed on a cache line that *was not*
+      already pending due to another request, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`desc-sl1d-sol` for more detail.
+    unit: Requests per normalization unit
+  Misses- Duplicated:
+    rst: The total number of sL1D requests that missed on a cache line that *was*
+      already pending due to another request, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`desc-sl1d-sol` for more detail.
+    unit: Requests per normalization unit
+  Cache Hit Rate:
+    rst: Indicates the percent of sL1D requests that hit on a previously loaded line
+      in the cache. The ratio of the number of sL1D requests that hit [#sl1d-cache]_
+      over the number of all sL1D requests.
+    unit: Percent
+  Read Req (Total):
+    rst: The total number of sL1D read requests of any size, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Atomic Req:
+    rst: The total number of atomic requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`. Typically unused on current
+      CDNA accelerators.
+    unit: Requests per normalization unit
+  Read Req (1 DWord):
+    rst: The total number of sL1D read requests made for a single dword of data (4B),
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Read Req (2 DWord):
+    rst: The total number of sL1D read requests made for two dwords of data (8B),
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Read Req (4 DWord):
+    rst: The total number of sL1D read requests made for four dwords of data (16B),
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Read Req (8 DWord):
+    rst: The total number of sL1D read requests made for eight dwords of data (32B),
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Read Req (16 DWord):
+    rst: The total number of sL1D read requests made for sixteen dwords of data
+      (64B), per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+Scalar L1D Cache - L2 Interface:
+  sL1D-L2 BW:
+    rst: |-
+      The total number of bytes read from, written to, or atomically updated
+      across the sL1D↔ :doc:`L2 <l2-cache>` interface, divided by total duration.
+      Note that sL1D writes and atomics are typically
+      unused on current CDNA accelerators, so in the majority of cases this can
+      be interpreted as an sL1D→L2 read bandwidth.
+    unit: Gbps
+  Read Req:
+    rst: The total number of read requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Write Req:
+    rst: The total number of write requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`. Typically unused on current
+      CDNA accelerators.
+    unit: Requests per normalization unit
+  Atomic Req:
+    rst: The total number of atomic requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`. Typically unused on current
+      CDNA accelerators.
+    unit: Requests per normalization unit
+  Stall Cycles:
+    rst: |-
+      The total number of cycles the sL1D↔ :doc:`L2 <l2-cache>` interface
+      was stalled, per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+Busy and stall metrics:
+  Address Processing Unit Busy:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor
+      was busy
+    unit: Percent
+  Address Stall:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor
+      was stalled from sending address requests further into the vL1D pipeline
+    unit: Percent
+  Data Stall:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor
+      was stalled from sending write/atomic data further into the vL1D pipeline
+    unit: Percent
+  Data-Processor → Address Stall:
+    rst: Percent of :ref:`total CU cycles <total-cu-cycles>` the address processor
+      was stalled waiting to send command data to the :ref:`data processor <desc-td>`
+    unit: Percent
+Instruction counts:
+  Total Instructions:
+    rst: The total number of memory instructions executed by the address processor
+      over all compute units on the accelerator, per normalization unit.
+    unit: Instructions per normalization unit
+  Global/Generic Instructions:
+    rst: The total number of global & generic memory instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Read Instructions:
+    rst: The total number of global & generic memory read instructions executed on
+      all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Write Instructions:
+    rst: The total number of global & generic memory write instructions executed on
+      all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Atomic Instructions:
+    rst: The total number of global & generic memory atomic (with and without return)
+      instructions executed on all :doc:`compute units <compute-unit>` on the accelerator,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Instructions:
+    rst: The total number of spill/stack memory instructions executed on all :doc:`compute
+      units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Read Instructions:
+    rst: The total number of spill/stack memory read instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Write Instructions:
+    rst: The total number of spill/stack memory write instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Atomic Instructions:
+    rst: The total number of spill/stack memory atomic (with and without return) instructions
+      executed on all :doc:`compute units <compute-unit>` on the accelerator, per
+      :ref:`normalization unit <normalization-units>`. Typically unused as these memory
+      operations are typically used to implement thread-local storage.
+    unit: Instructions per normalization unit
+Spill and stack metrics:
+  Spill/Stack Total Cycles:
+    rst: The number of cycles the address processing unit spent working on spill/stack
+      instructions, per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Spill/Stack Coalesced Read:
+    rst: The number of cycles the address processing unit spent working on coalesced
+      spill/stack read instructions, per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Spill/Stack Coalesced Write:
+    rst: The number of cycles the address processing unit spent working on coalesced
+      spill/stack write instructions, per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+Vector L1 data-return path or Texture Data (TD):
+  Data-Return Busy:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return unit
+      was busy processing or waiting on data to return to the :doc:`CU <compute-unit>`.
+    unit: Percent
+  Cache RAM → Data-Return Stall:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return unit
+      was stalled on data to be returned from the :ref:`vL1D Cache RAM <desc-tc>`.
+    unit: Percent
+  Coalescable Instructions:
+    rst: The number of instructions submitted to the :ref:`data-return unit <desc-td>`
+      by the :ref:`address processor <desc-ta>` that were found to be coalescable,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Read Instructions:
+    rst: The number of read instructions submitted to the :ref:`data-return unit <desc-td>`
+      by the :ref:`address processor <desc-ta>` summed over all :doc:`compute units
+      <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+      This is expected to be the sum of global/generic and spill/stack reads in the
+      :ref:`address processor <desc-ta>`.
+    unit: Instructions per normalization unit
+  Write Instructions:
+    rst: The number of store instructions submitted to the :ref:`data-return unit
+      <desc-td>` by the :ref:`address processor <desc-ta>` summed over all :doc:`compute
+      units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+      This is expected to be the sum of global/generic and spill/stack stores counted
+      by the :ref:`vL1D cache-front-end <ta-instruction-counts>`.
+    unit: Instructions per normalization unit
+  Atomic Instructions:
+    rst: The number of atomic instructions submitted to the :ref:`data-return unit
+      <desc-td>` by the :ref:`address processor <desc-ta>` summed over all :doc:`compute
+      units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+      This is expected to be the sum of global/generic and spill/stack atomics in
+      the :ref:`address processor <desc-ta>`.
+    unit: Instructions per normalization unit
+vL1D Speed-of-Light:
+  Hit rate:
+    rst: The ratio of the number of vL1D cache line requests that hit [#vl1d-hit]_
+      in vL1D cache over the total number of cache line requests to the :ref:`vL1D
+      Cache RAM <desc-tc>`.
+    unit: Percent
+  Bandwidth Utilization:
+    rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
+      <desc-vmem>` instructions, as a percent of the peak theoretical bandwidth achievable
+      on the specific accelerator. The number of bytes is calculated as the number
+      of cache lines requested multiplied by the cache line size. This value does
+      not consider partial requests, so for instance, if only a single value is requested
+      in a cache line, the data movement will still be counted as a full cache line.
+    unit: Percent
+  Utilization:
+    rst: Indicates how busy the :ref:`vL1D Cache RAM <desc-tc>` was during the kernel
+      execution. The number of cycles where the vL1D Cache RAM is actively processing
+      any request divided by the number of cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  Coalescing:
+    rst: Indicates how well memory instructions were coalesced by the :ref:`address
+      processing unit <desc-ta>`, ranging from uncoalesced (25%) to fully coalesced
+      (100%). Calculated as the average number of :ref:`thread-requests <thread-requests>`
+      generated per instruction divided by the ideal number of thread-requests per
+      instruction.
+    unit: Percent
+vL1D cache stall metrics:
+  Stalled on L2 Data:
+    rst: The ratio of the number of cycles where the vL1D is stalled waiting for requested
+      data to return from the :doc:`L2 cache <l2-cache>` divided by the number of
+      cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  Stalled on L2 Req:
+    rst: The ratio of the number of cycles where the vL1D is stalled waiting to issue
+      a request for data to the :doc:`L2 cache <l2-cache>` divided by the number of
+      cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  Tag RAM Stall (Read):
+    rst: The ratio of the number of cycles where the vL1D is stalled due to Read requests
+      with conflicting tags being looked up concurrently, divided by the number of
+      cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  Tag RAM Stall (Write):
+    rst: The ratio of the number of cycles where the vL1D is stalled due to Write
+      requests with conflicting tags being looked up concurrently, divided by the
+      number of cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  Tag RAM Stall (Atomic):
+    rst: The ratio of the number of cycles where the vL1D is stalled due to Atomic
+      requests with conflicting tags being looked up concurrently, divided by the
+      number of cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+vL1D cache access metrics:
+  Total Req:
+    rst: The total number of incoming requests from the :ref:`address processing unit
+      <desc-ta>` after coalescing.
+    unit: Requests
+  Read Req:
+    rst: The total number of incoming read requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing, per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Write Req:
+    rst: The total number of incoming write requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing, per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Atomic Req:
+    rst: The total number of incoming atomic requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing, per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Cache BW:
+    rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
+      <desc-vmem>` instructions divided by total duration. The number of bytes is
+      calculated as the number of cache lines requested multiplied by the cache line
+      size. This value does not consider partial requests, so for instance, if only
+      a single value is requested in a cache line, the data movement will still be
+      counted as a full cache line.
+    unit: Gbps
+  Cache Hit Rate:
+    rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache
+      over the total number of cache line requests to the :ref:`vL1D Cache RAM <desc-tc>`.
+    unit: Percent
+  Cache Accesses:
+    rst: The total number of cache line lookups in the vL1D.
+    unit: Cache lines
+  Cache Hits:
+    rst: The number of cache accesses minus the number of outgoing requests to the
+      :doc:`L2 cache <l2-cache>`, that is, the number of cache line requests serviced
+      by the :ref:`vL1D Cache RAM <desc-tc>` per :ref:`normalization unit <normalization-units>`.
+    unit: Cache lines per normalization unit
+  Invalidations:
+    rst: The number of times the vL1D was issued a write-back invalidate command during
+      the kernel's execution per :ref:`normalization unit <normalization-units>`.
+      This may be triggered by, for instance, the ``buffer_wbinvl1`` instruction.
+    unit: Invalidations per normalization unit
+  L1-L2 BW:
+    rst: The number of bytes transferred across the vL1D-L2 interface as a result
+      of :ref:`VMEM <desc-vmem>` instructions, divided by total duration. The number
+      of bytes is calculated as the number of cache lines requested multiplied by
+      the cache line size. This value does not consider partial requests, so for instance,
+      if only a single value is requested in a cache line, the data movement will
+      still be counted as a full cache line.
+    unit: Gbps
+  L1-L2 Read:
+    rst: The number of read requests for a vL1D cache line that were not satisfied
+      by the vL1D and must be retrieved from the :doc:`L2 Cache <l2-cache>`
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  L1-L2 Write:
+    rst: The number of write requests to a vL1D cache line that were sent through
+      the vL1D to the :doc:`L2 cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  L1-L2 Atomic:
+    rst: The number of atomic requests that are sent through the vL1D to the :doc:`L2
+      cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`. This
+      includes requests for atomics with, and without return.
+    unit: Requests per normalization unit
+  L1 Access Latency:
+    rst: Calculated as the average number of cycles that a vL1D cache line request
+      spent in the vL1D cache pipeline.
+    unit: Cycles
+  L1-L2 Read Latency:
+    rst: Calculated as the average number of cycles that the vL1D cache took to issue
+      and receive read requests from the :doc:`L2 Cache <l2-cache>`. This number also
+      includes requests for atomics with return values.
+    unit: Cycles
+  L1-L2 Write Latency:
+    rst: Calculated as the average number of cycles that the vL1D cache took to issue
+      and receive acknowledgement of a write request to the :doc:`L2 Cache <l2-cache>`.
+      This number also includes requests for atomics without return values.
+    unit: Cycles
+L1D - L2 Transactions:
+  NC - Read:
+    rst: Total read requests with NC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  UC - Read:
+    rst: Total read requests with UC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  CC - Read:
+    rst: Total read requests with CC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  RW - Read:
+    rst: Total read requests with RW mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  RW - Write:
+    rst: Total write requests with RW mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  NC - Write:
+    rst: Total write requests with NC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  UC - Write:
+    rst: Total write requests with UC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  CC - Write:
+    rst: Total write requests with CC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  NC - Atomic:
+    rst: Total atomic requests with NC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  UC - Atomic:
+    rst: Total atomic requests with UC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  CC - Atomic:
+    rst: Total atomic requests with CC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  RW - Atomic:
+    rst: Total atomic requests with RW mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+L1 Unified Translation Cache (UTCL1):
+  Req:
+    rst: The number of translation requests made to the UTCL1 per normalization unit.
+    unit: Requests per normalization unit
+  Hit Ratio:
+    rst: The ratio of the number of translation requests that hit in the UTCL1 divided
+      by the total number of translation requests made to the UTCL1.
+    unit: Percent
+  Hits:
+    rst: The number of translation requests that hit in the UTCL1, and could be reused,
+      per normalization unit.
+    unit: Requests per normalization unit
+  Translation Misses:
+    rst: The total number of translation requests that missed in the UTCL1 due to
+      translation not being present in the cache, per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Permission Misses:
+    rst: |-
+      The total number of translation requests that missed in the UTCL1 due
+      to a permission error, per :ref:`normalization unit <normalization-units>`.
+      This is unused and expected to be zero in most configurations for modern
+      CDNA™ accelerators.
+    unit: Requests per normalization unit
+L1D Addr Translation Stalls: {}
+L2 Speed-of-Light:
+  Utilization:
+    rst: The ratio of the :ref:`number of cycles an L2 channel was active, summed
+      over all L2 channels on the accelerator <total-active-l2-cycles>` over the :ref:`total
+      L2 cycles <total-l2-cycles>`.
+    unit: Percent
+  Peak Bandwidth:
+    rst: The number of bytes looked up in the L2 cache, as a percent of the peak theoretical
+      bandwidth achievable on the specific accelerator. The number of bytes is calculated
+      as the number of cache lines requested multiplied by the cache line size. This
+      value does not consider partial requests, so e.g., if only a single value is
+      requested in a cache line, the data movement will still be counted as a full
+      cache line.
+    unit: Percent
+  Hit Rate:
+    rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
+      over the total number of incoming cache line requests to the L2 cache.
+    unit: Percent
+  L2-Fabric Read BW:
+    rst: The number of bytes read by the L2 over the :ref:`Infinity Fabric interface
+      <l2-fabric>` per unit time.
+    unit: GB/s
+  L2-Fabric Write and Atomic BW:
+    rst: The number of bytes sent by the L2 over the :ref:`Infinity Fabric interface
+      <l2-fabric>` by write and atomic operations per unit time.
+    unit: GB/s
+  HBM Bandwidth:
+    rst: Maximum theoretical bandwidth of the accelerator's local high-bandwidth memory
+      (HBM) per unit time. This value is calculated as the number of HBM channels
+      multiplied by the HBM channel width multiplied by the HBM clock frequency.
+    unit: GB/s
+L2-Fabric interface metrics:
+  Read BW:
+    rst: The total number of bytes read by the L2 cache from Infinity Fabric divided
+      by total duration.
+    unit: Gbps
+  HBM Read Traffic:
+    rst: The percent of read requests generated by the L2 cache that are routed to
+      the accelerator's local high-bandwidth memory (HBM). This breakdown does not
+      consider the *size* of the request (meaning that 32B and 64B requests are both
+      counted as a single request), so this metric only *approximates* the percent
+      of the L2-Fabric Read bandwidth directed to the local HBM.
+    unit: Percent
+  Remote Read Traffic:
+    rst: The percent of read requests generated by the L2 cache that are routed to
+      any memory location other than the accelerator's local high-bandwidth memory
+      (HBM) -- for example, the CPU's DRAM or a remote accelerator's HBM. This breakdown
+      does not consider the *size* of the request (meaning that 32B and 64B requests
+      are both counted as a single request), so this metric only *approximates* the
+      percent of the L2-Fabric Read bandwidth directed to a remote location.
+    unit: Percent
+  Uncached Read Traffic:
+    rst: The percent of read requests generated by the L2 cache that are reading from
+      an :ref:`uncached memory allocation <memory-type>`. Note, as described in the
+      :ref:`request flow <l2-request-flow>` section, a single 64B read request is
+      typically counted as two uncached read requests. So, it is possible for the
+      Uncached Read Traffic to reach up to 200% of the total number of read requests.
+      This breakdown does not consider the *size* of the request (i.e., 32B and 64B
+      requests are both counted as a single request), so this metric only *approximates*
+      the percent of the L2-Fabric read bandwidth directed to an uncached memory location.
+    unit: Percent
+  Write and Atomic BW:
+    rst: The total number of bytes written by the L2 over Infinity Fabric by write
+      and atomic operations divided by total duration. Note that on current CDNA accelerators,
+      such as the :ref:`MI2XX <mixxx-note>`, requests are only considered *atomic*
+      by Infinity Fabric if they are targeted at non-write-cacheable memory, for example,
+      :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached memory
+      <memory-type>` allocations on the MI2XX.
+    unit: Gbps
+  HBM Write and Atomic Traffic:
+    rst: The percent of write and atomic requests generated by the L2 cache that are
+      routed to the accelerator's local high-bandwidth memory (HBM). This breakdown
+      does not consider the *size* of the request (meaning that 32B and 64B requests
+      are both counted as a single request), so this metric only *approximates* the
+      percent of the L2-Fabric Write and Atomic bandwidth directed to the local HBM.
+      Note that on current CDNA accelerators, such as the :ref:`MI2XX <mixxx-note>`,
+      requests are only considered *atomic* by Infinity Fabric if they are targeted
+      at :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached memory
+      <memory-type>` allocations.
+    unit: Percent
+  Remote Write and Atomic Traffic:
+    rst: The percent of write and atomic requests generated by the L2 cache that are
+      routed to any memory location other than the accelerator's local high-bandwidth
+      memory (HBM) -- for example, the CPU's DRAM or a remote accelerator's HBM. This
+      breakdown does not consider the *size* of the request (meaning that 32B and 64B
+      requests are both counted as a single request), so this metric only *approximates*
+      the percent of the L2-Fabric Write and Atomic bandwidth directed to a remote location. Note
+      that on current CDNA accelerators, such as the :ref:`MI2XX <mixxx-note>`, requests
+      are only considered *atomic* by Infinity Fabric if they are targeted at :ref:`fine-grained
+      memory <memory-type>` allocations or :ref:`uncached memory <memory-type>` allocations.
+    unit: Percent
+  Atomic Traffic:
+    rst: The percent of write requests generated by the L2 cache that are atomic requests
+      to *any* memory location. This breakdown does not consider the *size* of the
+      request (meaning that 32B and 64B requests are both counted as a single request),
+      so this metric only *approximates* the percent of the L2-Fabric Write and
+      Atomic bandwidth due to atomic requests. Note that on current CDNA accelerators, such
+      as the :ref:`MI2XX <mixxx-note>`, requests are only considered *atomic* by Infinity
+      Fabric if they are targeted at :ref:`fine-grained memory <memory-type>` allocations
+      or :ref:`uncached memory <memory-type>` allocations.
+    unit: Percent
+  Uncached Write and Atomic Traffic:
+    rst: The percent of write and atomic requests generated by the L2 cache that are
+      targeting :ref:`uncached memory allocations <memory-type>`. This breakdown does
+      not consider the *size* of the request (meaning that 32B and 64B requests are
+      both counted as a single request), so this metric only *approximates* the percent
+      of the L2-Fabric Write and Atomic bandwidth directed to uncached memory allocations.
+    unit: Percent
+  Read Latency:
+    rst: The time-averaged number of cycles read requests spent in Infinity Fabric
+      before data was returned to the L2.
+    unit: Cycles
+  Write and Atomic Latency:
+    rst: The time-averaged number of cycles write requests spent in Infinity Fabric
+      before a completion acknowledgement was returned to the L2.
+    unit: Cycles
+  Atomic Latency:
+    rst: The time-averaged number of cycles atomic requests spent in Infinity Fabric
+      before a completion acknowledgement (atomic without return value) or data (atomic
+      with return value) was returned to the L2.
+    unit: Cycles
+L2 Cache Accesses:
+  Bandwidth:
+    rst: The number of bytes looked up in the L2 cache, divided by total duration.
+      The number of bytes is calculated as the number of cache lines requested multiplied
+      by the cache line size. This value does not consider partial requests, so for
+      example, if only a single value is requested in a cache line, the data movement
+      will still be counted as a full cache line.
+    unit: Gbps
+  Req:
+    rst: The total number of incoming requests to the L2 from all clients for all
+      request types, per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Read Req:
+    rst: The total number of read requests to the L2 from all clients.
+    unit: Requests per normalization unit
+  Write Req:
+    rst: The total number of write requests to the L2 from all clients.
+    unit: Requests per normalization unit
+  Atomic Req:
+    rst: The total number of atomic requests (with and without return) to the L2 from
+      all clients.
+    unit: Requests per normalization unit
+  Streaming Req:
+    rst: The total number of incoming requests to the L2 that are marked as *streaming*.
+      The exact meaning of this may differ depending on the targeted accelerator,
+      however on an :ref:`MI2XX <mixxx-note>` this corresponds to `non-temporal load
+      or stores <https://clang.llvm.org/docs/LanguageExtensions.html#non-temporal-load-store-builtins>`_.
+      The L2 cache attempts to evict *streaming* requests before normal requests when
+      the L2 is at capacity.
+    unit: Requests per normalization unit
+  Probe Req:
+    rst: The number of coherence probe requests made to the L2 cache from outside
+      the accelerator. On an :ref:`MI2XX <mixxx-note>`, probe requests may be generated
+      by, for example, writes to :ref:`fine-grained device <memory-type>` memory or
+      by writes to :ref:`coarse-grained <memory-type>` device memory.
+    unit: Requests per normalization unit
+  Cache Hit:
+    rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
+      over the total number of incoming cache line requests to the L2 cache.
+    unit: Percent
+  Hits:
+    rst: The total number of requests to the L2 from all clients that hit in the cache.
+      As noted in the :ref:`Speed-of-Light <l2-sol>` section, this includes hit-on-miss
+      requests.
+    unit: Requests per normalization unit
+  Misses:
+    rst: The total number of requests to the L2 from all clients that miss in the
+      cache. As noted in the :ref:`Speed-of-Light <l2-sol>` section, these do not
+      include hit-on-miss requests.
+    unit: Requests per normalization unit
+  Writeback:
+    rst: The total number of L2 cache lines written back to memory for any reason.
+      Write-backs may occur due to user code (such as HIP kernel calls to ``__threadfence_system``
+      or atomic built-ins), by the :doc:`command processor <command-processor>`'s memory
+      acquire/release fences, or for other internal hardware reasons.
+    unit: Cache lines per normalization unit
+  Writeback (Internal):
+    rst: The total number of L2 cache lines written back to memory for internal hardware
+      reasons, per :ref:`normalization unit <normalization-units>`.
+    unit: Cache lines per normalization unit
+  Writeback (vL1D Req):
+    rst: The total number of L2 cache lines written back to memory due to requests
+      initiated by the :doc:`vL1D cache <vector-l1-cache>`, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Cache lines per normalization unit
+  Evict (Internal):
+    rst: The total number of L2 cache lines evicted from the cache due to capacity
+      limits, per :ref:`normalization unit <normalization-units>`.
+    unit: Cache lines per normalization unit
+  Evict (vL1D Req):
+    rst: The total number of L2 cache lines evicted from the cache due to invalidation
+      requests initiated by the :doc:`vL1D cache <vector-l1-cache>`, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Cache lines per normalization unit
+  NC Req:
+    rst: The total number of requests to the L2 to Not-hardware-Coherent (NC) memory
+      allocations, per :ref:`normalization unit <normalization-units>`. See the :ref:`memory-type`
+      for more information.
+    unit: Requests per normalization unit
+  UC Req:
+    rst: The total number of requests to the L2 that go to Uncached (UC) memory allocations.
+      See the :ref:`memory-type` for more information.
+    unit: Requests per normalization unit
+  CC Req:
+    rst: The total number of requests to the L2 that go to Coherently Cacheable (CC)
+      memory allocations. See the :ref:`memory-type` for more information.
+    unit: Requests per normalization unit
+  RW Req:
+    rst: The total number of requests to the L2 that go to Read-Write coherent memory
+      (RW) allocations. See the :ref:`memory-type` for more information.
+    unit: Requests per normalization unit
+L2 Cache Stalls: {}
+L2 - Fabric Interface stalls:
+  Write - Credit Starvation:
+    rst: The number of cycles the L2-Fabric interface was stalled on write or atomic
+      requests to any memory location because too many write/atomic requests were
+      currently in flight, as a percent of the :ref:`total active L2 cycles <total-active-l2-cycles>`.
+    unit: Percent
+L2 - Fabric interface detailed metrics:
+  Read (32B):
+    rst: The total number of L2 requests to Infinity Fabric to read 32B of data from
+      any memory location, per :ref:`normalization unit <normalization-units>`. See
+      :ref:`l2-request-flow` for more detail. Typically unused on CDNA accelerators.
+    unit: Requests per normalization unit
+  Read (64B):
+    rst: The total number of L2 requests to Infinity Fabric to read 64B of data from
+      any memory location, per :ref:`normalization unit <normalization-units>`. See
+      :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Read (Uncached):
+    rst: The total number of L2 requests to Infinity Fabric to read :ref:`uncached
+      data <memory-type>` from any memory location, per :ref:`normalization unit <normalization-units>`.
+      64B requests for uncached data are counted as two 32B uncached data requests.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  HBM Read:
+    rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of
+      data from the accelerator's local HBM, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Remote Read:
+    rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of
+      data from any source other than the accelerator's local HBM, per :ref:`normalization
+      unit <normalization-units>`. See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Write and Atomic (32B):
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically
+      update 32B of data to any memory location, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Write and Atomic (Uncached):
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically
+      update 32B or 64B of :ref:`uncached data <memory-type>`, per :ref:`normalization
+      unit <normalization-units>`. See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Write and Atomic (64B):
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically
+      update 64B of data in any memory location, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  HBM Write and Atomic:
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically
+      update 32B or 64B of data in the accelerator's local HBM, per :ref:`normalization
+      unit <normalization-units>`. See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Remote Write and Atomic:
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically
+      update 32B or 64B of data in any memory location other than the accelerator's
+      local HBM, per :ref:`normalization unit <normalization-units>`. See :ref:`l2-request-flow`
+      for more detail.
+    unit: Requests per normalization unit
+  Atomic:
+    rst: The total number of L2 requests to Infinity Fabric to atomically update 32B
+      or 64B of data in any memory location, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail. Note that on current CDNA accelerators,
+      such as the :ref:`MI2XX <mixxx-note>`, requests are only considered *atomic*
+      by Infinity Fabric if they are targeted at non-write-cacheable memory, such
+      as :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached memory
+      <memory-type>` allocations on the MI2XX.
+    unit: Requests per normalization unit
+Aggregate Stats (All channels):
+  L2 Cache Hit Rate:
+    rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
+      over the total number of incoming cache line requests to the L2 cache. As noted
+      in the :ref:`Speed-of-Light <l2-sol>` section, this includes hit-on-miss requests.
+    unit: Percent
+L2 Cache Hit Rate (pct):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2 Requests (per normUnit):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Requests (per normUnit):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Read Latency (Cycles):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Write and Atomic Latency (Cycles):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Atomic Latency (Cycles):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Read Stall (Cycles per normUnit):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Write and Atomic Stall (Cycles per normUnit):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric (128B read requests per normUnit):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
diff --git a/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx90a_metrics_description.yaml b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx90a_metrics_description.yaml
new file mode 100644
index 0000000000..39e2a52664
--- /dev/null
+++ b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx90a_metrics_description.yaml
@@ -0,0 +1,2035 @@
+System Speed-of-Light:
+  VALU FLOPs:
+    rst: |-
+      The total floating-point operations executed per second on the :ref:`VALU
+      <desc-valu>`. This is also presented as a percent of the peak theoretical
+      FLOPs achievable on the specific accelerator. Note: this does not include
+      any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GFLOPs
+  VALU IOPs:
+    rst: |-
+      The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
+      This is also presented as a percent of the peak theoretical IOPs achievable
+      on the specific accelerator. Note: this does not include any integer operations
+      from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GIOPs
+  MFMA FLOPs (BF16):
+    rst: |-
+      The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
+      operations executed per second. Note: this does not include any 16-bit brain
+      floating point operations from :ref:`VALU <desc-valu>` instructions. This
+      is also presented as a percent of the peak theoretical BF16 MFMA operations
+      achievable on the specific accelerator.
+    unit: GFLOPs
+  MFMA FLOPs (F16):
+    rst: |-
+      The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 16-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F16 MFMA operations achievable on the
+      specific accelerator.
+    unit: GFLOPs
+  MFMA FLOPs (F32):
+    rst: |-
+      The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 32-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F32 MFMA operations achievable on the
+      specific accelerator.
+    unit: GFLOPs
+  MFMA FLOPs (F64):
+    rst: |-
+      The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 64-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F64 MFMA operations achievable on the
+      specific accelerator.
+    unit: GFLOPs
+  MFMA IOPs (Int8):
+    rst: |-
+      The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
+      per second. Note: this does not include any 8-bit integer operations from
+      :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
+      of the peak theoretical INT8 MFMA operations achievable on the specific accelerator.
+    unit: GIOPs
+  Active CUs:
+    rst: Total number of active compute units (CUs) on the accelerator during the
+      kernel execution.
+    unit: Number
+  SALU Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`SALU <desc-salu>`
+      was busy executing instructions. Computed as the ratio of the total number of
+      cycles spent by the :ref:`scheduler <desc-scheduler>` issuing SALU / :ref:`SMEM
+      <desc-smem>` instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VALU Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`VALU <desc-valu>`
+      was busy executing instructions. Does not include :ref:`VMEM <desc-vmem>` operations.
+      Computed as the ratio of the total number of cycles spent by the :ref:`scheduler
+      <desc-scheduler>` issuing VALU instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  MFMA Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`MFMA <desc-mfma>`
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles the :ref:`MFMA <desc-mfma>` unit was busy over the :ref:`total
+      CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VMEM Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`VMEM <desc-vmem>`
+      unit was busy executing instructions, including both global/generic and spill/scratch
+      operations (see the :ref:`VMEM instruction count metrics <ta-instruction-counts>`
+      for more detail). Does not include :ref:`VALU <desc-valu>` operations. Computed
+      as the ratio of the total number of cycles spent by the :ref:`scheduler <desc-scheduler>`
+      issuing VMEM instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  Branch Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`branch <desc-branch>`
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles spent by the :ref:`scheduler <desc-scheduler>` issuing branch instructions
+      over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VALU Active Threads:
+    rst: Indicates the average level of :ref:`divergence <desc-divergence>` within
+      a wavefront over the lifetime of the kernel. The number of work-items that were
+      active in a wavefront during execution of each :ref:`VALU <desc-valu>` instruction,
+      time-averaged over all VALU instructions run on all wavefronts in the kernel.
+    unit: Work-items
+  IPC:
+    rst: The ratio of the total number of instructions executed on the :doc:`CU <compute-unit>`
+      over the :ref:`total active CU cycles <total-active-cu-cycles>`.
+    unit: Instructions per-cycle
+  Wavefront Occupancy:
+    rst: |-
+      The time-averaged number of wavefronts resident on the accelerator over
+      the lifetime of the kernel. Note: this metric may be inaccurate for short-running
+      kernels (less than 1ms). This is also presented as a percent of the peak theoretical
+      occupancy achievable on the specific accelerator.
+    unit: Wavefronts
+  Theoretical LDS Bandwidth:
+    rst: Indicates the maximum amount of bytes that could have been loaded from, stored
+      to, or atomically updated in the LDS per unit time (see :ref:`LDS Bandwidth
+      <lds-bandwidth>` example for more detail). This is also presented as a percent
+      of the peak theoretical bandwidth achievable on the specific accelerator.
+    unit: GB/s
+  LDS Bank Conflicts/Access:
+    rst: The ratio of the number of cycles spent in the :doc:`LDS scheduler <local-data-share>`
+      due to bank conflicts (as determined by the conflict resolution hardware) to
+      the base number of cycles that would be spent in the LDS scheduler in a completely
+      uncontended case. This is also presented in normalized form (i.e., the Bank
+      Conflict Rate).
+    unit: Conflicts/Access
+  vL1D Cache Hit Rate:
+    rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache
+      over the total number of cache line requests to the :ref:`vL1D cache RAM <desc-tc>`.
+    unit: Percent
+  vL1D Cache BW:
+    rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
+      <desc-vmem>` instructions per unit time. The number of bytes is calculated as
+      the number of cache lines requested multiplied by the cache line size. This
+      value does not consider partial requests, so e.g., if only a single value is
+      requested in a cache line, the data movement will still be counted as a full
+      cache line. This is also presented as a percent of the peak theoretical bandwidth
+      achievable on the specific accelerator.
+    unit: GB/s
+  L2 Cache Hit Rate:
+    rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
+      over the total number of incoming cache line requests to the L2 cache.
+    unit: Percent
+  L2 Cache BW:
+    rst: The number of bytes looked up in the L2 cache per unit time. The number of
+      bytes is calculated as the number of cache lines requested multiplied by the
+      cache line size. This value does not consider partial requests, so e.g., if
+      only a single value is requested in a cache line, the data movement will still
+      be counted as a full cache line. This is also presented as a percent of the
+      peak theoretical bandwidth achievable on the specific accelerator.
+    unit: GB/s
+  L2-Fabric Read BW:
+    rst: |-
+      The number of bytes read by the L2 over the :ref:`Infinity Fabric™
+      interface <l2-fabric>` per unit time. This is also presented as a percent
+      of the peak theoretical bandwidth achievable on the specific accelerator.
+    unit: GB/s
+  L2-Fabric Write BW:
+    rst: The number of bytes sent by the L2 over the :ref:`Infinity Fabric interface
+      <l2-fabric>` by write and atomic operations per unit time. This is also presented
+      as a percent of the peak theoretical bandwidth achievable on the specific accelerator.
+    unit: GB/s
+  L2-Fabric Read Latency:
+    rst: The time-averaged number of cycles read requests spent in Infinity Fabric
+      before data was returned to the L2.
+    unit: Cycles
+  L2-Fabric Write Latency:
+    rst: The time-averaged number of cycles write requests spent in Infinity Fabric
+      before a completion acknowledgement was returned to the L2.
+    unit: Cycles
+  sL1D Cache Hit Rate:
+    rst: The percent of sL1D requests that hit on a previously loaded line in the cache.
+      Calculated as the ratio of the number of sL1D requests that hit over the number
+      of all sL1D requests.
+    unit: Percent
+  sL1D Cache BW:
+    rst: The number of bytes looked up in the sL1D cache per unit time. This is also
+      presented as a percent of the peak theoretical bandwidth achievable on the specific
+      accelerator.
+    unit: GB/s
+  L1I Hit Rate:
+    rst: The percent of L1I requests that hit on a previously loaded line in the cache.
+      Calculated as the ratio of the number of L1I requests that hit over the number
+      of all L1I requests.
+    unit: Percent
+  L1I BW:
+    rst: The number of bytes looked up in the L1I cache per unit time. This is also
+      presented as a percent of the peak theoretical bandwidth achievable on the specific
+      accelerator.
+    unit: GB/s
+  L1I Fetch Latency:
+    rst: The average number of cycles spent to fetch instructions to a :doc:`CU <compute-unit>`.
+    unit: Cycles
+Memory Chart:
+  Wavefront Occupancy:
+    rst: Wavefronts per active CU.
+    unit: Wavefronts
+  Wave Life:
+    rst: Average number of cycles executing a wave.
+    unit: Cycles per wave
+  SALU:
+    rst: Total Number of SALU (Scalar ALU) instructions issued per normalization unit.
+    unit: Instructions per normalization unit
+  SMEM:
+    rst: Total number of SMEM (Scalar Memory Read) instructions issued per normalization
+      unit.
+    unit: Instructions per normalization unit
+  VALU:
+    rst: The number of VALU (Vector ALU) instructions issued per normalization unit.
+    unit: Instructions per normalization unit
+  MFMA:
+    rst: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued per
+      normalization unit.
+    unit: Instructions per normalization unit
+  VMEM:
+    rst: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch
+      memory) per normalization unit.
+    unit: Instructions per normalization unit
+  LDS:
+    rst: The total number of LDS instructions (including, but not limited to, read/write/atomics
+      and HIP's ``__shfl`` instructions) executed per normalization unit.
+    unit: Instructions per normalization unit
+  GWS:
+    rst: Total number of GDS (global data sync) instructions issued per normalization
+      unit.
+    unit: Instructions per normalization unit
+  BR:
+    rst: Total number of BRANCH instructions issued per normalization unit.
+    unit: Instructions per normalization unit
+  Active CUs:
+    rst: Total number of active compute units (CUs) on the accelerator during the
+      kernel execution.
+    unit: CUs
+  Num CUs:
+    rst: Total number of compute units (CUs) on the accelerator.
+    unit: CUs
+  VGPR:
+    rst: |-
+      The number of architected vector general-purpose registers allocated for the
+      kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the
+      number of VGPRs requested by the compiler due to allocation granularity.
+    unit: VGPRs
+  SGPR:
+    rst: |-
+      The number of scalar general-purpose registers allocated for the kernel, see
+      :ref:`SALU <desc-salu>`. Note: this may not exactly match the number of
+      SGPRs requested by the compiler due to allocation granularity.
+    unit: SGPRs
+  LDS Allocation:
+    rst: |-
+      The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory)
+      allocated for this kernel. Note: This may also be larger than what was requested
+      at compile time due to both allocation granularity and dynamic per-dispatch
+      LDS allocations.
+    unit: Bytes per workgroup
+  Scratch Allocation:
+    rst: The number of bytes of :ref:`scratch memory <memory-spaces>` requested per
+      work-item for this kernel. Scratch memory is used for stack memory on the accelerator,
+      as well as for register spills and restores.
+    unit: Bytes per work-item
+  Wavefronts:
+    rst: The total number of wavefronts, summed over all workgroups, forming this
+      kernel launch.
+    unit: Wavefronts
+  Workgroups:
+    rst: The total number of workgroups forming this kernel launch.
+    unit: Workgroups
+  LDS Req:
+    rst: The total number of LDS instructions (including, but not limited to, read/write/atomics
+      and HIP's ``__shfl`` instructions) executed per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  LDS Util:
+    rst: Indicates what percent of the kernel's duration the :ref:`LDS <desc-lds>`
+      was actively executing instructions (including, but not limited to, load, store,
+      atomic and HIP's ``__shfl`` operations). Calculated as the ratio of the total
+      number of cycles LDS was active over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  LDS Latency:
+    rst: The average number of round-trip cycles (i.e., from issue to data-return
+      / acknowledgment) required for an LDS instruction to complete.
+    unit: Cycles
+  VL1 Rd:
+    rst: The total number of incoming read requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  VL1 Wr:
+    rst: The total number of incoming write requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  VL1 Atomic:
+    rst: The total number of incoming atomic requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  VL1 Hit:
+    rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache
+      over the total number of cache line requests to the :ref:`vL1D Cache RAM <desc-tc>`.
+    unit: Percent
+  VL1 Lat:
+    rst: Calculated as the average number of cycles that a vL1D cache line request
+      spent in the vL1D cache pipeline.
+    unit: Cycles
+  VL1 Coalesce:
+    rst: Indicates how well memory instructions were coalesced by the :ref:`address
+      processing unit <desc-ta>`, ranging from uncoalesced (25%) to fully coalesced
+      (100%). Calculated as the average number of :ref:`thread-requests <thread-requests>`
+      generated per instruction divided by the ideal number of thread-requests per
+      instruction.
+    unit: Percent
+  VL1 Stall:
+    rst: The ratio of the number of cycles where the vL1D is stalled waiting to issue
+      a request for data to the :doc:`L2 cache <l2-cache>` divided by the number of
+      cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  VL1_L2 Rd:
+    rst: The number of read requests for a vL1D cache line that were not satisfied
+      by the vL1D and must be retrieved from the :doc:`L2 Cache <l2-cache>`
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  VL1_L2 Wr:
+    rst: The number of write requests to a vL1D cache line that were sent through
+      the vL1D to the :doc:`L2 cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  VL1_L2 Atomic:
+    rst: The number of atomic requests that are sent through the vL1D to the :doc:`L2
+      cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`. This
+      includes requests for atomics with, and without return.
+    unit: Requests per normalization unit
+  sL1D Rd:
+    rst: The total number of requests, of any size or type, made to the sL1D per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Requests per normalization unit
+  sL1D Hit:
+    rst: The total number of sL1D requests that hit on a previously loaded cache line,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  sL1D Lat:
+    rst: ''
+    unit: Unknown
+  sL1D_L2 Rd:
+    rst: The total number of read requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  sL1D_L2 Wr:
+    rst: The total number of write requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`. Typically unused on current
+      CDNA accelerators.
+    unit: Requests per normalization unit
+  sL1D_L2 Atomic:
+    rst: The total number of atomic requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`. Typically unused on current
+      CDNA accelerators.
+    unit: Requests per normalization unit
+  IL1 Fetch:
+    rst: The total number of requests made to the L1I per :ref:`normalization-unit
+      <normalization-units>`.
+    unit: Requests per normalization unit
+  IL1 Hit:
+    rst: The percent of L1I requests that hit on a previously loaded cache line,
+      calculated as the number of L1I hits over the number of all L1I requests.
+    unit: Percent
+  IL1 Lat:
+    rst: The average number of cycles spent to fetch instructions to a :doc:`CU <compute-unit>`.
+    unit: Cycles
+  IL1_L2 Rd:
+    rst: The total number of requests across the L1I - L2 interface per normalization-unit.
+    unit: Requests per normalization unit
+  L2 Rd:
+    rst: The total number of read requests to the L2 from all clients.
+    unit: Requests per normalization unit
+  L2 Wr:
+    rst: The total number of write requests to the L2 from all clients.
+    unit: Requests per normalization unit
+  L2 Atomic:
+    rst: The total number of atomic requests (with and without return) to the L2 from
+      all clients.
+    unit: Requests per normalization unit
+  L2 Hit:
+    rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
+      over the total number of incoming cache line requests to the L2 cache.
+    unit: Percent
+  L2 Rd Lat:
+    rst: Calculated as the average number of cycles that the vL1D cache took to issue
+      and receive read requests from the :doc:`L2 Cache <l2-cache>`. This number also
+      includes requests for atomics with return values.
+    unit: Cycles
+  L2 Wr Lat:
+    rst: Calculated as the average number of cycles that the vL1D cache took to issue
+      and receive acknowledgement of a write request to the :doc:`L2 Cache <l2-cache>`.
+      This number also includes requests for atomics without return values.
+    unit: Cycles
+  Fabric_L2 Rd:
+    rst: Number of L2 cache - Infinity Fabric read requests (either 32-byte or 64-byte)
+      summed over TCC instances per normalization unit.
+    unit: Requests per normalization unit
+  Fabric_L2 Wr:
+    rst: Number of L2 cache - Infinity Fabric write requests (either 32-byte or 64-byte)
+      summed over TCC instances per normalization unit.
+    unit: Requests per normalization unit
+  Fabric_L2 Atomic:
+    rst: Number of L2 cache - Infinity Fabric write requests (either 32-byte or 64-byte)
+      that are actually atomic requests summed over TCC instances per normalization
+      unit.
+    unit: Requests per normalization unit
+  Fabric Rd Lat:
+    rst: The time-averaged number of cycles read requests spent in Infinity Fabric
+      before data was returned to the L2.
+    unit: Cycles
+  Fabric Wr Lat:
+    rst: The time-averaged number of cycles write requests spent in Infinity Fabric
+      before a completion acknowledgement was returned to the L2.
+    unit: Cycles
+  Fabric Atomic Lat:
+    rst: The time-averaged number of cycles atomic requests spent in Infinity Fabric
+      before a completion acknowledgement (atomic without return value) or data (atomic
+      with return value) was returned to the L2.
+    unit: Cycles
+  HBM Rd:
+    rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of
+      data from the accelerator's local HBM, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  HBM Wr:
+    rst: The total number of L2 requests to Infinity Fabric to write 32B or 64B of
+      data to the accelerator's local HBM, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+Roofline Performance Rates:
+  VALU FLOPs (F16):
+    rst: |-
+      The total 16-bit floating-point operations executed per second on the :ref:`VALU
+      <desc-valu>`. This is presented with the value of the peak empirical F16 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F16 operations
+      from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GFLOPs
+  VALU FLOPs (F32):
+    rst: |-
+      The total 32-bit floating-point operations executed per second on the :ref:`VALU
+      <desc-valu>`. This is presented with the value of the peak empirical F32 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F32 operations
+      from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GFLOPs
+  VALU FLOPs (F64):
+    rst: |-
+      The total 64-bit floating-point operations executed per second on the :ref:`VALU
+      <desc-valu>`. This is presented with the value of the peak empirical F64 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F64 operations
+      from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GFLOPs
+  MFMA FLOPs (F64):
+    rst: |-
+      The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 64-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
+      measured F64 MFMA operations achievable on the specific accelerator is
+      displayed alongside for comparison.
+    unit: GFLOPs
+  MFMA FLOPs (F32):
+    rst: |-
+      The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 32-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
+      measured F32 MFMA operations achievable on the specific accelerator is
+      displayed alongside for comparison.
+    unit: GFLOPs
+  MFMA FLOPs (F16):
+    rst: |-
+      The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 16-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
+      measured F16 MFMA operations achievable on the specific accelerator is
+      displayed alongside for comparison.
+    unit: GFLOPs
+  MFMA FLOPs (BF16):
+    rst: |-
+      The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
+      operations executed per second. Note: this does not include any 16-bit brain
+      floating point operations from :ref:`VALU <desc-valu>` instructions. The
+      peak empirically measured BF16 MFMA operations achievable on the specific
+      accelerator is displayed alongside for comparison.
+    unit: GFLOPs
+  MFMA IOPs (Int8):
+    rst: |-
+      The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
+      per second. Note: this does not include any 8-bit integer operations from
+      :ref:`VALU <desc-valu>` instructions. The peak empirically measured INT8 MFMA
+      operations achievable on the specific accelerator is displayed alongside
+      for comparison.
+    unit: GIOPs
+  HBM Bandwidth:
+    rst: |-
+      The total number of bytes read from and written to High-Bandwidth
+      Memory (HBM) per second. The peak empirically measured bandwidth achievable
+      on the specific accelerator is displayed alongside for comparison.
+    unit: GB/s
+  L2 Cache Bandwidth:
+    rst: The number of bytes looked up in the L2 cache per unit time. The number of
+      bytes is calculated as the number of cache lines requested multiplied by the
+      cache line size. This value does not consider partial requests, so e.g., if
+      only a single value is requested in a cache line, the data movement will still
+      be counted as a full cache line. The peak empirically measured bandwidth achievable
+      on the specific accelerator is displayed alongside for comparison.
+    unit: GB/s
+  L1 Cache Bandwidth:
+    rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
+      <desc-vmem>` instructions per unit time. The number of bytes is calculated as
+      the number of cache lines requested multiplied by the cache line size. This
+      value does not consider partial requests, so e.g., if only a single value is
+      requested in a cache line, the data movement will still be counted as a full
+      cache line. The peak empirically measured bandwidth achievable on the specific
+      accelerator is displayed alongside for comparison.
+    unit: GB/s
+  LDS Bandwidth:
+    rst: Indicates the maximum amount of bytes that could have been loaded from, stored
+      to, or atomically updated in the LDS per unit time (see :ref:`LDS Bandwidth
+      <lds-bandwidth>` example for more detail). The peak empirically measured LDS
+      bandwidth achievable on the specific accelerator is displayed alongside for
+      comparison.
+    unit: GB/s
+Roofline Plot Points:
+  AI HBM:
+    rst: |-
+      The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
+      It is the ratio of total floating-point operations (FLOPs) to total bytes
+      transferred between HBM and the L2 cache. This value is used as the x-coordinate
+      for the HBM roofline.
+    unit: FLOPs/Byte
+  AI L2:
+    rst: |-
+      The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
+      of total floating-point operations (FLOPs) to total bytes transferred between
+      the L2 cache and the L1 cache. This value is used as the x-coordinate for
+      the L2 roofline.
+    unit: FLOPs/Byte
+  AI L1:
+    rst: |-
+      The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
+      of total floating-point operations (FLOPs) to total bytes transferred between
+      the L1 cache and the processing units. This value is used as the x-coordinate
+      for the L1 roofline.
+    unit: FLOPs/Byte
+  Performance (GFLOPs):
+    rst: |-
+      The overall achieved performance, measured in GigaFLOPs
+      per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
+      operations divided by the total execution time. This value is used as the y-coordinate
+      for the kernel's point on the Roofline plot.
+    unit: GFLOP/s
+Command processor fetcher (CPF):
+  CPF Utilization:
+    rst: Percent of total cycles where the CPF was busy actively doing any work. The
+      ratio of CPF busy cycles over total cycles counted by the CPF.
+    unit: Percent
+  CPF Stall:
+    rst: Percent of CPF busy cycles where the CPF was stalled for any reason.
+    unit: Percent
+  CPF-L2 Utilization:
+    rst: Percent of total cycles counted by the CPF-:doc:`L2 <l2-cache>` interface
+      where the CPF-L2 interface was active doing any work. The ratio of CPF-L2 busy
+      cycles over total cycles counted by the CPF-L2.
+    unit: Percent
+  CPF-L2 Stall:
+    rst: Percent of CPF-:doc:`L2 <l2-cache>` busy cycles where the CPF-L2 interface
+      was stalled for any reason.
+    unit: Percent
+  CPF-UTCL1 Stall:
+    rst: Percent of CPF busy cycles where the CPF was stalled by address translation.
+    unit: Percent
+Command processor packet processor (CPC):
+  CPC Utilization:
+    rst: Percent of total cycles where the CPC was busy actively doing any work. The
+      ratio of CPC busy cycles over total cycles counted by the CPC.
+    unit: Percent
+  CPC Stall Rate:
+    rst: Percent of CPC busy cycles where the CPC was stalled for any reason.
+    unit: Percent
+  CPC Packet Decoding Utilization:
+    rst: Percent of CPC busy cycles spent decoding commands for processing.
+    unit: Percent
+  CPC-Workgroup Manager Utilization:
+    rst: Percent of CPC busy cycles spent dispatching workgroups to the :ref:`workgroup
+      manager <desc-spi>`.
+    unit: Percent
+  CPC-L2 Utilization:
+    rst: Percent of total cycles counted by the CPC-:doc:`L2 <l2-cache>` interface
+      where the CPC-L2 interface was active doing any work.
+    unit: Percent
+  CPC-UTCL1 Stall:
+    rst: Percent of CPC busy cycles where the CPC was stalled by address translation.
+    unit: Percent
+  CPC-UTCL2 Utilization:
+    rst: Percent of total cycles counted by the CPC's :doc:`L2 <l2-cache>` address
+      translation interface where the CPC was busy doing address translation work.
+    unit: Percent
+Workgroup manager utilizations:
+  Accelerator Utilization:
+    rst: The percent of cycles in the kernel where the accelerator was actively doing
+      any work.
+    unit: Percent
+  Scheduler-Pipe Utilization:
+    rst: |-
+      The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
+      in the kernel where the scheduler-pipes were actively doing any work. Note: this
+      value is expected to range between 0% and 25%. See :ref:`desc-spi`.
+    unit: Percent
+  Workgroup Manager Utilization:
+    rst: The percent of cycles in the kernel where the workgroup manager was actively
+      doing any work.
+    unit: Percent
+  Shader Engine Utilization:
+    rst: The percent of :ref:`total shader engine cycles <total-se-cycles>` in the
+      kernel where any CU in a shader-engine was actively doing any work, normalized
+      over all shader-engines. Low values (e.g., << 100%) indicate that the accelerator
+      was not fully saturated by the kernel, or a potential load-imbalance issue.
+    unit: Percent
+  SIMD Utilization:
+    rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel
+      where any :ref:`SIMD <desc-valu>` on a CU was actively doing any work, summed
+      over all CUs. Low values (less than 100%) indicate that the accelerator was
+      not fully saturated by the kernel, or a potential load-imbalance issue.
+    unit: Percent
+  Dispatched Workgroups:
+    rst: The total number of workgroups forming this kernel launch.
+    unit: Workgroups
+  Dispatched Wavefronts:
+    rst: The total number of wavefronts, summed over all workgroups, forming this
+      kernel launch.
+    unit: Wavefronts
+  VGPR Writes:
+    rst: The average number of cycles spent initializing :ref:`VGPRs <desc-valu>`
+      at wave creation.
+    unit: Cycles/wave
+  SGPR Writes:
+    rst: The average number of cycles spent initializing :ref:`SGPRs <desc-salu>`
+      at wave creation.
+    unit: Cycles/wave
+Workgroup Manager - Resource Allocation:
+  Not-scheduled Rate (Workgroup Manager):
+    rst: |-
+      The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
+      in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
+      due to a bottleneck within the workgroup manager rather than a lack of a
+      CU or :ref:`SIMD <desc-valu>` with sufficient resources. Note: this value
+      is expected to range between 0-25%. See note in :ref:`workgroup manager <desc-spi>`
+      description.
+    unit: Percent
+  Not-scheduled Rate (Scheduler-Pipe):
+    rst: |-
+      The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
+      in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
+      due to a bottleneck within the scheduler-pipes rather than a lack of a CU
+      or :ref:`SIMD <desc-valu>` with sufficient resources. Note: this value is
+      expected to range between 0-25%, see note in :ref:`workgroup manager <desc-spi>`
+      description.
+    unit: Percent
+  Scheduler-Pipe Stall Rate:
+    rst: |-
+      The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
+      in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
+      due to occupancy limitations (like a lack of a CU or :ref:`SIMD <desc-valu>`
+      with sufficient resources). Note: this value is expected to range between
+      0-25%, see note in :ref:`workgroup manager <desc-spi>` description.
+    unit: Percent
+  Scratch Stall Rate:
+    rst: The percent of :ref:`total shader-engine cycles <total-se-cycles>` in the
+      kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
+      due to lack of :ref:`private (a.k.a., scratch) memory <memory-type>` slots.
+      While this can reach up to 100%, note that the actual occupancy limitations
+      on a kernel using private memory are typically quite small (for example, less
+      than 1% of the total number of waves that can be scheduled to an accelerator).
+    unit: Percent
+  Insufficient SIMD Waveslots:
+    rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel
+      where a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to
+      lack of available :ref:`waveslots <desc-valu>`.
+    unit: Percent
+  Insufficient SIMD VGPRs:
+    rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel
+      where a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to
+      lack of available :ref:`VGPRs <desc-valu>`.
+    unit: Percent
+  Insufficient SIMD SGPRs:
+    rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel
+      where a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to
+      lack of available :ref:`SGPRs <desc-salu>`.
+    unit: Percent
+  Insufficient CU LDS:
+    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
+      a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to lack
+      of available :doc:`LDS <local-data-share>`.
+    unit: Percent
+  Insufficient CU Barriers:
+    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
+      a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to lack
+      of available :ref:`barriers <desc-barrier>`.
+    unit: Percent
+  Reached CU Workgroup Limit:
+    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
+      a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to limits
+      within the workgroup manager. This is expected to always be zero on CDNA2
+      or newer accelerators (and small for previous accelerators).
+    unit: Percent
+  Reached CU Wavefront Limit:
+    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
+      a wavefront could not be scheduled to a :doc:`CU <compute-unit>` due to limits
+      within the workgroup manager. This is expected to always be zero on CDNA2
+      or newer accelerators (and small for previous accelerators).
+    unit: Percent
+Wavefront Launch Stats:
+  Grid Size:
+    rst: The total number of work-items (or, threads) launched as a part of the kernel
+      dispatch. In HIP, this is equivalent to the total grid size multiplied by the
+      total workgroup (or, block) size.
+    unit: Work-Items
+  Workgroup Size:
+    rst: The total number of work-items (or, threads) in each workgroup (or, block)
+      launched as part of the kernel dispatch. In HIP, this is equivalent to the total
+      block size.
+    unit: Work-Items
+  Total Wavefronts:
+    rst: |-
+      The total number of wavefronts launched as part of the kernel dispatch.
+      On AMD Instinct™ CDNA™ accelerators and GCN™ GPUs, the wavefront
+      size is always 64 work-items. Thus, the total number of wavefronts should
+      be equivalent to the ceiling of grid size divided by 64.
+    unit: Wavefronts
+  Saved Wavefronts:
+    rst: The total number of wavefronts saved at a context-save. See `cwsr_enable
+      <https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr>`_.
+    unit: Wavefronts
+  Restored Wavefronts:
+    rst: The total number of wavefronts restored from a context-save. See `cwsr_enable
+      <https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr>`_.
+    unit: Wavefronts
+  VGPRs:
+    rst: |-
+      The number of architected vector general-purpose registers allocated for the
+      kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the
+      number of VGPRs requested by the compiler due to allocation granularity.
+    unit: VGPRs
+  AGPRs:
+    rst: |-
+      The number of accumulation vector general-purpose registers allocated
+      for the kernel, see :ref:`AGPRs <desc-agprs>`. Note: this may not exactly match
+      the number of AGPRs requested by the compiler due to allocation granularity.
+    unit: AGPRs
+  SGPRs:
+    rst: |-
+      The number of scalar general-purpose registers allocated for the kernel, see
+      :ref:`SALU <desc-salu>`. Note: this may not exactly match the number of
+      SGPRs requested by the compiler due to allocation granularity.
+    unit: SGPRs
+  LDS Allocation:
+    rst: |-
+      The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory)
+      allocated for this kernel. Note: This may also be larger than what was requested
+      at compile time due to both allocation granularity and dynamic per-dispatch
+      LDS allocations.
+    unit: Bytes per workgroup
+  Scratch Allocation:
+    rst: The number of bytes of :ref:`scratch memory <memory-spaces>` requested per
+      work-item for this kernel. Scratch memory is used for stack memory on the accelerator,
+      as well as for register spills and restores.
+    unit: Bytes per work-item
+Wavefront Runtime Stats:
+  Kernel Time:
+    rst: The total duration of the executed kernel.
+    unit: Nanoseconds
+  Kernel Time (Cycles):
+    rst: The total duration of the executed kernel in cycles.
+    unit: Cycles
+  Instructions per wavefront:
+    rst: The average number of instructions (of all types) executed per wavefront.
+      This is averaged over all wavefronts in a kernel dispatch.
+    unit: Instructions per wavefront
+  Wave Cycles:
+    rst: |-
+      The number of cycles a wavefront in the kernel dispatch spent resident
+      on a compute unit per :ref:`normalization unit <normalization-units>`. This is
+      averaged over all wavefronts in a kernel dispatch. Note: this should not
+      be directly compared to the kernel cycles above.
+    unit: Cycles per normalization unit
+  Dependency Wait Cycles:
+    rst: The number of cycles a wavefront in the kernel dispatch stalled waiting on
+      memory of any kind (e.g., instruction fetch, vector or scalar memory, etc.)
+      per :ref:`normalization unit <normalization-units>`. This counter is incremented
+      at every cycle by *all* wavefronts on a CU stalled at a memory operation. As
+      such, it is most useful to get a sense of how waves were spending their time,
+      rather than identification of a precise limiter because another wave could be
+      actively executing while a wave is stalled. The sum of this metric, Issue Wait
+      Cycles and Active Cycles should be equal to the total Wave Cycles metric.
+    unit: Cycles per normalization unit
+  Issue Wait Cycles:
+    rst: The number of cycles a wavefront in the kernel dispatch was unable to issue
+      an instruction for any reason (e.g., execution pipe back-pressure, arbitration
+      loss, etc.) per :ref:`normalization unit <normalization-units>`. This counter
+      is incremented at every cycle by *all* wavefronts on a CU unable to issue an
+      instruction. As such, it is most useful to get a sense of how waves were spending
+      their time, rather than identification of a precise limiter because another
+      wave could be actively executing while a wave is issue stalled. The sum of this
+      metric, Dependency Wait Cycles and Active Cycles should be equal to the total
+      Wave Cycles metric.
+    unit: Cycles per normalization unit
+  Active Cycles:
+    rst: The average number of cycles a wavefront in the kernel dispatch was actively
+      executing instructions per :ref:`normalization unit <normalization-units>`.
+      This measurement is made on a per-wavefront basis, and may include cycles that
+      another wavefront spent actively executing (on another execution unit, for example)
+      or was stalled. As such, it is most useful to get a sense of how waves were
+      spending their time, rather than identification of a precise limiter. The sum
+      of this metric, Issue Wait Cycles and Dependency Wait Cycles should be equal to
+      the total Wave Cycles metric.
+    unit: Cycles per normalization unit
+  Wavefront Occupancy:
+    rst: |-
+      The time-averaged number of wavefronts resident on the accelerator over the
+      lifetime of the kernel. Note: this metric may be inaccurate for short-running
+      kernels (less than 1ms).
+    unit: Wavefronts
+Overall Instruction Mix:
+  VALU:
+    rst: The total number of vector arithmetic logic unit (VALU) operations issued.
+      These are the workhorses of the :doc:`compute unit <compute-unit>`, and are
+      used to execute a wide range of instruction types including floating point operations,
+      non-uniform address calculations, transcendental operations, integer operations,
+      shifts, conditional evaluation, etc.
+    unit: Instructions
+  VMEM:
+    rst: The total number of vector memory operations issued. These include most loads,
+      stores and atomic operations and all accesses to :ref:`generic, global, private
+      and texture <memory-spaces>` memory.
+    unit: Instructions
+  LDS:
+    rst: The total number of LDS (also known as shared memory) operations issued.
+      These include loads, stores, atomics, and HIP's ``__shfl`` operations.
+    unit: Instructions
+  MFMA:
+    rst: The total number of matrix fused multiply-add instructions issued.
+    unit: Instructions
+  SALU:
+    rst: The total number of scalar arithmetic logic unit (SALU) operations issued.
+      Typically these are used for address calculations, literal constants, and other
+      operations that are provably uniform across a wavefront. Although scalar memory
+      (SMEM) operations are issued by the SALU, they are counted separately in this
+      section.
+    unit: Instructions
+  SMEM:
+    rst: The total number of scalar memory (SMEM) operations issued. These are typically
+      used for loading kernel arguments, base-pointers and loads from HIP's ``__constant__``
+      memory.
+    unit: Instructions
+  Branch:
+    rst: The total number of branch operations issued. These typically consist of
+      jump or branch operations and are used to implement control flow.
+    unit: Instructions
+VALU Arithmetic Instruction Mix:
+  INT32:
+    rst: The total number of instructions operating on 32-bit integer operands issued
+      to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  INT64:
+    rst: The total number of instructions operating on 64-bit integer operands issued
+      to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F16-ADD:
+    rst: The total number of addition instructions operating on 16-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F16-MUL:
+    rst: The total number of multiplication instructions operating on 16-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F16-FMA:
+    rst: The total number of fused multiply-add instructions operating on 16-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F16-Trans:
+    rst: The total number of transcendental instructions (such as ``sqrt``) operating
+      on 16-bit floating-point operands issued to the VALU per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F32-ADD:
+    rst: The total number of addition instructions operating on 32-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F32-MUL:
+    rst: The total number of multiplication instructions operating on 32-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F32-FMA:
+    rst: The total number of fused multiply-add instructions operating on 32-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F32-Trans:
+    rst: The total number of transcendental instructions (such as ``sqrt``) operating
+      on 32-bit floating-point operands issued to the VALU per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F64-ADD:
+    rst: The total number of addition instructions operating on 64-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F64-MUL:
+    rst: The total number of multiplication instructions operating on 64-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F64-FMA:
+    rst: The total number of fused multiply-add instructions operating on 64-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F64-Trans:
+    rst: The total number of transcendental instructions (such as ``sqrt``) operating
+      on 64-bit floating-point operands issued to the VALU per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Conversion:
+    rst: |-
+      The total number of type conversion instructions (such as converting data
+      to or from F32↔F64) issued to the VALU per :ref:`normalization unit
+      <normalization-units>`.
+    unit: Instructions per normalization unit
+VMEM Instruction Mix:
+  Global/Generic Instr:
+    rst: The total number of global & generic memory instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Read:
+    rst: The total number of global & generic memory read instructions executed on
+      all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Write:
+    rst: The total number of global & generic memory write instructions executed on
+      all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Atomic:
+    rst: The total number of global & generic memory atomic (with and without return)
+      instructions executed on all :doc:`compute units <compute-unit>` on the accelerator,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Instr:
+    rst: The total number of spill/stack memory instructions executed on all :doc:`compute
+      units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Read:
+    rst: The total number of spill/stack memory read instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Write:
+    rst: The total number of spill/stack memory write instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Atomic:
+    rst: The total number of spill/stack memory atomic (with and without return) instructions
+      executed on all :doc:`compute units <compute-unit>` on the accelerator, per
+      :ref:`normalization unit <normalization-units>`. Typically unused as these memory
+      operations are generally used to implement thread-local storage.
+    unit: Instructions per normalization unit
+MFMA Arithmetic Instruction Mix:
+  MFMA-I8:
+    rst: The total number of 8-bit integer :ref:`MFMA <desc-mfma>` instructions issued
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  MFMA-F16:
+    rst: The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` instructions
+      issued per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  MFMA-BF16:
+    rst: The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>` instructions
+      issued per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  MFMA-F32:
+    rst: The total number of 32-bit floating-point :ref:`MFMA <desc-mfma>` instructions
+      issued per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  MFMA-F64:
+    rst: The total number of 64-bit floating-point :ref:`MFMA <desc-mfma>` instructions
+      issued per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+Compute Speed-of-Light:
+  VALU FLOPs:
+    rst: |-
+      The total floating-point operations executed per second on the :ref:`VALU
+      <desc-valu>`. This is also presented as a percent of the peak theoretical
+      FLOPs achievable on the specific accelerator. Note: this does not include
+      any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GFLOPs
+  VALU IOPs:
+    rst: |-
+      The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
+      This is also presented as a percent of the peak theoretical IOPs achievable
+      on the specific accelerator. Note: this does not include any integer operations
+      from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GIOPs
+  MFMA FLOPs (BF16):
+    rst: |-
+      The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 16-bit brain floating
+      point operations from :ref:`VALU <desc-valu>` instructions. This is also
+      presented as a percent of the peak theoretical BF16 MFMA operations achievable
+      on the specific accelerator.
+    unit: GFLOPs
+  MFMA FLOPs (F16):
+    rst: |-
+      The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 16-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F16 MFMA operations achievable on the
+      specific accelerator.
+    unit: GFLOPs
+  MFMA FLOPs (F32):
+    rst: |-
+      The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 32-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F32 MFMA operations achievable on the
+      specific accelerator.
+    unit: GFLOPs
+  MFMA FLOPs (F64):
+    rst: |-
+      The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 64-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F64 MFMA operations achievable on the
+      specific accelerator. The total number of 64-bit floating point :ref:`MFMA
+      <desc-mfma>` operations executed per second. Note: this does not include
+      any 64-bit floating point operations from :ref:`VALU <desc-valu>` instructions.
+      This is also presented as a percent of the peak theoretical F64 MFMA operations
+      achievable on the specific accelerator.
+    unit: GFLOPs
+  MFMA IOPs (INT8):
+    rst: |-
+      The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
+      per second. Note: this does not include any 8-bit integer operations from
+      :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
+      of the peak theoretical INT8 MFMA operations achievable on the specific accelerator.
+    unit: GIOPs
+Pipeline Statistics:
+  IPC:
+    rst: The ratio of the total number of instructions executed on the :doc:`CU <compute-unit>`
+      over the :ref:`total active CU cycles <total-active-cu-cycles>`.
+    unit: Instructions per cycle
+  IPC (Issued):
+    rst: The ratio of the total number of (non-:ref:`internal <ipc-internal-instructions>`)
+      instructions issued over the number of cycles where the :ref:`scheduler <desc-scheduler>`
+      was actively working on issuing instructions. Refer to the :ref:`Issued IPC
+      <issued-ipc>` example for further detail.
+    unit: Instructions per cycle
+  SALU Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`SALU <desc-salu>`
+      was busy executing instructions. Computed as the ratio of the total number of
+      cycles spent by the :ref:`scheduler <desc-scheduler>` issuing SALU / :ref:`SMEM
+      <desc-smem>` instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VALU Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`VALU <desc-valu>`
+      was busy executing instructions. Does not include :ref:`VMEM <desc-vmem>` operations.
+      Computed as the ratio of the total number of cycles spent by the :ref:`scheduler
+      <desc-scheduler>` issuing VALU instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VMEM Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`VMEM <desc-vmem>`
+      unit was busy executing instructions, including both global/generic and spill/scratch
+      operations (see the :ref:`VMEM instruction count metrics <ta-instruction-counts>`
+      for more detail). Does not include :ref:`VALU <desc-valu>` operations. Computed
+      as the ratio of the total number of cycles spent by the :ref:`scheduler <desc-scheduler>`
+      issuing VMEM instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  Branch Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`branch <desc-branch>`
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles spent by the :ref:`scheduler <desc-scheduler>` issuing branch instructions
+      over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VALU Active Threads:
+    rst: Indicates the average level of :ref:`divergence <desc-divergence>` within
+      a wavefront over the lifetime of the kernel. The number of work-items that were
+      active in a wavefront during execution of each :ref:`VALU <desc-valu>` instruction,
+      time-averaged over all VALU instructions run on all wavefronts in the kernel.
+    unit: Work-items
+  MFMA Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`MFMA <desc-mfma>`
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles the :ref:`MFMA <desc-mfma>` unit was busy over the :ref:`total
+      CU cycles <total-cu-cycles>`.
+    unit: Percent
+  MFMA Instruction Cycles:
+    rst: The average duration of :ref:`MFMA <desc-mfma>` instructions in this kernel
+      in cycles. Computed as the ratio of the total number of cycles the MFMA unit
+      was busy over the total number of MFMA instructions. Compare to, for example,
+      the `AMD Matrix Instruction Calculator <https://github.com/RadeonOpenCompute/amd_matrix_instruction_calculator>`_.
+    unit: Cycles per instruction
+  VMEM Latency:
+    rst: The average number of round-trip cycles (that is, from issue to data return
+      / acknowledgment) required for a VMEM instruction to complete.
+    unit: Cycles
+  SMEM Latency:
+    rst: The average number of round-trip cycles (that is, from issue to data return
+      / acknowledgment) required for a SMEM instruction to complete.
+    unit: Cycles
+Arithmetic Operations:
+  FLOPs (Total):
+    rst: The total number of floating-point operations executed on either the :ref:`VALU
+      <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
+      <normalization-units>`.
+    unit: FLOP per normalization unit
+  IOPs (Total):
+    rst: The total number of integer operations executed on either the :ref:`VALU
+      <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
+      <normalization-units>`.
+    unit: IOP per normalization unit
+  F16 OPs:
+    rst: The total number of 16-bit floating-point operations executed on either the
+      :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: FLOP per normalization unit
+  BF16 OPs:
+    rst: |-
+      The total number of 16-bit brain floating-point operations executed on
+      either the :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
+      unit <normalization-units>`. Note: on current CDNA accelerators, the VALU
+      has no native BF16 instructions.
+    unit: FLOP per normalization unit
+  F32 OPs:
+    rst: The total number of 32-bit floating-point operations executed on either the
+      :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: FLOP per normalization unit
+  F64 OPs:
+    rst: The total number of 64-bit floating-point operations executed on either the
+      :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: FLOP per normalization unit
+  INT8 OPs:
+    rst: |-
+      The total number of 8-bit integer operations executed on either the :ref:`VALU
+      <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
+      <normalization-units>`. Note: on current CDNA accelerators, the VALU has
+      no native INT8 instructions.
+    unit: IOP per normalization unit
+LDS Speed-of-Light:
+  Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`LDS <desc-lds>`
+      was actively executing instructions (including, but not limited to, load, store,
+      atomic and HIP's ``__shfl`` operations). Calculated as the ratio of the total
+      number of cycles LDS was active over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  Access Rate:
+    rst: Indicates the percentage of SIMDs in the :ref:`VALU <desc-valu>` [#lds-workload]_
+      actively issuing LDS instructions, averaged over the lifetime of the kernel.
+      Calculated as the ratio of the total number of cycles spent by the :ref:`scheduler
+      <desc-scheduler>` issuing :ref:`LDS <desc-lds>` instructions over the :ref:`total
+      CU cycles <total-cu-cycles>`.
+    unit: Percent
+  Theoretical Bandwidth Utilization:
+    rst: Indicates the maximum amount of bytes that could have been loaded from, stored
+      to, or atomically updated in the LDS, expressed as a percentage of the theoretical peak.
+      Does *not* take into account the execution mask of the wavefront when the instruction
+      was executed. See the :ref:`LDS bandwidth example <lds-bandwidth>` for more
+      detail.
+    unit: Percent
+  Bank Conflict Rate:
+    rst: Indicates the percentage of active LDS cycles that were spent servicing bank
+      conflicts. Calculated as the ratio of LDS cycles spent servicing bank conflicts
+      over the number of LDS cycles that would have been required to move the same
+      amount of data in an uncontended access. [#lds-bank-conflict]_
+    unit: Percent
+LDS Statistics:
+  LDS Instructions:
+    rst: The total number of LDS instructions (including, but not limited to, read/write/atomics
+      and HIP's ``__shfl`` instructions) executed per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Theoretical Bandwidth:
+    rst: Indicates the maximum amount of bytes that could have been loaded from, stored
+      to, or atomically updated in the LDS divided by total duration. Does *not* take
+      into account the execution mask of the wavefront when the instruction was executed.
+      See the :ref:`LDS bandwidth example <lds-bandwidth>` for more detail.
+    unit: Gbps
+  LDS Latency:
+    rst: The average number of round-trip cycles (that is, from issue to data return
+      / acknowledgment) required for an LDS instruction to complete.
+    unit: Cycles
+  Bank Conflicts/Access:
+    rst: The ratio of the number of cycles spent in the :ref:`LDS scheduler <desc-lds>`
+      due to bank conflicts (as determined by the conflict resolution hardware) to
+      the base number of cycles that would be spent in the LDS scheduler in a completely
+      uncontended case. This is the unnormalized form of the Bank Conflict Rate.
+    unit: Conflicts per Access
+  Index Accesses:
+    rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` over
+      all operations per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Atomic Return Cycles:
+    rst: The total number of cycles spent on LDS atomics with return per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Bank Conflict:
+    rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due
+      to bank conflicts (as determined by the conflict resolution hardware) per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Addr Conflict:
+    rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due
+      to address conflicts (as determined by the conflict resolution hardware) per
+      :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Unaligned Stall:
+    rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due
+      to stalls from non-dword aligned addresses per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Mem Violations:
+    rst: |-
+      The total number of out-of-bounds accesses made to the LDS, per :ref:`normalization
+      unit <normalization-units>`. This is unused and expected to be zero in
+      most configurations for modern CDNA™ accelerators.
+    unit: Accesses per normalization unit
+L1I Speed-of-Light:
+  Bandwidth Utilization:
+    rst: The number of bytes looked up in the L1I cache, as a percent of the peak
+      theoretical bandwidth. Calculated as the ratio of L1I requests over the :ref:`total
+      L1I cycles <total-l1i-cycles>`.
+    unit: Percent
+  Cache Hit Rate:
+    rst: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded
+      line in the cache. Calculated as the ratio of the number of L1I requests that hit
+      over the number of all L1I requests.
+    unit: Percent
+  L1I-L2 Bandwidth Utilization:
+    rst: |-
+      The percent of the peak theoretical L1I → L2 cache request bandwidth
+      achieved. Calculated as the ratio of the total number of requests from
+      the L1I to the L2 cache over the :ref:`total L1I-L2 interface cycles <total-l1i-cycles>`.
+    unit: Percent
+L1I cache accesses:
+  Req:
+    rst: The total number of requests made to the L1I per :ref:`normalization-unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Hits:
+    rst: The total number of L1I requests that hit on a previously loaded cache line,
+      per :ref:`normalization-unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Misses - Non Duplicated:
+    rst: The total number of L1I requests that missed on a cache line that *were not*
+      already pending due to another request, per :ref:`normalization-unit <normalization-units>`.
+      See note in :ref:`desc-l1i-sol` for more detail.
+    unit: Requests per normalization unit
+  Misses - Duplicated:
+    rst: The total number of L1I requests that missed on a cache line that *were*
+      already pending due to another request, per :ref:`normalization-unit <normalization-units>`.
+      See note in :ref:`desc-l1i-sol` for more detail.
+    unit: Requests per normalization unit
+  Cache Hit Rate:
+    rst: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded
+      line in the cache. Calculated as the ratio of the number of L1I requests that hit
+      over the number of all L1I requests.
+    unit: Percent
+  Instruction Fetch Latency:
+    rst: The average number of cycles spent to fetch instructions to a :doc:`CU <compute-unit>`.
+    unit: Cycles
+L1I <-> L2 interface:
+  L1I-L2 Bandwidth:
+    rst: Total number of bytes transferred across L1I - L2 interface divided by total
+      duration.
+    unit: Gbps
+Scalar L1D Speed-of-Light:
+  Bandwidth Utilization:
+    rst: The number of bytes looked up in the sL1D cache, as a percent of the peak
+      theoretical bandwidth. Calculated as the ratio of sL1D requests over the :ref:`total
+      sL1D cycles <total-sl1d-cycles>`.
+    unit: Percent
+  Cache Hit Rate:
+    rst: Indicates the percent of sL1D requests that hit on a previously loaded line
+      in the cache. The ratio of the number of sL1D requests that hit [#sl1d-cache]_
+      over the number of all sL1D requests.
+    unit: Percent
+  sL1D-L2 BW Utilization:
+    rst: The percentage of the peak theoretical sL1D - L2 interface bandwidth achieved.
+      Calculated as total number of bytes read from, written to, or atomically updated
+      across the sL1D - L2 interface.
+    unit: Percent
+Scalar L1D cache accesses:
+  Req:
+    rst: The total number of requests, of any size or type, made to the sL1D per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Hits:
+    rst: The total number of sL1D requests that hit on a previously loaded cache line,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Misses - Non Duplicated:
+    rst: The total number of sL1D requests that missed on a cache line that *was not*
+      already pending due to another request, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`desc-sl1d-sol` for more detail.
+    unit: Requests per normalization unit
+  Misses- Duplicated:
+    rst: The total number of sL1D requests that missed on a cache line that *was*
+      already pending due to another request, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`desc-sl1d-sol` for more detail.
+    unit: Requests per normalization unit
+  Cache Hit Rate:
+    rst: Indicates the percent of sL1D requests that hit on a previously loaded line
+      in the cache. The ratio of the number of sL1D requests that hit [#sl1d-cache]_
+      over the number of all sL1D requests.
+    unit: Percent
+  Read Req (Total):
+    rst: The total number of sL1D read requests of any size, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Atomic Req:
+    rst: The total number of atomic requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`. Typically unused on current
+      CDNA accelerators.
+    unit: Requests per normalization unit
+  Read Req (1 DWord):
+    rst: The total number of sL1D read requests made for a single dword of data (4B),
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Read Req (2 DWord):
+    rst: The total number of sL1D read requests made for two dwords of data (8B),
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Read Req (4 DWord):
+    rst: The total number of sL1D read requests made for four dwords of data (16B),
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Read Req (8 DWord):
+    rst: The total number of sL1D read requests made for eight dwords of data (32B),
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Read Req (16 DWord):
+    rst: The total number of sL1D read requests made for sixteen dwords of data
+      (64B), per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+Scalar L1D Cache - L2 Interface:
+  sL1D-L2 BW:
+    rst: |-
+      The total number of bytes read from, written to, or atomically updated
+      across the sL1D ↔ :doc:`L2 <l2-cache>` interface, divided by total duration.
+      Note that sL1D writes and atomics are typically
+      unused on current CDNA accelerators, so in the majority of cases this can
+      be interpreted as an sL1D → L2 read bandwidth.
+    unit: Gbps
+  Read Req:
+    rst: The total number of read requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Write Req:
+    rst: The total number of write requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`. Typically unused on current
+      CDNA accelerators.
+    unit: Requests per normalization unit
+  Atomic Req:
+    rst: The total number of atomic requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`. Typically unused on current
+      CDNA accelerators.
+    unit: Requests per normalization unit
+  Stall Cycles:
+    rst: |-
+      The total number of cycles the sL1D ↔ :doc:`L2 <l2-cache>` interface
+      was stalled, per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+Busy and stall metrics:
+  Address Processing Unit Busy:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor
+      was busy.
+    unit: Percent
+  Address Stall:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor
+      was stalled from sending address requests further into the vL1D pipeline.
+    unit: Percent
+  Data Stall:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor
+      was stalled from sending write/atomic data further into the vL1D pipeline.
+    unit: Percent
+  Data-Processor → Address Stall:
+    rst: Percent of :ref:`total CU cycles <total-cu-cycles>` the address processor
+      was stalled waiting to send command data to the :ref:`data processor <desc-td>`.
+    unit: Percent
+  Sequencer → TA Address Stall:
+    rst: ''
+    unit: Unknown
+  Sequencer → TA Command Stall:
+    rst: ''
+    unit: Unknown
+  Sequencer → TA Data Stall:
+    rst: ''
+    unit: Unknown
+Instruction counts:
+  Total Instructions:
+    rst: The total number of memory instructions executed by the address processor
+      over all compute units on the accelerator, per normalization unit.
+    unit: Instructions per normalization unit
+  Global/Generic Instructions:
+    rst: The total number of global & generic memory instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Read Instructions:
+    rst: The total number of global & generic memory read instructions executed on
+      all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Write Instructions:
+    rst: The total number of global & generic memory write instructions executed on
+      all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Atomic Instructions:
+    rst: The total number of global & generic memory atomic (with and without return)
+      instructions executed on all :doc:`compute units <compute-unit>` on the accelerator,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Instructions:
+    rst: The total number of spill/stack memory instructions executed on all :doc:`compute
+      units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Read Instructions:
+    rst: The total number of spill/stack memory read instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Write Instructions:
+    rst: The total number of spill/stack memory write instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Atomic Instructions:
+    rst: The total number of spill/stack memory atomic (with and without return) instructions
+      executed on all :doc:`compute units <compute-unit>` on the accelerator, per
+      :ref:`normalization unit <normalization-units>`. Typically unused as these memory
+      operations are typically used to implement thread-local storage.
+    unit: Instructions per normalization unit
+Spill and stack metrics:
+  Spill/Stack Total Cycles:
+    rst: The number of cycles the address processing unit spent working on spill/stack
+      instructions, per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Spill/Stack Coalesced Read:
+    rst: The number of cycles the address processing unit spent working on coalesced
+      spill/stack read instructions, per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Spill/Stack Coalesced Write:
+    rst: The number of cycles the address processing unit spent working on coalesced
+      spill/stack write instructions, per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+Vector L1 data-return path or Texture Data (TD):
+  Data-Return Busy:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return unit
+      was busy processing or waiting on data to return to the :doc:`CU <compute-unit>`.
+    unit: Percent
+  Cache RAM → Data-Return Stall:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return unit
+      was stalled on data to be returned from the :ref:`vL1D Cache RAM <desc-tc>`.
+    unit: Percent
+  Workgroup manager → Data-Return Stall:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return unit
+      was stalled by the :ref:`workgroup manager <desc-spi>` due to initialization
+      of registers as a part of launching new workgroups.
+    unit: Percent
+  Coalescable Instructions:
+    rst: The number of instructions submitted to the :ref:`data-return unit <desc-td>`
+      by the :ref:`address processor <desc-ta>` that were found to be coalescable,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Read Instructions:
+    rst: The number of read instructions submitted to the :ref:`data-return unit <desc-td>`
+      by the :ref:`address processor <desc-ta>` summed over all :doc:`compute units
+      <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+      This is expected to be the sum of global/generic and spill/stack reads in the
+      :ref:`address processor <desc-ta>`.
+    unit: Instructions per normalization unit
+  Write Instructions:
+    rst: The number of store instructions submitted to the :ref:`data-return unit
+      <desc-td>` by the :ref:`address processor <desc-ta>` summed over all :doc:`compute
+      units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+      This is expected to be the sum of global/generic and spill/stack stores counted
+      by the :ref:`vL1D cache-front-end <ta-instruction-counts>`.
+    unit: Instructions per normalization unit
+  Atomic Instructions:
+    rst: The number of atomic instructions submitted to the :ref:`data-return unit
+      <desc-td>` by the :ref:`address processor <desc-ta>` summed over all :doc:`compute
+      units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+      This is expected to be the sum of global/generic and spill/stack atomics in
+      the :ref:`address processor <desc-ta>`.
+    unit: Instructions per normalization unit
+vL1D Speed-of-Light:
+  Hit rate:
+    rst: The ratio of the number of vL1D cache line requests that hit [#vl1d-hit]_
+      in vL1D cache over the total number of cache line requests to the :ref:`vL1D
+      Cache RAM <desc-tc>`.
+    unit: Percent
+  Bandwidth Utilization:
+    rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
+      <desc-vmem>` instructions, as a percent of the peak theoretical bandwidth achievable
+      on the specific accelerator. The number of bytes is calculated as the number
+      of cache lines requested multiplied by the cache line size. This value does
+      not consider partial requests, so for instance, if only a single value is requested
+      in a cache line, the data movement will still be counted as a full cache line.
+    unit: Percent
+  Utilization:
+    rst: Indicates how busy the :ref:`vL1D Cache RAM <desc-tc>` was during the kernel
+      execution. The number of cycles where the vL1D Cache RAM is actively processing
+      any request divided by the number of cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  Coalescing:
+    rst: Indicates how well memory instructions were coalesced by the :ref:`address
+      processing unit <desc-ta>`, ranging from uncoalesced (25%) to fully coalesced
+      (100%). Calculated as the average number of :ref:`thread-requests <thread-requests>`
+      generated per instruction divided by the ideal number of thread-requests per
+      instruction.
+    unit: Percent
+vL1D cache stall metrics:
+  Stalled on L2 Data:
+    rst: The ratio of the number of cycles where the vL1D is stalled waiting for requested
+      data to return from the :doc:`L2 cache <l2-cache>` divided by the number of
+      cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  Stalled on L2 Req:
+    rst: The ratio of the number of cycles where the vL1D is stalled waiting to issue
+      a request for data to the :doc:`L2 cache <l2-cache>` divided by the number of
+      cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  Tag RAM Stall (Read):
+    rst: The ratio of the number of cycles where the vL1D is stalled due to Read requests
+      with conflicting tags being looked up concurrently, divided by the number of
+      cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  Tag RAM Stall (Write):
+    rst: The ratio of the number of cycles where the vL1D is stalled due to Write
+      requests with conflicting tags being looked up concurrently, divided by the
+      number of cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  Tag RAM Stall (Atomic):
+    rst: The ratio of the number of cycles where the vL1D is stalled due to Atomic
+      requests with conflicting tags being looked up concurrently, divided by the
+      number of cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+vL1D cache access metrics:
+  Total Req:
+    rst: The total number of incoming requests from the :ref:`address processing unit
+      <desc-ta>` after coalescing.
+    unit: Requests
+  Read Req:
+    rst: The total number of incoming read requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Write Req:
+    rst: The total number of incoming write requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Atomic Req:
+    rst: The total number of incoming atomic requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Cache BW:
+    rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
+      <desc-vmem>` instructions divided by total duration. The number of bytes is
+      calculated as the number of cache lines requested multiplied by the cache line
+      size. This value does not consider partial requests, so for instance, if only
+      a single value is requested in a cache line, the data movement will still be
+      counted as a full cache line.
+    unit: Gbps
+  Cache Hit Rate:
+    rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache
+      over the total number of cache line requests to the :ref:`vL1D Cache RAM <desc-tc>`.
+    unit: Percent
+  Cache Accesses:
+    rst: The total number of cache line lookups in the vL1D.
+    unit: Cache lines
+  Cache Hits:
+    rst: The number of cache accesses minus the number of outgoing requests to the
+      :doc:`L2 cache <l2-cache>`, that is, the number of cache line requests serviced
+      by the :ref:`vL1D Cache RAM <desc-tc>` per :ref:`normalization unit <normalization-units>`.
+    unit: Cache lines per normalization unit
+  Invalidations:
+    rst: The number of times the vL1D was issued a write-back invalidate command during
+      the kernel's execution per :ref:`normalization unit <normalization-units>`.
+      This may be triggered by, for instance, the ``buffer_wbinvl1`` instruction.
+    unit: Invalidations per normalization unit
+  L1-L2 BW:
+    rst: The number of bytes transferred across the vL1D-L2 interface as a result
+      of :ref:`VMEM <desc-vmem>` instructions, divided by total duration. The number
+      of bytes is calculated as the number of cache lines requested multiplied by
+      the cache line size. This value does not consider partial requests, so for instance,
+      if only a single value is requested in a cache line, the data movement will
+      still be counted as a full cache line.
+    unit: Gbps
+  L1-L2 Read:
+    rst: The number of read requests for a vL1D cache line that were not satisfied
+      by the vL1D and must be retrieved from the :doc:`L2 Cache <l2-cache>`
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  L1-L2 Write:
+    rst: The number of write requests to a vL1D cache line that were sent through
+      the vL1D to the :doc:`L2 cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  L1-L2 Atomic:
+    rst: The number of atomic requests that are sent through the vL1D to the :doc:`L2
+      cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`. This
+      includes requests for atomics with, and without return.
+    unit: Requests per normalization unit
+  L1 Access Latency:
+    rst: Calculated as the average number of cycles that a vL1D cache line request
+      spent in the vL1D cache pipeline.
+    unit: Cycles
+  L1-L2 Read Latency:
+    rst: Calculated as the average number of cycles that the vL1D cache took to issue
+      and receive read requests from the :doc:`L2 Cache <l2-cache>`. This number also
+      includes requests for atomics with return values.
+    unit: Cycles
+  L1-L2 Write Latency:
+    rst: Calculated as the average number of cycles that the vL1D cache took to issue
+      and receive acknowledgement of a write request to the :doc:`L2 Cache <l2-cache>`.
+      This number also includes requests for atomics without return values.
+    unit: Cycles
+L1D - L2 Transactions:
+  NC - Read:
+    rst: Total read requests with NC mtype from this TCP to all TCCs. Sum over TCP
+      instances per normalization unit.
+    unit: Requests per normalization unit
+  UC - Read:
+    rst: Total read requests with UC mtype from this TCP to all TCCs. Sum over TCP
+      instances per normalization unit.
+    unit: Requests per normalization unit
+  CC - Read:
+    rst: Total read requests with CC mtype from this TCP to all TCCs. Sum over TCP
+      instances per normalization unit.
+    unit: Requests per normalization unit
+  RW - Read:
+    rst: Total read requests with RW mtype from this TCP to all TCCs. Sum over TCP
+      instances per normalization unit.
+    unit: Requests per normalization unit
+  RW - Write:
+    rst: Total write requests with RW mtype from this TCP to all TCCs. Sum over TCP
+      instances per normalization unit.
+    unit: Requests per normalization unit
+  NC - Write:
+    rst: Total write requests with NC mtype from this TCP to all TCCs. Sum over TCP
+      instances per normalization unit.
+    unit: Requests per normalization unit
+  UC - Write:
+    rst: Total write requests with UC mtype from this TCP to all TCCs. Sum over TCP
+      instances per normalization unit.
+    unit: Requests per normalization unit
+  CC - Write:
+    rst: Total write requests with CC mtype from this TCP to all TCCs. Sum over TCP
+      instances per normalization unit.
+    unit: Requests per normalization unit
+  NC - Atomic:
+    rst: Total atomic requests with NC mtype from this TCP to all TCCs. Sum over TCP
+      instances per normalization unit.
+    unit: Requests per normalization unit
+  UC - Atomic:
+    rst: Total atomic requests with UC mtype from this TCP to all TCCs. Sum over TCP
+      instances per normalization unit.
+    unit: Requests per normalization unit
+  CC - Atomic:
+    rst: Total atomic requests with CC mtype from this TCP to all TCCs. Sum over TCP
+      instances per normalization unit.
+    unit: Requests per normalization unit
+  RW - Atomic:
+    rst: Total atomic requests with RW mtype from this TCP to all TCCs. Sum over TCP
+      instances per normalization unit.
+    unit: Requests per normalization unit
+L1 Unified Translation Cache (UTCL1):
+  Req:
+    rst: The number of translation requests made to the UTCL1 per normalization unit.
+    unit: Requests per normalization unit
+  Hit Ratio:
+    rst: The ratio of the number of translation requests that hit in the UTCL1 divided
+      by the total number of translation requests made to the UTCL1.
+    unit: Percent
+  Hits:
+    rst: The number of translation requests that hit in the UTCL1, and could be reused,
+      per normalization unit.
+    unit: Requests per normalization unit
+  Translation Misses:
+    rst: The total number of translation requests that missed in the UTCL1 due to
+      translation not being present in the cache, per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Permission Misses:
+    rst: |-
+      The total number of translation requests that missed in the UTCL1 due
+      to a permission error, per :ref:`normalization unit <normalization-units>`.
+      This is unused and expected to be zero in most configurations for modern
+      CDNA™ accelerators.
+    unit: Requests per normalization unit
+L1D Addr Translation Stalls: {}
+L2 Speed-of-Light:
+  Utilization:
+    rst: The ratio of the :ref:`number of cycles an L2 channel was active, summed
+      over all L2 channels on the accelerator <total-active-l2-cycles>` over the :ref:`total
+      L2 cycles <total-l2-cycles>`.
+    unit: Percent
+  Peak Bandwidth:
+    rst: The number of bytes looked up in the L2 cache, as a percent of the peak theoretical
+      bandwidth achievable on the specific accelerator. The number of bytes is calculated
+      as the number of cache lines requested multiplied by the cache line size. This
+      value does not consider partial requests, so e.g., if only a single value is
+      requested in a cache line, the data movement will still be counted as a full
+      cache line.
+    unit: Percent
+  Hit Rate:
+    rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
+      over the total number of incoming cache line requests to the L2 cache.
+    unit: Percent
+  L2-Fabric Read BW:
+    rst: The number of bytes read by the L2 over the :ref:`Infinity Fabric interface
+      <l2-fabric>` per unit time.
+    unit: GB/s
+  L2-Fabric Write and Atomic BW:
+    rst: The number of bytes sent by the L2 over the :ref:`Infinity Fabric interface
+      <l2-fabric>` by write and atomic operations per unit time.
+    unit: GB/s
+  HBM Bandwidth:
+    rst: Maximum theoretical bandwidth of the accelerator's local high-bandwidth memory
+      (HBM) per unit time. This value is calculated as the number of HBM channels
+      multiplied by the HBM channel width multiplied by the HBM clock frequency.
+    unit: GB/s
+L2-Fabric interface metrics:
+  Read BW:
+    rst: The total number of bytes read by the L2 cache from Infinity Fabric divided
+      by total duration.
+    unit: Gbps
+  HBM Read Traffic:
+    rst: The percent of read requests generated by the L2 cache that are routed to
+      the accelerator's local high-bandwidth memory (HBM). This breakdown does not
+      consider the *size* of the request (meaning that 32B and 64B requests are both
+      counted as a single request), so this metric only *approximates* the percent
+      of the L2-Fabric Read bandwidth directed to the local HBM.
+    unit: Percent
+  Remote Read Traffic:
+    rst: The percent of read requests generated by the L2 cache that are routed to
+      any memory location other than the accelerator's local high-bandwidth memory
+      (HBM) -- for example, the CPU's DRAM or a remote accelerator's HBM. This breakdown
+      does not consider the *size* of the request (meaning that 32B and 64B requests
+      are both counted as a single request), so this metric only *approximates* the
+      percent of the L2-Fabric Read bandwidth directed to a remote location.
+    unit: Percent
+  Uncached Read Traffic:
+    rst: The percent of read requests generated by the L2 cache that are reading from
+      an :ref:`uncached memory allocation <memory-type>`. Note, as described in the
+      :ref:`request flow <l2-request-flow>` section, a single 64B read request is
+      typically counted as two uncached read requests. So, it is possible for the
+      Uncached Read Traffic to reach up to 200% of the total number of read requests.
+      This breakdown does not consider the *size* of the request (i.e., 32B and 64B
+      requests are both counted as a single request), so this metric only *approximates*
+      the percent of the L2-Fabric read bandwidth directed to an uncached memory location.
+    unit: Percent
+  Write and Atomic BW:
+    rst: The total number of bytes written by the L2 over Infinity Fabric by write
+      and atomic operations divided by total duration. Note that on current CDNA accelerators,
+      such as the :ref:`MI2XX <mixxx-note>`, requests are only considered *atomic*
+      by Infinity Fabric if they are targeted at non-write-cacheable memory, for example,
+      :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached memory
+      <memory-type>` allocations on the MI2XX.
+    unit: Gbps
+  HBM Write and Atomic Traffic:
+    rst: The percent of write and atomic requests generated by the L2 cache that are
+      routed to the accelerator's local high-bandwidth memory (HBM). This breakdown
+      does not consider the *size* of the request (meaning that 32B and 64B requests
+      are both counted as a single request), so this metric only *approximates* the
+      percent of the L2-Fabric Write and Atomic bandwidth directed to the local HBM.
+      Note that on current CDNA accelerators, such as the :ref:`MI2XX <mixxx-note>`,
+      requests are only considered *atomic* by Infinity Fabric if they are targeted
+      at :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached memory
+      <memory-type>` allocations.
+    unit: Percent
+  Remote Write and Atomic Traffic:
+    rst: The percent of write and atomic requests generated by the L2 cache that are routed to
+      any memory location other than the accelerator's local high-bandwidth memory
+      (HBM) -- for example, the CPU's DRAM or a remote accelerator's HBM. This breakdown
+      does not consider the *size* of the request (meaning that 32B and 64B requests
+      are both counted as a single request), so this metric only *approximates* the
+      percent of the L2-Fabric Write and Atomic bandwidth directed to a remote location. Note
+      that on current CDNA accelerators, such as the :ref:`MI2XX <mixxx-note>`, requests
+      are only considered *atomic* by Infinity Fabric if they are targeted at :ref:`fine-grained
+      memory <memory-type>` allocations or :ref:`uncached memory <memory-type>` allocations.
+    unit: Percent
+  Atomic Traffic:
+    rst: The percent of write requests generated by the L2 cache that are atomic requests
+      to *any* memory location. This breakdown does not consider the *size* of the
+      request (meaning that 32B and 64B requests are both counted as a single request),
+      so this metric only *approximates* the percent of the L2-Fabric Write and Atomic
+      bandwidth used by atomic operations. Note that on current CDNA accelerators, such
+      as the :ref:`MI2XX <mixxx-note>`, requests are only considered *atomic* by Infinity
+      Fabric if they are targeted at :ref:`fine-grained memory <memory-type>` allocations
+      or :ref:`uncached memory <memory-type>` allocations.
+    unit: Percent
+  Uncached Write and Atomic Traffic:
+    rst: The percent of write and atomic requests generated by the L2 cache that are
+      targeting :ref:`uncached memory allocations <memory-type>`. This breakdown does
+      not consider the *size* of the request (meaning that 32B and 64B requests are
+      both counted as a single request), so this metric only *approximates* the percent
+      of the L2-Fabric Write and Atomic bandwidth directed to uncached memory allocations.
+    unit: Percent
+  Read Latency:
+    rst: The time-averaged number of cycles read requests spent in Infinity Fabric
+      before data was returned to the L2.
+    unit: Cycles
+  Write and Atomic Latency:
+    rst: The time-averaged number of cycles write requests spent in Infinity Fabric
+      before a completion acknowledgement was returned to the L2.
+    unit: Cycles
+  Atomic Latency:
+    rst: The time-averaged number of cycles atomic requests spent in Infinity Fabric
+      before a completion acknowledgement (atomic without return value) or data (atomic
+      with return value) was returned to the L2.
+    unit: Cycles
+L2 Cache Accesses:
+  Bandwidth:
+    rst: The number of bytes looked up in the L2 cache, divided by total duration.
+      The number of bytes is calculated as the number of cache lines requested multiplied
+      by the cache line size. This value does not consider partial requests, so for
+      example, if only a single value is requested in a cache line, the data movement
+      will still be counted as a full cache line.
+    unit: Gbps
+  Req:
+    rst: The total number of incoming requests to the L2 from all clients for all
+      request types, per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Read Req:
+    rst: The total number of read requests to the L2 from all clients.
+    unit: Requests per normalization unit
+  Write Req:
+    rst: The total number of write requests to the L2 from all clients.
+    unit: Requests per normalization unit
+  Atomic Req:
+    rst: The total number of atomic requests (with and without return) to the L2 from
+      all clients.
+    unit: Requests per normalization unit
+  Streaming Req:
+    rst: The total number of incoming requests to the L2 that are marked as *streaming*.
+      The exact meaning of this may differ depending on the targeted accelerator,
+      however on an :ref:`MI2XX <mixxx-note>` this corresponds to `non-temporal load
+      or stores <https://clang.llvm.org/docs/LanguageExtensions.html#non-temporal-load-store-builtins>`_.
+      The L2 cache attempts to evict *streaming* requests before normal requests when
+      the L2 is at capacity.
+    unit: Requests per normalization unit
+  Probe Req:
+    rst: The number of coherence probe requests made to the L2 cache from outside
+      the accelerator. On an :ref:`MI2XX <mixxx-note>`, probe requests may be generated
+      by, for example, writes to :ref:`fine-grained device <memory-type>` memory or
+      by writes to :ref:`coarse-grained <memory-type>` device memory.
+    unit: Requests per normalization unit
+  Cache Hit:
+    rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
+      over the total number of incoming cache line requests to the L2 cache.
+    unit: Percent
+  Hits:
+    rst: The total number of requests to the L2 from all clients that hit in the cache.
+      As noted in the :ref:`Speed-of-Light <l2-sol>` section, this includes hit-on-miss
+      requests.
+    unit: Requests per normalization unit
+  Misses:
+    rst: The total number of requests to the L2 from all clients that miss in the
+      cache. As noted in the :ref:`Speed-of-Light <l2-sol>` section, these do not
+      include hit-on-miss requests.
+    unit: Requests per normalization unit
+  Writeback:
+    rst: The total number of L2 cache lines written back to memory for any reason.
+      Write-backs may occur due to user code (such as HIP kernel calls to ``__threadfence_system``
+      or atomic built-ins), by the :doc:`command processor <command-processor>`'s memory
+      acquire/release fences, or for other internal hardware reasons.
+    unit: Cache lines per normalization unit
+  Writeback (Internal):
+    rst: The total number of L2 cache lines written back to memory for internal hardware
+      reasons, per :ref:`normalization unit <normalization-units>`.
+    unit: Cache lines per normalization unit
+  Writeback (vL1D Req):
+    rst: The total number of L2 cache lines written back to memory due to requests
+      initiated by the :doc:`vL1D cache <vector-l1-cache>`, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Cache lines per normalization unit
+  Evict (Internal):
+    rst: The total number of L2 cache lines evicted from the cache due to capacity
+      limits, per :ref:`normalization unit <normalization-units>`.
+    unit: Cache lines per normalization unit
+  Evict (vL1D Req):
+    rst: The total number of L2 cache lines evicted from the cache due to invalidation
+      requests initiated by the :doc:`vL1D cache <vector-l1-cache>`, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Cache lines per normalization unit
+  NC Req:
+    rst: The total number of requests to the L2 to Not-hardware-Coherent (NC) memory
+      allocations, per :ref:`normalization unit <normalization-units>`. See the :ref:`memory-type`
+      for more information.
+    unit: Requests per normalization unit
+  UC Req:
+    rst: The total number of requests to the L2 that go to Uncached (UC) memory allocations.
+      See the :ref:`memory-type` for more information.
+    unit: Requests per normalization unit
+  CC Req:
+    rst: The total number of requests to the L2 that go to Coherently Cacheable (CC)
+      memory allocations. See the :ref:`memory-type` for more information.
+    unit: Requests per normalization unit
+  RW Req:
+    rst: The total number of requests to the L2 that go to Read-Write coherent memory
+      (RW) allocations. See the :ref:`memory-type` for more information.
+    unit: Requests per normalization unit
+L2 Cache Stalls: {}
+L2 - Fabric Interface stalls:
+  Write - Credit Starvation:
+    rst: The number of cycles the L2-Fabric interface was stalled on write or atomic
+      requests to any memory location because too many write/atomic requests were
+      currently in flight, as a percent of the :ref:`total active L2 cycles <total-active-l2-cycles>`.
+    unit: Percent
+L2 - Fabric interface detailed metrics:
+  Read (32B):
+    rst: The total number of L2 requests to Infinity Fabric to read 32B of data from
+      any memory location, per :ref:`normalization unit <normalization-units>`. See
+      :ref:`l2-request-flow` for more detail. Typically unused on CDNA accelerators.
+    unit: Requests per normalization unit
+  Read (64B):
+    rst: The total number of L2 requests to Infinity Fabric to read 64B of data from
+      any memory location, per :ref:`normalization unit <normalization-units>`. See
+      :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Read (Uncached):
+    rst: The total number of L2 requests to Infinity Fabric to read :ref:`uncached
+      data <memory-type>` from any memory location, per :ref:`normalization unit <normalization-units>`.
+      64B requests for uncached data are counted as two 32B uncached data requests.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  HBM Read:
+    rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of
+      data from the accelerator's local HBM, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Remote Read:
+    rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of
+      data from any source other than the accelerator's local HBM, per :ref:`normalization
+      unit <normalization-units>`. See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Write and Atomic (32B):
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically
+      update 32B of data to any memory location, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Write and Atomic (Uncached):
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically
+      update 32B or 64B of :ref:`uncached data <memory-type>`, per :ref:`normalization
+      unit <normalization-units>`. See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Write and Atomic (64B):
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically
+      update 64B of data in any memory location, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  HBM Write and Atomic:
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically
+      update 32B or 64B of data in the accelerator's local HBM, per :ref:`normalization
+      unit <normalization-units>`. See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Remote Write and Atomic:
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically
+      update 32B or 64B of data in any memory location other than the accelerator's
+      local HBM, per :ref:`normalization unit <normalization-units>`. See :ref:`l2-request-flow`
+      for more detail.
+    unit: Requests per normalization unit
+  Atomic:
+    rst: The total number of L2 requests to Infinity Fabric to atomically update 32B
+      or 64B of data in any memory location, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail. Note that on current CDNA accelerators,
+      such as the :ref:`MI2XX <mixxx-note>`, requests are only considered *atomic*
+      by Infinity Fabric if they are targeted at non-write-cacheable memory, such
+      as :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached memory
+      <memory-type>` allocations on the MI2XX.
+    unit: Requests per normalization unit
+Aggregate Stats (All channels):
+  L2 Cache Hit Rate:
+    rst: The percent of requests to the L2 from all clients that hit in the cache.
+      As noted in the :ref:`Speed-of-Light <l2-sol>` section, this includes hit-on-miss
+      requests.
+    unit: Percent
+L2 Cache Hit Rate (pct):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2 Requests (per normUnit):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Requests (per normUnit):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Read Latency (Cycles):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Write and Atomic Latency (Cycles):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Atomic Latency (Cycles):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Read Stall (Cycles per normUnit):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Write and Atomic Stall (Cycles per normUnit):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric (128B read requests per normUnit):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
diff --git a/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx940_metrics_description.yaml b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx940_metrics_description.yaml
new file mode 100644
index 0000000000..5a978ccee5
--- /dev/null
+++ b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx940_metrics_description.yaml
@@ -0,0 +1,2040 @@
+System Speed-of-Light:
+  VALU FLOPs:
+    rst: |-
+      The total floating-point operations executed per second on the :ref:`VALU
+      <desc-valu>`. This is also presented as a percent of the peak theoretical
+      FLOPs achievable on the specific accelerator. Note: this does not include
+      any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GFLOPs
+  VALU IOPs:
+    rst: |-
+      The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
+      This is also presented as a percent of the peak theoretical IOPs achievable
+      on the specific accelerator. Note: this does not include any integer operations
+      from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GIOPs
+  MFMA FLOPs (F8):
+    rst: |-
+      The total number of 8-bit floating point :ref:`MFMA <desc-mfma>`
+      operations executed per second. Note: this does not include any 8-bit
+      floating point operations from :ref:`VALU <desc-valu>` instructions. This
+      is also presented as a percent of the peak theoretical F8 MFMA operations
+      achievable on the specific accelerator. It is supported on AMD Instinct MI300
+      series and later only.
+    unit: GFLOPs
+  MFMA FLOPs (BF16):
+    rst: |-
+      The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
+      operations executed per second. Note: this does not include any 16-bit brain
+      floating point operations from :ref:`VALU <desc-valu>` instructions. This
+      is also presented as a percent of the peak theoretical BF16 MFMA operations
+      achievable on the specific accelerator.
+    unit: GFLOPs
+  MFMA FLOPs (F16):
+    rst: |-
+      The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 16-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F16 MFMA operations achievable on the
+      specific accelerator.
+    unit: GFLOPs
+  MFMA FLOPs (F32):
+    rst: |-
+      The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 32-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F32 MFMA operations achievable on the
+      specific accelerator.
+    unit: GFLOPs
+  MFMA FLOPs (F64):
+    rst: |-
+      The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 64-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F64 MFMA operations achievable on the
+      specific accelerator.
+    unit: GFLOPs
+  MFMA IOPs (Int8):
+    rst: |-
+      The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
+      per second. Note: this does not include any 8-bit integer operations from
+      :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
+      of the peak theoretical INT8 MFMA operations achievable on the specific accelerator.
+    unit: GIOPs
+  Active CUs:
+    rst: Total number of active compute units (CUs) on the accelerator during the
+      kernel execution.
+    unit: Number
+  SALU Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`SALU <desc-salu>`
+      was busy executing instructions. Computed as the ratio of the total number of
+      cycles spent by the :ref:`scheduler <desc-scheduler>` issuing SALU / :ref:`SMEM
+      <desc-smem>` instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VALU Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`VALU <desc-valu>`
+      was busy executing instructions. Does not include :ref:`VMEM <desc-vmem>` operations.
+      Computed as the ratio of the total number of cycles spent by the :ref:`scheduler
+      <desc-scheduler>` issuing VALU instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  MFMA Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`MFMA <desc-mfma>`
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles the :ref:`MFMA <desc-mfma>` unit was busy over the :ref:`total
+      CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VMEM Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`VMEM <desc-vmem>`
+      unit was busy executing instructions, including both global/generic and spill/scratch
+      operations (see the :ref:`VMEM instruction count metrics <ta-instruction-counts>`
+      for more detail). Does not include :ref:`VALU <desc-valu>` operations. Computed
+      as the ratio of the total number of cycles spent by the :ref:`scheduler <desc-scheduler>`
+      issuing VMEM instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  Branch Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`branch <desc-branch>`
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles spent by the :ref:`scheduler <desc-scheduler>` issuing branch instructions
+      over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VALU Active Threads:
+    rst: Indicates the average level of :ref:`divergence <desc-divergence>` within
+      a wavefront over the lifetime of the kernel. The number of work-items that were
+      active in a wavefront during execution of each :ref:`VALU <desc-valu>` instruction,
+      time-averaged over all VALU instructions run on all wavefronts in the kernel.
+    unit: Work-items
+  IPC:
+    rst: The ratio of the total number of instructions executed on the :doc:`CU <compute-unit>`
+      over the :ref:`total active CU cycles <total-active-cu-cycles>`.
+    unit: Instructions per-cycle
+  Wavefront Occupancy:
+    rst: |-
+      The time-averaged number of wavefronts resident on the accelerator over
+      the lifetime of the kernel. Note: this metric may be inaccurate for short-running
+      kernels (less than 1ms). This is also presented as a percent of the peak theoretical
+      occupancy achievable on the specific accelerator.
+    unit: Wavefronts
+  Theoretical LDS Bandwidth:
+    rst: Indicates the maximum amount of bytes that could have been loaded from, stored
+      to, or atomically updated in the LDS per unit time (see :ref:`LDS Bandwidth
+      <lds-bandwidth>` example for more detail). This is also presented as a percent
+      of the peak theoretical bandwidth achievable on the specific accelerator.
+    unit: GB/s
+  LDS Bank Conflicts/Access:
+    rst: The ratio of the number of cycles spent in the :doc:`LDS scheduler <local-data-share>`
+      due to bank conflicts (as determined by the conflict resolution hardware) to
+      the base number of cycles that would be spent in the LDS scheduler in a completely
+      uncontended case. This is also presented in normalized form (i.e., the Bank
+      Conflict Rate).
+    unit: Conflicts/Access
+  vL1D Cache Hit Rate:
+    rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache
+      over the total number of cache line requests to the :ref:`vL1D cache RAM <desc-tc>`.
+    unit: Percent
+  vL1D Cache BW:
+    rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
+      <desc-vmem>` instructions per unit time. The number of bytes is calculated as
+      the number of cache lines requested multiplied by the cache line size. This
+      value does not consider partial requests, so e.g., if only a single value is
+      requested in a cache line, the data movement will still be counted as a full
+      cache line. This is also presented as a percent of the peak theoretical bandwidth
+      achievable on the specific accelerator.
+    unit: GB/s
+  L2 Cache Hit Rate:
+    rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
+      over the total number of incoming cache line requests to the L2 cache.
+    unit: Percent
+  L2 Cache BW:
+    rst: The number of bytes looked up in the L2 cache per unit time. The number of
+      bytes is calculated as the number of cache lines requested multiplied by the
+      cache line size. This value does not consider partial requests, so e.g., if
+      only a single value is requested in a cache line, the data movement will still
+      be counted as a full cache line. This is also presented as a percent of the
+      peak theoretical bandwidth achievable on the specific accelerator.
+    unit: GB/s
+  L2-Fabric Read BW:
+    rst: |-
+      The number of bytes read by the L2 over the :ref:`Infinity Fabric™
+      interface <l2-fabric>` per unit time. This is also presented as a percent
+      of the peak theoretical bandwidth achievable on the specific accelerator.
+    unit: GB/s
+  L2-Fabric Write BW:
+    rst: The number of bytes sent by the L2 over the :ref:`Infinity Fabric interface
+      <l2-fabric>` by write and atomic operations per unit time. This is also presented
+      as a percent of the peak theoretical bandwidth achievable on the specific accelerator.
+    unit: GB/s
+  L2-Fabric Read Latency:
+    rst: The time-averaged number of cycles read requests spent in Infinity Fabric
+      before data was returned to the L2.
+    unit: Cycles
+  L2-Fabric Write Latency:
+    rst: The time-averaged number of cycles write requests spent in Infinity Fabric
+      before a completion acknowledgement was returned to the L2.
+    unit: Cycles
+  sL1D Cache Hit Rate:
+    rst: The percent of sL1D requests that hit on a previously loaded line in the cache.
+      Calculated as the ratio of the number of sL1D requests that hit over the number
+      of all sL1D requests.
+    unit: Percent
+  sL1D Cache BW:
+    rst: The number of bytes looked up in the sL1D cache per unit time. This is also
+      presented as a percent of the peak theoretical bandwidth achievable on the specific
+      accelerator.
+    unit: GB/s
+  L1I Hit Rate:
+    rst: The percent of L1I requests that hit on a previously loaded line in the cache.
+      Calculated as the ratio of the number of L1I requests that hit over the number
+      of all L1I requests.
+    unit: Percent
+  L1I BW:
+    rst: The number of bytes looked up in the L1I cache per unit time. This is also
+      presented as a percent of the peak theoretical bandwidth achievable on the specific
+      accelerator.
+    unit: GB/s
+  L1I Fetch Latency:
+    rst: The average number of cycles spent to fetch instructions to a :doc:`CU <compute-unit>`.
+    unit: Cycles
+Memory Chart:
+  Wavefront Occupancy:
+    rst: Wavefronts per active CU.
+    unit: Wavefronts
+  Wave Life:
+    rst: Average number of cycles executing a wave.
+    unit: Cycles per wave
+  SALU:
+    rst: Total Number of SALU (Scalar ALU) instructions issued per normalization unit.
+    unit: Instructions per normalization unit
+  SMEM:
+    rst: Total number of SMEM (Scalar Memory Read) instructions issued per normalization
+      unit.
+    unit: Instructions per normalization unit
+  VALU:
+    rst: The number of VALU (Vector ALU) instructions issued per normalization unit.
+    unit: Instructions per normalization unit
+  MFMA:
+    rst: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued per
+      normalization unit.
+    unit: Instructions per normalization unit
+  VMEM:
+    rst: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch
+      memory) per normalization unit.
+    unit: Instructions per normalization unit
+  LDS:
+    rst: The total number of LDS instructions (including, but not limited to, read/write/atomics
+      and HIP's __shfl instructions) executed per normalization unit.
+    unit: Instructions per normalization unit
+  GWS:
+    rst: Total number of GDS (global data sync) instructions issued per normalization
+      unit.
+    unit: Instructions per normalization unit
+  BR:
+    rst: Total number of BRANCH instructions issued per normalization unit.
+    unit: Instructions per normalization unit
+  Active CUs:
+    rst: Total number of active compute units (CUs) on the accelerator during the
+      kernel execution.
+    unit: CUs
+  Num CUs:
+    rst: Total number of compute units (CUs) on the accelerator.
+    unit: CUs
+  VGPR:
+    rst: |-
+      The number of architected vector general-purpose registers allocated for the
+      kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the
+      number of VGPRs requested by the compiler due to allocation granularity.
+    unit: VGPRs
+  SGPR:
+    rst: |-
+      The number of scalar general-purpose registers allocated for the kernel, see
+      :ref:`SALU <desc-salu>`. Note: this may not exactly match the number of
+      SGPRs requested by the compiler due to allocation granularity.
+    unit: SGPRs
+  LDS Allocation:
+    rst: |-
+      The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory)
+      allocated for this kernel. Note: This may also be larger than what was requested
+      at compile time due to both allocation granularity and dynamic per-dispatch
+      LDS allocations.
+    unit: Bytes per workgroup
+  Scratch Allocation:
+    rst: The number of bytes of :ref:`scratch memory <memory-spaces>` requested per
+      work-item for this kernel. Scratch memory is used for stack memory on the accelerator,
+      as well as for register spills and restores.
+    unit: Bytes per work-item
+  Wavefronts:
+    rst: The total number of wavefronts, summed over all workgroups, forming this
+      kernel launch.
+    unit: Wavefronts
+  Workgroups:
+    rst: The total number of workgroups forming this kernel launch.
+    unit: Workgroups
+  LDS Req:
+    rst: The total number of LDS instructions (including, but not limited to, read/write/atomics
+      and HIP's ``__shfl`` instructions) executed per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  LDS Util:
+    rst: Indicates what percent of the kernel's duration the :ref:`LDS <desc-lds>`
+      was actively executing instructions (including, but not limited to, load, store,
+      atomic and HIP's ``__shfl`` operations). Calculated as the ratio of the total
+      number of cycles LDS was active over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  LDS Latency:
+    rst: The average number of round-trip cycles (i.e., from issue to data-return
+      / acknowledgment) required for an LDS instruction to complete.
+    unit: Cycles
+  VL1 Rd:
+    rst: The total number of incoming read requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`
+    unit: Requests per normalization unit
+  VL1 Wr:
+    rst: The total number of incoming write requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`
+    unit: Requests per normalization unit
+  VL1 Atomic:
+    rst: The total number of incoming atomic requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`
+    unit: Requests per normalization unit
+  VL1 Hit:
+    rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache
+      over the total number of cache line requests to the :ref:`vL1D Cache RAM <desc-tc>`.
+    unit: Percent
+  VL1 Lat:
+    rst: Calculated as the average number of cycles that a vL1D cache line request
+      spent in the vL1D cache pipeline.
+    unit: Cycles
+  VL1 Coalesce:
+    rst: Indicates how well memory instructions were coalesced by the :ref:`address
+      processing unit <desc-ta>`, ranging from uncoalesced (25%) to fully coalesced
+      (100%). Calculated as the average number of :ref:`thread-requests <thread-requests>`
+      generated per instruction divided by the ideal number of thread-requests per
+      instruction.
+    unit: Percent
+  VL1 Stall:
+    rst: The ratio of the number of cycles where the vL1D is stalled waiting to issue
+      a request for data to the :doc:`L2 cache <l2-cache>` divided by the number of
+      cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  VL1_L2 Rd:
+    rst: The number of read requests for a vL1D cache line that were not satisfied
+      by the vL1D and must be retrieved from the :doc:`L2 Cache <l2-cache>`
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  VL1_L2 Wr:
+    rst: The number of write requests to a vL1D cache line that were sent through
+      the vL1D to the :doc:`L2 cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  VL1_L2 Atomic:
+    rst: The number of atomic requests that are sent through the vL1D to the :doc:`L2
+      cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`. This
+      includes requests for atomics with, and without return.
+    unit: Requests per normalization unit
+  sL1D Rd:
+    rst: The total number of requests, of any size or type, made to the sL1D per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Requests per normalization unit
+  sL1D Hit:
+    rst: The total number of sL1D requests that hit on a previously loaded cache line,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  sL1D Lat:
+    rst: ''
+    unit: Unknown
+  sL1D_L2 Rd:
+    rst: The total number of read requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  sL1D_L2 Wr:
+    rst: The total number of write requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`. Typically unused on current
+      CDNA accelerators.
+    unit: Requests per normalization unit
+  sL1D_L2 Atomic:
+    rst: The total number of atomic requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`. Typically unused on current
+      CDNA accelerators.
+    unit: Requests per normalization unit
+  IL1 Fetch:
+    rst: The total number of requests made to the L1I per :ref:`normalization-unit
+      <normalization-units>`.
+    unit: Requests per normalization unit
+  IL1 Hit:
+    rst: The percent of L1I requests that hit on a previously loaded cache line.
+      Calculated as the ratio of L1I requests that hit over all L1I requests.
+    unit: Percent
+  IL1 Lat:
+    rst: The average number of cycles spent to fetch instructions to a :doc:`CU <compute-unit>`.
+    unit: Cycles
+  IL1_L2 Rd:
+    rst: The total number of requests across the L1I - L2 interface per normalization-unit.
+    unit: Requests per normalization unit
+  L2 Rd:
+    rst: The total number of read requests to the L2 from all clients.
+    unit: Requests per normalization unit
+  L2 Wr:
+    rst: The total number of write requests to the L2 from all clients.
+    unit: Requests per normalization unit
+  L2 Atomic:
+    rst: The total number of atomic requests (with and without return) to the L2 from
+      all clients.
+    unit: Requests per normalization unit
+  L2 Hit:
+    rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
+      over the total number of incoming cache line requests to the L2 cache.
+    unit: Percent
+  Fabric_L2 Rd:
+    rst: Number of L2 cache - Infinity Fabric read requests (either 32-byte or 64-byte)
+      summed over TCC instances per normalization unit.
+    unit: Requests per normalization unit
+  Fabric_L2 Wr:
+    rst: Number of L2 cache - Infinity Fabric write requests (either 32-byte or 64-byte)
+      summed over TCC instances per normalization unit.
+    unit: Requests per normalization unit
+  Fabric_L2 Atomic:
+    rst: Number of L2 cache - Infinity Fabric write requests (either 32-byte or 64-byte)
+      that are actually atomic requests summed over TCC instances per normalization
+      unit.
+    unit: Requests per normalization unit
+  Fabric Rd Lat:
+    rst: The time-averaged number of cycles read requests spent in Infinity Fabric
+      before data was returned to the L2.
+    unit: Cycles
+  Fabric Wr Lat:
+    rst: The time-averaged number of cycles write requests spent in Infinity Fabric
+      before a completion acknowledgement was returned to the L2.
+    unit: Cycles
+  Fabric Atomic Lat:
+    rst: The time-averaged number of cycles atomic requests spent in Infinity Fabric
+      before a completion acknowledgement (atomic without return value) or data (atomic
+      with return value) was returned to the L2.
+    unit: Cycles
+  HBM Rd:
+    rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of
+      data from the accelerator's local HBM, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  HBM Wr:
+    rst: The total number of L2 requests to Infinity Fabric to write 32B or 64B of
+      data to the accelerator's local HBM, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+Roofline Performance Rates:
+  VALU FLOPs (F16):
+    rst: |-
+      The total 16-bit floating-point operations executed per second on the :ref:`VALU
+      <desc-valu>`. This is presented with the value of the peak empirical F16 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F16 operations
+      from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GFLOPs
+  VALU FLOPs (F32):
+    rst: |-
+      The total 32-bit floating-point operations executed per second on the :ref:`VALU
+      <desc-valu>`. This is presented with the value of the peak empirical F32 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F32 operations
+      from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GFLOPs
+  VALU FLOPs (F64):
+    rst: |-
+      The total 64-bit floating-point operations executed per second on the :ref:`VALU
+      <desc-valu>`. This is presented with the value of the peak empirical F64 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F64 operations
+      from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GFLOPs
+  MFMA FLOPs (F64):
+    rst: |-
+      The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 64-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
+      measured F64 MFMA operations achievable on the specific accelerator is
+      displayed alongside for comparison.
+    unit: GFLOPs
+  MFMA FLOPs (F32):
+    rst: |-
+      The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 32-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
+      measured F32 MFMA operations achievable on the specific accelerator is
+      displayed alongside for comparison.
+    unit: GFLOPs
+  MFMA FLOPs (F16):
+    rst: |-
+      The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 16-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
+      measured F16 MFMA operations achievable on the specific accelerator is
+      displayed alongside for comparison.
+    unit: GFLOPs
+  MFMA FLOPs (BF16):
+    rst: |-
+      The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
+      operations executed per second. Note: this does not include any 16-bit brain
+      floating point operations from :ref:`VALU <desc-valu>` instructions. The
+      peak empirically measured BF16 MFMA operations achievable on the specific
+      accelerator is displayed alongside for comparison.
+    unit: GFLOPs
+  MFMA FLOPs (F8):
+    rst: |-
+      The total number of 8-bit floating point :ref:`MFMA <desc-mfma>`
+      operations executed per second. Note: this does not include any 8-bit
+      floating point operations from :ref:`VALU <desc-valu>` instructions. The
+      peak empirically measured F8 MFMA operations achievable on the specific
+      accelerator is displayed alongside for comparison. It is supported on AMD
+      Instinct MI300 series and later only.
+    unit: GFLOPs
+  MFMA IOPs (Int8):
+    rst: |-
+      The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
+      per second. Note: this does not include any 8-bit integer operations from
+      :ref:`VALU <desc-valu>` instructions. The peak empirically measured INT8 MFMA
+      operations achievable on the specific accelerator is displayed alongside
+      for comparison.
+    unit: GIOPs
+  HBM Bandwidth:
+    rst: |-
+      The total number of bytes read from and written to High-Bandwidth
+      Memory (HBM) per second. The peak empirically measured bandwidth achievable
+      on the specific accelerator is displayed alongside for comparison.
+    unit: GB/s
+  L2 Cache Bandwidth:
+    rst: The number of bytes looked up in the L2 cache per unit time. The number of
+      bytes is calculated as the number of cache lines requested multiplied by the
+      cache line size. This value does not consider partial requests, so e.g., if
+      only a single value is requested in a cache line, the data movement will still
+      be counted as a full cache line. The peak empirically measured bandwidth achievable
+      on the specific accelerator is displayed alongside for comparison.
+    unit: GB/s
+  L1 Cache Bandwidth:
+    rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
+      <desc-vmem>` instructions per unit time. The number of bytes is calculated as
+      the number of cache lines requested multiplied by the cache line size. This
+      value does not consider partial requests, so e.g., if only a single value is
+      requested in a cache line, the data movement will still be counted as a full
+      cache line. The peak empirically measured bandwidth achievable on the specific
+      accelerator is displayed alongside for comparison.
+    unit: GB/s
+  LDS Bandwidth:
+    rst: Indicates the maximum amount of bytes that could have been loaded from, stored
+      to, or atomically updated in the LDS per unit time (see :ref:`LDS Bandwidth
+      <lds-bandwidth>` example for more detail). The peak empirically measured LDS
+      bandwidth achievable on the specific accelerator is displayed alongside for
+      comparison.
+    unit: GB/s
+Roofline Plot Points:
+  AI HBM:
+    rst: |-
+      The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
+      It is the ratio of total floating-point operations (FLOPs) to total bytes
+      transferred between HBM and the L2 cache. This value is used as the x-coordinate
+      for the HBM roofline.
+    unit: FLOPs/Byte
+  AI L2:
+    rst: |-
+      The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
+      of total floating-point operations (FLOPs) to total bytes transferred between
+      the L2 cache and the L1 cache. This value is used as the x-coordinate for
+      the L2 roofline.
+    unit: FLOPs/Byte
+  AI L1:
+    rst: |-
+      The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
+      of total floating-point operations (FLOPs) to total bytes transferred between
+      the L1 cache and the processing units. This value is used as the x-coordinate
+      for the L1 roofline.
+    unit: FLOPs/Byte
+  Performance (GFLOPs):
+    rst: |-
+      The overall achieved performance, measured in GigaFLOPs
+      per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
+      operations divided by the total execution time. This value is used as the y-coordinate
+      for the kernel's point on the Roofline plot.
+    unit: GFLOP/s
+Command processor fetcher (CPF):
+  CPF Utilization:
+    rst: Percent of total cycles where the CPF was busy actively doing any work. The
+      ratio of CPF busy cycles over total cycles counted by the CPF.
+    unit: Percent
+  CPF Stall:
+    rst: Percent of CPF busy cycles where the CPF was stalled for any reason.
+    unit: Percent
+  CPF-L2 Utilization:
+    rst: Percent of total cycles counted by the CPF-:doc:`L2 <l2-cache>` interface
+      where the CPF-L2 interface was active doing any work. The ratio of CPF-L2 busy
+      cycles over total cycles counted by the CPF-L2.
+    unit: Percent
+  CPF-L2 Stall:
+    rst: Percent of CPF-:doc:`L2 <l2-cache>` busy cycles where the CPF-L2 interface
+      was stalled for any reason.
+    unit: Percent
+  CPF-UTCL1 Stall:
+    rst: Percent of CPF busy cycles where the CPF was stalled by address translation.
+    unit: Percent
+Command processor packet processor (CPC):
+  CPC Utilization:
+    rst: Percent of total cycles where the CPC was busy actively doing any work. The
+      ratio of CPC busy cycles over total cycles counted by the CPC.
+    unit: Percent
+  CPC Stall Rate:
+    rst: Percent of CPC busy cycles where the CPC was stalled for any reason.
+    unit: Percent
+  CPC Packet Decoding Utilization:
+    rst: Percent of CPC busy cycles spent decoding commands for processing.
+    unit: Percent
+  CPC-Workgroup Manager Utilization:
+    rst: Percent of CPC busy cycles spent dispatching workgroups to the :ref:`workgroup
+      manager <desc-spi>`.
+    unit: Percent
+  CPC-L2 Utilization:
+    rst: Percent of total cycles counted by the CPC-:doc:`L2 <l2-cache>` interface
+      where the CPC-L2 interface was active doing any work.
+    unit: Percent
+  CPC-UTCL1 Stall:
+    rst: Percent of CPC busy cycles where the CPC was stalled by address translation.
+    unit: Percent
+  CPC-UTCL2 Utilization:
+    rst: Percent of total cycles counted by the CPC's :doc:`L2 <l2-cache>` address
+      translation interface where the CPC was busy doing address translation work.
+    unit: Percent
+Workgroup manager utilizations:
+  Accelerator Utilization:
+    rst: The percent of cycles in the kernel where the accelerator was actively doing
+      any work.
+    unit: Percent
+  Scheduler-Pipe Utilization:
+    rst: |-
+      The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
+      in the kernel where the scheduler-pipes were actively doing any work. Note: this
+      value is expected to range between 0% and 25%. See :ref:`desc-spi`.
+    unit: Percent
+  Workgroup Manager Utilization:
+    rst: The percent of cycles in the kernel where the workgroup manager was actively
+      doing any work.
+    unit: Percent
+  Shader Engine Utilization:
+    rst: The percent of :ref:`total shader engine cycles <total-se-cycles>` in the
+      kernel where any CU in a shader-engine was actively doing any work, normalized
+      over all shader-engines. Low values (e.g., << 100%) indicate that the accelerator
+      was not fully saturated by the kernel, or a potential load-imbalance issue.
+    unit: Percent
+  SIMD Utilization:
+    rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel
+      where any :ref:`SIMD <desc-valu>` on a CU was actively doing any work, summed
+      over all CUs. Low values (less than 100%) indicate that the accelerator was
+      not fully saturated by the kernel, or a potential load-imbalance issue.
+    unit: Percent
+  Dispatched Workgroups:
+    rst: The total number of workgroups forming this kernel launch.
+    unit: Workgroups
+  Dispatched Wavefronts:
+    rst: The total number of wavefronts, summed over all workgroups, forming this
+      kernel launch.
+    unit: Wavefronts
+  VGPR Writes:
+    rst: The average number of cycles spent initializing :ref:`VGPRs <desc-valu>`
+      at wave creation.
+    unit: Cycles/wave
+  SGPR Writes:
+    rst: The average number of cycles spent initializing :ref:`SGPRs <desc-salu>`
+      at wave creation.
+    unit: Cycles/wave
+Workgroup Manager - Resource Allocation:
+  Not-scheduled Rate (Workgroup Manager):
+    rst: |-
+      The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
+      in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
+      due to a bottleneck within the workgroup manager rather than a lack of a
+      CU or :ref:`SIMD <desc-valu>` with sufficient resources. Note: this value
+      is expected to range between 0-25%. See note in :ref:`workgroup manager <desc-spi>`
+      description.
+    unit: Percent
+  Not-scheduled Rate (Scheduler-Pipe):
+    rst: |-
+      The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
+      in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
+      due to a bottleneck within the scheduler-pipes rather than a lack of a CU
+      or :ref:`SIMD <desc-valu>` with sufficient resources. Note: this value is
+      expected to range between 0-25%, see note in :ref:`workgroup manager <desc-spi>`
+      description.
+    unit: Percent
+  Scheduler-Pipe Stall Rate:
+    rst: |-
+      The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
+      in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
+      due to occupancy limitations (like a lack of a CU or :ref:`SIMD <desc-valu>`
+      with sufficient resources). Note: this value is expected to range between
+      0-25%, see note in :ref:`workgroup manager <desc-spi>` description.
+    unit: Percent
+  Scratch Stall Rate:
+    rst: The percent of :ref:`total shader-engine cycles <total-se-cycles>` in the
+      kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
+      due to lack of :ref:`private (a.k.a., scratch) memory <memory-type>` slots.
+      While this can reach up to 100%, note that the actual occupancy limitations
+      on a kernel using private memory are typically quite small (for example, less
+      than 1% of the total number of waves that can be scheduled to an accelerator).
+    unit: Percent
+  Insufficient SIMD Waveslots:
+    rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel
+      where a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to
+      lack of available :ref:`waveslots <desc-valu>`.
+    unit: Percent
+  Insufficient SIMD VGPRs:
+    rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel
+      where a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to
+      lack of available :ref:`VGPRs <desc-valu>`.
+    unit: Percent
+  Insufficient SIMD SGPRs:
+    rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel
+      where a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to
+      lack of available :ref:`SGPRs <desc-salu>`.
+    unit: Percent
+  Insufficient CU LDS:
+    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
+      a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to lack
+      of available :doc:`LDS <local-data-share>`.
+    unit: Percent
+  Insufficient CU Barriers:
+    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
+      a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to lack
+      of available :ref:`barriers <desc-barrier>`.
+    unit: Percent
+  Reached CU Workgroup Limit:
+    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
+      a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to limits
+      within the workgroup manager. This is expected to always be zero on CDNA2
+      or newer accelerators (and small for previous accelerators).
+    unit: Percent
+  Reached CU Wavefront Limit:
+    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
+      a wavefront could not be scheduled to a :doc:`CU <compute-unit>` due to limits
+      within the workgroup manager. This is expected to always be zero on CDNA2
+      or newer accelerators (and small for previous accelerators).
+    unit: Percent
+Wavefront Launch Stats:
+  Grid Size:
+    rst: The total number of work-items (or, threads) launched as a part of the kernel
+      dispatch. In HIP, this is equivalent to the total grid size multiplied by the
+      total workgroup (or, block) size.
+    unit: Work-Items
+  Workgroup Size:
+    rst: The total number of work-items (or, threads) in each workgroup (or, block)
+      launched as part of the kernel dispatch. In HIP, this is equivalent to the total
+      block size.
+    unit: Work-Items
+  Total Wavefronts:
+    rst: |-
+      The total number of wavefronts launched as part of the kernel dispatch.
+      On AMD Instinct™ CDNA™ accelerators and GCN™ GPUs, the wavefront
+      size is always 64 work-items. Thus, the total number of wavefronts should
+      be equivalent to the ceiling of grid size divided by 64.
+    unit: Wavefronts
+  Saved Wavefronts:
+    rst: The total number of wavefronts saved at a context-save. See `cwsr_enable
+      <https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr>`_.
+    unit: Wavefronts
+  Restored Wavefronts:
+    rst: The total number of wavefronts restored from a context-save. See `cwsr_enable
+      <https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr>`_.
+    unit: Wavefronts
+  VGPRs:
+    rst: |-
+      The number of architected vector general-purpose registers allocated for the
+      kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the
+      number of VGPRs requested by the compiler due to allocation granularity.
+    unit: VGPRs
+  AGPRs:
+    rst: |-
+      The number of accumulation vector general-purpose registers allocated
+      for the kernel, see :ref:`AGPRs <desc-agprs>`. Note: this may not exactly match
+      the number of AGPRs requested by the compiler due to allocation granularity.
+    unit: AGPRs
+  SGPRs:
+    rst: |-
+      The number of scalar general-purpose registers allocated for the kernel, see
+      :ref:`SALU <desc-salu>`. Note: this may not exactly match the number of
+      SGPRs requested by the compiler due to allocation granularity.
+    unit: SGPRs
+  LDS Allocation:
+    rst: |-
+      The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory)
+      allocated for this kernel. Note: This may also be larger than what was requested
+      at compile time due to both allocation granularity and dynamic per-dispatch
+      LDS allocations.
+    unit: Bytes per workgroup
+  Scratch Allocation:
+    rst: The number of bytes of :ref:`scratch memory <memory-spaces>` requested per
+      work-item for this kernel. Scratch memory is used for stack memory on the accelerator,
+      as well as for register spills and restores.
+    unit: Bytes per work-item
+Wavefront Runtime Stats:
+  Kernel Time:
+    rst: The total duration of the executed kernel.
+    unit: Nanoseconds
+  Kernel Time (Cycles):
+    rst: The total duration of the executed kernel in cycles.
+    unit: Cycles
+  Instructions per wavefront:
+    rst: The average number of instructions (of all types) executed per wavefront.
+      This is averaged over all wavefronts in a kernel dispatch.
+    unit: Instructions per wavefront
+  Wave Cycles:
+    rst: |-
+      The number of cycles a wavefront in the kernel dispatch spent resident
+      on a compute unit per :ref:`normalization unit <normalization-units>`. This is
+      averaged over all wavefronts in a kernel dispatch. Note: this should not
+      be directly compared to the kernel cycles above.
+    unit: Cycles per normalization unit
+  Dependency Wait Cycles:
+    rst: The number of cycles a wavefront in the kernel dispatch stalled waiting on
+      memory of any kind (e.g., instruction fetch, vector or scalar memory, etc.)
+      per :ref:`normalization unit <normalization-units>`. This counter is incremented
+      at every cycle by *all* wavefronts on a CU stalled at a memory operation. As
+      such, it is most useful to get a sense of how waves were spending their time,
+      rather than identification of a precise limiter because another wave could be
+      actively executing while a wave is stalled. The sum of this metric, Issue Wait
+      Cycles and Active Cycles should be equal to the total Wave Cycles metric.
+    unit: Cycles per normalization unit
+  Issue Wait Cycles:
+    rst: The number of cycles a wavefront in the kernel dispatch was unable to issue
+      an instruction for any reason (e.g., execution pipe back-pressure, arbitration
+      loss, etc.) per :ref:`normalization unit <normalization-units>`. This counter
+      is incremented at every cycle by *all* wavefronts on a CU unable to issue an
+      instruction. As such, it is most useful to get a sense of how waves were spending
+      their time, rather than identification of a precise limiter because another
+      wave could be actively executing while a wave is issue stalled. The sum of this
+      metric, Dependency Wait Cycles and Active Cycles should be equal to the total
+      Wave Cycles metric.
+    unit: Cycles per normalization unit
+  Active Cycles:
+    rst: The average number of cycles a wavefront in the kernel dispatch was actively
+      executing instructions per :ref:`normalization unit <normalization-units>`.
+      This measurement is made on a per-wavefront basis, and may include cycles that
+      another wavefront spent actively executing (on another execution unit, for example)
+      or was stalled. As such, it is most useful to get a sense of how waves were
+      spending their time, rather than identification of a precise limiter. The sum
+      of this metric, Issue Wait Cycles and Dependency Wait Cycles should be equal to
+      the total Wave Cycles metric.
+    unit: Cycles per normalization unit
+  Wavefront Occupancy:
+    rst: |-
+      The time-averaged number of wavefronts resident on the accelerator over the
+      lifetime of the kernel. Note: this metric may be inaccurate for short-running
+      kernels (less than 1ms).
+    unit: Wavefronts
+Overall Instruction Mix:
+  VALU:
+    rst: The total number of vector arithmetic logic unit (VALU) operations issued.
+      These are the workhorses of the :doc:`compute unit <compute-unit>`, and are
+      used to execute a wide range of instruction types including floating point operations,
+      non-uniform address calculations, transcendental operations, integer operations,
+      shifts, conditional evaluation, etc.
+    unit: Instructions
+  VMEM:
+    rst: The total number of vector memory operations issued. These include most loads,
+      stores and atomic operations and all accesses to :ref:`generic, global, private
+      and texture <memory-spaces>` memory.
+    unit: Instructions
+  LDS:
+    rst: The total number of LDS (also known as shared memory) operations issued.
+      These include loads, stores, atomics, and HIP's ``__shfl`` operations.
+    unit: Instructions
+  MFMA:
+    rst: The total number of matrix fused multiply-add instructions issued.
+    unit: Instructions
+  SALU:
+    rst: The total number of scalar arithmetic logic unit (SALU) operations issued.
+      Typically these are used for address calculations, literal constants, and other
+      operations that are provably uniform across a wavefront. Although scalar memory
+      (SMEM) operations are issued by the SALU, they are counted separately in this
+      section.
+    unit: Instructions
+  SMEM:
+    rst: The total number of scalar memory (SMEM) operations issued. These are typically
+      used for loading kernel arguments, base-pointers and loads from HIP's ``__constant__``
+      memory.
+    unit: Instructions
+  Branch:
+    rst: The total number of branch operations issued. These typically consist of
+      jump or branch operations and are used to implement control flow.
+    unit: Instructions
+VALU Arithmetic Instruction Mix:
+  INT32:
+    rst: The total number of instructions operating on 32-bit integer operands issued
+      to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  INT64:
+    rst: The total number of instructions operating on 64-bit integer operands issued
+      to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F16-ADD:
+    rst: The total number of addition instructions operating on 16-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F16-MUL:
+    rst: The total number of multiplication instructions operating on 16-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F16-FMA:
+    rst: The total number of fused multiply-add instructions operating on 16-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F16-Trans:
+    rst: The total number of transcendental instructions (e.g., ``sqrt``) operating
+      on 16-bit floating-point operands issued to the VALU per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F32-ADD:
+    rst: The total number of addition instructions operating on 32-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F32-MUL:
+    rst: The total number of multiplication instructions operating on 32-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F32-FMA:
+    rst: The total number of fused multiply-add instructions operating on 32-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F32-Trans:
+    rst: The total number of transcendental instructions (such as ``sqrt``) operating
+      on 32-bit floating-point operands issued to the VALU per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F64-ADD:
+    rst: The total number of addition instructions operating on 64-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F64-MUL:
+    rst: The total number of multiplication instructions operating on 64-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F64-FMA:
+    rst: The total number of fused multiply-add instructions operating on 64-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F64-Trans:
+    rst: The total number of transcendental instructions (such as ``sqrt``) operating
+      on 64-bit floating-point operands issued to the VALU per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Conversion:
+    rst: |-
+      The total number of type conversion instructions (such as converting data
+      to or from F32↔F64) issued to the VALU per :ref:`normalization unit
+      <normalization-units>`.
+    unit: Instructions per normalization unit
+VMEM Instruction Mix:
+  Global/Generic Instr:
+    rst: The total number of global & generic memory instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Read:
+    rst: The total number of global & generic memory read instructions executed on
+      all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Write:
+    rst: The total number of global & generic memory write instructions executed on
+      all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Atomic:
+    rst: The total number of global & generic memory atomic (with and without return)
+      instructions executed on all :doc:`compute units <compute-unit>` on the accelerator,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Instr:
+    rst: The total number of spill/stack memory instructions executed on all :doc:`compute
+      units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Read:
+    rst: The total number of spill/stack memory read instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Write:
+    rst: The total number of spill/stack memory write instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Atomic:
+    rst: The total number of spill/stack memory atomic (with and without return) instructions
+      executed on all :doc:`compute units <compute-unit>` on the accelerator, per
+      :ref:`normalization unit <normalization-units>`. Typically unused, as these memory
+      operations are generally used to implement thread-local storage.
+    unit: Instructions per normalization unit
+MFMA Arithmetic Instruction Mix:
+  MFMA-I8:
+    rst: The total number of 8-bit integer :ref:`MFMA <desc-mfma>` instructions issued
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  MFMA-F8:
+    rst: The total number of 8-bit floating point :ref:`MFMA <desc-mfma>` instructions
+      issued per :ref:`normalization unit <normalization-units>`. This is supported
+      in AMD Instinct MI300 series and later only.
+    unit: Instructions per normalization unit
+  MFMA-F16:
+    rst: The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` instructions
+      issued per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  MFMA-BF16:
+    rst: The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>` instructions
+      issued per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  MFMA-F32:
+    rst: The total number of 32-bit floating-point :ref:`MFMA <desc-mfma>` instructions
+      issued per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  MFMA-F64:
+    rst: The total number of 64-bit floating-point :ref:`MFMA <desc-mfma>` instructions
+      issued per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+Compute Speed-of-Light:
+  VALU FLOPs:
+    rst: |-
+      The total floating-point operations executed per second on the :ref:`VALU
+      <desc-valu>`. This is also presented as a percent of the peak theoretical
+      FLOPs achievable on the specific accelerator. Note: this does not include
+      any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GFLOPs
+  VALU IOPs:
+    rst: |-
+      The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
+      This is also presented as a percent of the peak theoretical IOPs achievable
+      on the specific accelerator. Note: this does not include any integer operations
+      from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GIOPs
+  MFMA FLOPs (F8):
+    rst: ''
+    unit: Unknown
+  MFMA FLOPs (BF16):
+    rst: |-
+      The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 16-bit brain floating
+      point operations from :ref:`VALU <desc-valu>` instructions. This is also
+      presented as a percent of the peak theoretical BF16 MFMA operations achievable
+      on the specific accelerator.
+    unit: GFLOPs
+  MFMA FLOPs (F16):
+    rst: |-
+      The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 16-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F16 MFMA operations achievable on the
+      specific accelerator.
+    unit: GFLOPs
+  MFMA FLOPs (F32):
+    rst: |-
+      The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 32-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F32 MFMA operations achievable on the
+      specific accelerator.
+    unit: GFLOPs
+  MFMA FLOPs (F64):
+    rst: |-
+      The total number of 64-bit floating point
+      :ref:`MFMA <desc-mfma>` operations executed
+      per second. Note: this does not include any
+      64-bit floating point operations from
+      :ref:`VALU <desc-valu>` instructions. This
+      is also presented as a percent of the
+      peak theoretical F64 MFMA operations
+      achievable on the specific
+      accelerator.
+    unit: GFLOPs
+  MFMA IOPs (INT8):
+    rst: |-
+      The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
+      per second. Note: this does not include any 8-bit integer operations from
+      :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
+      of the peak theoretical INT8 MFMA operations achievable on the specific accelerator.
+    unit: GIOPs
+Pipeline Statistics:
+  IPC:
+    rst: The ratio of the total number of instructions executed on the :doc:`CU <compute-unit>`
+      over the :ref:`total active CU cycles <total-active-cu-cycles>`.
+    unit: Instructions per cycle
+  IPC (Issued):
+    rst: The ratio of the total number of (non-:ref:`internal <ipc-internal-instructions>`)
+      instructions issued over the number of cycles where the :ref:`scheduler <desc-scheduler>`
+      was actively working on issuing instructions. Refer to the :ref:`Issued IPC
+      <issued-ipc>` example for further detail.
+    unit: Instructions per cycle
+  SALU Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`SALU <desc-salu>`
+      was busy executing instructions. Computed as the ratio of the total number of
+      cycles spent by the :ref:`scheduler <desc-scheduler>` issuing SALU / :ref:`SMEM
+      <desc-smem>` instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VALU Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`VALU <desc-valu>`
+      was busy executing instructions. Does not include :ref:`VMEM <desc-vmem>` operations.
+      Computed as the ratio of the total number of cycles spent by the :ref:`scheduler
+      <desc-scheduler>` issuing VALU instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VMEM Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`VMEM <desc-vmem>`
+      unit was busy executing instructions, including both global/generic and spill/scratch
+      operations (see the :ref:`VMEM instruction count metrics <ta-instruction-counts>`
+      for more detail). Does not include :ref:`VALU <desc-valu>` operations. Computed
+      as the ratio of the total number of cycles spent by the :ref:`scheduler <desc-scheduler>`
+      issuing VMEM instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  Branch Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`branch <desc-branch>`
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles spent by the :ref:`scheduler <desc-scheduler>` issuing branch instructions
+      over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VALU Active Threads:
+    rst: Indicates the average level of :ref:`divergence <desc-divergence>` within
+      a wavefront over the lifetime of the kernel. The number of work-items that were
+      active in a wavefront during execution of each :ref:`VALU <desc-valu>` instruction,
+      time-averaged over all VALU instructions run on all wavefronts in the kernel.
+    unit: Work-items
+  MFMA Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`MFMA <desc-mfma>`
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles the :ref:`MFMA <desc-mfma>` unit was busy over the :ref:`total
+      CU cycles <total-cu-cycles>`.
+    unit: Percent
+  MFMA Instruction Cycles:
+    rst: The average duration of :ref:`MFMA <desc-mfma>` instructions in this kernel
+      in cycles. Computed as the ratio of the total number of cycles the MFMA unit
+      was busy over the total number of MFMA instructions. Compare to, for example,
+      the `AMD Matrix Instruction Calculator <https://github.com/RadeonOpenCompute/amd_matrix_instruction_calculator>`_.
+    unit: Cycles per instruction
+  VMEM Latency:
+    rst: The average number of round-trip cycles (that is, from issue to data return
+      / acknowledgment) required for a VMEM instruction to complete.
+    unit: Cycles
+  SMEM Latency:
+    rst: The average number of round-trip cycles (that is, from issue to data return
+      / acknowledgment) required for a SMEM instruction to complete.
+    unit: Cycles
+Arithmetic Operations:
+  FLOPs (Total):
+    rst: The total number of floating-point operations executed on either the :ref:`VALU
+      <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
+      <normalization-units>`.
+    unit: FLOP per normalization unit
+  IOPs (Total):
+    rst: The total number of integer operations executed on either the :ref:`VALU
+      <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
+      <normalization-units>`.
+    unit: IOP per normalization unit
+  F8 OPs:
+    rst: ''
+    unit: Unknown
+  F16 OPs:
+    rst: The total number of 16-bit floating-point operations executed on either the
+      :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: FLOP per normalization unit
+  BF16 OPs:
+    rst: |-
+      The total number of 16-bit brain floating-point operations executed on
+      either the :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
+      unit <normalization-units>`. Note: on current CDNA accelerators, the VALU
+      has no native BF16 instructions.
+    unit: FLOP per normalization unit
+  F32 OPs:
+    rst: The total number of 32-bit floating-point operations executed on either the
+      :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: FLOP per normalization unit
+  F64 OPs:
+    rst: The total number of 64-bit floating-point operations executed on either the
+      :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: FLOP per normalization unit
+  INT8 OPs:
+    rst: |-
+      The total number of 8-bit integer operations executed on either the :ref:`VALU
+      <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
+      <normalization-units>`. Note: on current CDNA accelerators, the VALU has
+      no native INT8 instructions.
+    unit: IOP per normalization unit
+LDS Speed-of-Light:
+  Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`LDS <desc-lds>`
+      was actively executing instructions (including, but not limited to, load, store,
+      atomic and HIP's ``__shfl`` operations). Calculated as the ratio of the total
+      number of cycles LDS was active over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  Access Rate:
+    rst: Indicates the percentage of SIMDs in the :ref:`VALU <desc-valu>` [#lds-workload]_
+      actively issuing LDS instructions, averaged over the lifetime of the kernel.
+      Calculated as the ratio of the total number of cycles spent by the :ref:`scheduler
+      <desc-scheduler>` issuing :ref:`LDS <desc-lds>` instructions over the :ref:`total
+      CU cycles <total-cu-cycles>`.
+    unit: Percent
+  Theoretical Bandwidth Utilization:
+    rst: Indicates the maximum amount of bytes that could have been loaded from, stored
+      to, or atomically updated in the LDS, as a percentage of the theoretical peak.
+      Does *not* take into account the execution mask of the wavefront when the instruction
+      was executed. See the :ref:`LDS bandwidth example <lds-bandwidth>` for more
+      detail.
+    unit: Percent
+  Bank Conflict Rate:
+    rst: Indicates the percentage of active LDS cycles that were spent servicing bank
+      conflicts. Calculated as the ratio of LDS cycles spent servicing bank conflicts
+      over the number of LDS cycles that would have been required to move the same
+      amount of data in an uncontended access. [#lds-bank-conflict]_
+    unit: Percent
+LDS Statistics:
+  LDS Instructions:
+    rst: The total number of LDS instructions (including, but not limited to, read/write/atomics
+      and HIP's ``__shfl`` instructions) executed per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Theoretical Bandwidth:
+    rst: Indicates the maximum amount of bytes that could have been loaded from, stored
+      to, or atomically updated in the LDS divided by total duration. Does *not* take
+      into account the execution mask of the wavefront when the instruction was executed.
+      See the :ref:`LDS bandwidth example <lds-bandwidth>` for more detail.
+    unit: Gbps
+  LDS Latency:
+    rst: The average number of round-trip cycles (i.e., from issue to data-return
+      acknowledgment) required for an LDS instruction to complete.
+    unit: Cycles
+  Bank Conflicts/Access:
+    rst: The ratio of the number of cycles spent in the :ref:`LDS scheduler <desc-lds>`
+      due to bank conflicts (as determined by the conflict resolution hardware) to
+      the base number of cycles that would be spent in the LDS scheduler in a completely
+      uncontended case. This is the unnormalized form of the Bank Conflict Rate.
+    unit: Conflicts per Access
+  Index Accesses:
+    rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` over
+      all operations per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Atomic Return Cycles:
+    rst: The total number of cycles spent on LDS atomics with return per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Bank Conflict:
+    rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due
+      to bank conflicts (as determined by the conflict resolution hardware) per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Addr Conflict:
+    rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due
+      to address conflicts (as determined by the conflict resolution hardware) per
+      :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Unaligned Stall:
+    rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due
+      to stalls from non-dword aligned addresses per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Mem Violations:
+    rst: |-
+      The total number of out-of-bounds accesses made to the LDS, per :ref:`normalization
+      unit <normalization-units>`. This is unused and expected to be zero in
+      most configurations for modern CDNA™ accelerators.
+    unit: Accesses per normalization unit
+L1I Speed-of-Light:
+  Bandwidth Utilization:
+    rst: The number of bytes looked up in the L1I cache, as a percent of the peak
+      theoretical bandwidth. Calculated as the ratio of L1I requests over the :ref:`total
+      L1I cycles <total-l1i-cycles>`.
+    unit: Percent
+  Cache Hit Rate:
+    rst: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded
+      line in the cache. Calculated as the ratio of the number of L1I requests that hit
+      over the number of all L1I requests.
+    unit: Percent
+  L1I-L2 Bandwidth Utilization:
+    rst: |-
+      The percent of the peak theoretical L1I → L2 cache request bandwidth
+      achieved. Calculated as the ratio of the total number of requests from
+      the L1I to the L2 cache over the :ref:`total L1I-L2 interface cycles <total-l1i-cycles>`.
+    unit: Percent
+L1I cache accesses:
+  Req:
+    rst: The total number of requests made to the L1I per :ref:`normalization-unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Hits:
+    rst: The total number of L1I requests that hit on a previously loaded cache line,
+      per :ref:`normalization-unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Misses - Non Duplicated:
+    rst: The total number of L1I requests that missed on a cache line that *were not*
+      already pending due to another request, per :ref:`normalization-unit <normalization-units>`.
+      See note in :ref:`desc-l1i-sol` for more detail.
+    unit: Requests per normalization unit
+  Misses - Duplicated:
+    rst: The total number of L1I requests that missed on a cache line that *were*
+      already pending due to another request, per :ref:`normalization-unit <normalization-units>`.
+      See note in :ref:`desc-l1i-sol` for more detail.
+    unit: Requests per normalization unit
+  Cache Hit Rate:
+    rst: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded
+      line in the cache. Calculated as the ratio of the number of L1I requests that hit
+      over the number of all L1I requests.
+    unit: Percent
+  Instruction Fetch Latency:
+    rst: The average number of cycles spent to fetch instructions to a :doc:`CU <compute-unit>`.
+    unit: Cycles
+L1I <-> L2 interface:
+  L1I-L2 Bandwidth:
+    rst: Total number of bytes transferred across L1I - L2 interface divided by total
+      duration.
+    unit: Gbps
+Scalar L1D Speed-of-Light:
+  Bandwidth Utilization:
+    rst: The number of bytes looked up in the sL1D cache, as a percent of the peak
+      theoretical bandwidth. Calculated as the ratio of sL1D requests over the :ref:`total
+      sL1D cycles <total-sl1d-cycles>`.
+    unit: Percent
+  Cache Hit Rate:
+    rst: Indicates the percent of sL1D requests that hit on a previously loaded line
+      in the cache. The ratio of the number of sL1D requests that hit [#sl1d-cache]_
+      over the number of all sL1D requests.
+    unit: Percent
+  sL1D-L2 BW Utilization:
+    rst: The percentage of the peak theoretical sL1D - L2 interface bandwidth achieved.
+      Calculated as the total number of bytes read from, written to, or atomically updated
+      across the sL1D - L2 interface.
+    unit: Percent
+Scalar L1D cache accesses:
+  Req:
+    rst: The total number of requests, of any size or type, made to the sL1D per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Hits:
+    rst: The total number of sL1D requests that hit on a previously loaded cache line,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Misses - Non Duplicated:
+    rst: The total number of sL1D requests that missed on a cache line that *was not*
+      already pending due to another request, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`desc-sl1d-sol` for more detail.
+    unit: Requests per normalization unit
+  Misses- Duplicated:
+    rst: The total number of sL1D requests that missed on a cache line that *was*
+      already pending due to another request, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`desc-sl1d-sol` for more detail.
+    unit: Requests per normalization unit
+  Cache Hit Rate:
+    rst: Indicates the percent of sL1D requests that hit on a previously loaded line
+      in the cache. The ratio of the number of sL1D requests that hit [#sl1d-cache]_
+      over the number of all sL1D requests.
+    unit: Percent
+  Read Req (Total):
+    rst: The total number of sL1D read requests of any size, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Atomic Req:
+    rst: The total number of atomic requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`. Typically unused on current
+      CDNA accelerators.
+    unit: Requests per normalization unit
+  Read Req (1 DWord):
+    rst: The total number of sL1D read requests made for a single dword of data (4B),
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Read Req (2 DWord):
+    rst: The total number of sL1D read requests made for two dwords of data (8B),
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Read Req (4 DWord):
+    rst: The total number of sL1D read requests made for four dwords of data (16B),
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Read Req (8 DWord):
+    rst: The total number of sL1D read requests made for eight dwords of data (32B),
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Read Req (16 DWord):
+    rst: The total number of sL1D read requests made for sixteen dwords of data
+      (64B), per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+Scalar L1D Cache - L2 Interface:
+  sL1D-L2 BW:
+    rst: |-
+      The total number of bytes read from, written to, or atomically updated
+      across the sL1D ↔ :doc:`L2 <l2-cache>` interface, divided by total duration.
+      Note that sL1D writes and atomics are typically
+      unused on current CDNA accelerators, so in the majority of cases this can
+      be interpreted as an sL1D→L2 read bandwidth.
+    unit: Gbps
+  Read Req:
+    rst: The total number of read requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Write Req:
+    rst: The total number of write requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`. Typically unused on current
+      CDNA accelerators.
+    unit: Requests per normalization unit
+  Atomic Req:
+    rst: The total number of atomic requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`. Typically unused on current
+      CDNA accelerators.
+    unit: Requests per normalization unit
+  Stall Cycles:
+    rst: |-
+      The total number of cycles the sL1D ↔ :doc:`L2 <l2-cache>` interface
+      was stalled, per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+Busy and stall metrics:
+  Address Processing Unit Busy:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor
+      was busy
+    unit: Percent
+  Address Stall:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor
+      was stalled from sending address requests further into the vL1D pipeline
+    unit: Percent
+  Data Stall:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor
+      was stalled from sending write/atomic data further into the vL1D pipeline
+    unit: Percent
+  Data-Processor → Address Stall:
+    rst: Percent of :ref:`total CU cycles <total-cu-cycles>` the address processor
+      was stalled waiting to send command data to the :ref:`data processor <desc-td>`
+    unit: Percent
+  Sequencer → TA Address Stall:
+    rst: ''
+    unit: Unknown
+  Sequencer → TA Command Stall:
+    rst: ''
+    unit: Unknown
+  Sequencer → TA Data Stall:
+    rst: ''
+    unit: Unknown
+Instruction counts:
+  Total Instructions:
+    rst: The total number of memory instructions executed by the address processor
+      over all compute units on the accelerator, per normalization unit.
+    unit: Instructions per normalization unit
+  Global/Generic Instructions:
+    rst: The total number of global & generic memory instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Read Instructions:
+    rst: The total number of global & generic memory read instructions executed on
+      all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Write Instructions:
+    rst: The total number of global & generic memory write instructions executed on
+      all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Atomic Instructions:
+    rst: The total number of global & generic memory atomic (with and without return)
+      instructions executed on all :doc:`compute units <compute-unit>` on the accelerator,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Instructions:
+    rst: The total number of spill/stack memory instructions executed on all :doc:`compute
+      units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Read Instructions:
+    rst: The total number of spill/stack memory read instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Write Instructions:
+    rst: The total number of spill/stack memory write instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Atomic Instructions:
+    rst: The total number of spill/stack memory atomic (with and without return) instructions
+      executed on all :doc:`compute units <compute-unit>` on the accelerator, per
+      :ref:`normalization unit <normalization-units>`. Typically unused, as these memory
+      operations are mainly used to implement thread-local storage.
+    unit: Instructions per normalization unit
+Spill and stack metrics:
+  Spill/Stack Total Cycles:
+    rst: The number of cycles the address processing unit spent working on spill/stack
+      instructions, per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Spill/Stack Coalesced Read:
+    rst: The number of cycles the address processing unit spent working on coalesced
+      spill/stack read instructions, per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Spill/Stack Coalesced Write:
+    rst: The number of cycles the address processing unit spent working on coalesced
+      spill/stack write instructions, per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+Vector L1 data-return path or Texture Data (TD):
+  Data-Return Busy:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return unit
+      was busy processing or waiting on data to return to the :doc:`CU <compute-unit>`.
+    unit: Percent
+  Cache RAM → Data-Return Stall:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return unit
+      was stalled on data to be returned from the :ref:`vL1D Cache RAM <desc-tc>`.
+    unit: Percent
+  Workgroup manager → Data-Return Stall:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return unit
+      was stalled by the :ref:`workgroup manager <desc-spi>` due to initialization
+      of registers as a part of launching new workgroups.
+    unit: Percent
+  Coalescable Instructions:
+    rst: The number of instructions submitted to the :ref:`data-return unit <desc-td>`
+      by the :ref:`address processor <desc-ta>` that were found to be coalescable,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Read Instructions:
+    rst: The number of read instructions submitted to the :ref:`data-return unit <desc-td>`
+      by the :ref:`address processor <desc-ta>` summed over all :doc:`compute units
+      <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+      This is expected to be the sum of global/generic and spill/stack reads in the
+      :ref:`address processor <desc-ta>`.
+    unit: Instructions per normalization unit
+  Write Instructions:
+    rst: The number of store instructions submitted to the :ref:`data-return unit
+      <desc-td>` by the :ref:`address processor <desc-ta>` summed over all :doc:`compute
+      units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+      This is expected to be the sum of global/generic and spill/stack stores counted
+      by the :ref:`vL1D cache-front-end <ta-instruction-counts>`.
+    unit: Instructions per normalization unit
+  Atomic Instructions:
+    rst: The number of atomic instructions submitted to the :ref:`data-return unit
+      <desc-td>` by the :ref:`address processor <desc-ta>` summed over all :doc:`compute
+      units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+      This is expected to be the sum of global/generic and spill/stack atomics in
+      the :ref:`address processor <desc-ta>`.
+    unit: Instructions per normalization unit
+vL1D Speed-of-Light:
+  Hit rate:
+    rst: The ratio of the number of vL1D cache line requests that hit [#vl1d-hit]_
+      in vL1D cache over the total number of cache line requests to the :ref:`vL1D
+      Cache RAM <desc-tc>`.
+    unit: Percent
+  Bandwidth Utilization:
+    rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
+      <desc-vmem>` instructions, as a percent of the peak theoretical bandwidth achievable
+      on the specific accelerator. The number of bytes is calculated as the number
+      of cache lines requested multiplied by the cache line size. This value does
+      not consider partial requests, so for instance, if only a single value is requested
+      in a cache line, the data movement will still be counted as a full cache line.
+    unit: Percent
+  Utilization:
+    rst: Indicates how busy the :ref:`vL1D Cache RAM <desc-tc>` was during the kernel
+      execution. The number of cycles where the vL1D Cache RAM is actively processing
+      any request divided by the number of cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  Coalescing:
+    rst: Indicates how well memory instructions were coalesced by the :ref:`address
+      processing unit <desc-ta>`, ranging from uncoalesced (25%) to fully coalesced
+      (100%). Calculated as the average number of :ref:`thread-requests <thread-requests>`
+      generated per instruction divided by the ideal number of thread-requests per
+      instruction.
+    unit: Percent
+vL1D cache stall metrics:
+  Stalled on L2 Data:
+    rst: The ratio of the number of cycles where the vL1D is stalled waiting for requested
+      data to return from the :doc:`L2 cache <l2-cache>` divided by the number of
+      cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  Stalled on L2 Req:
+    rst: The ratio of the number of cycles where the vL1D is stalled waiting to issue
+      a request for data to the :doc:`L2 cache <l2-cache>` divided by the number of
+      cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  Tag RAM Stall (Read):
+    rst: The ratio of the number of cycles where the vL1D is stalled due to Read requests
+      with conflicting tags being looked up concurrently, divided by the number of
+      cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  Tag RAM Stall (Write):
+    rst: The ratio of the number of cycles where the vL1D is stalled due to Write
+      requests with conflicting tags being looked up concurrently, divided by the
+      number of cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  Tag RAM Stall (Atomic):
+    rst: The ratio of the number of cycles where the vL1D is stalled due to Atomic
+      requests with conflicting tags being looked up concurrently, divided by the
+      number of cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+vL1D cache access metrics:
+  Total Req:
+    rst: The total number of incoming requests from the :ref:`address processing unit
+      <desc-ta>` after coalescing.
+    unit: Requests
+  Read Req:
+    rst: The total number of incoming read requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`
+    unit: Requests per normalization unit
+  Write Req:
+    rst: The total number of incoming write requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`
+    unit: Requests per normalization unit
+  Atomic Req:
+    rst: The total number of incoming atomic requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`
+    unit: Requests per normalization unit
+  Cache BW:
+    rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
+      <desc-vmem>` instructions divided by total duration. The number of bytes is
+      calculated as the number of cache lines requested multiplied by the cache line
+      size. This value does not consider partial requests, so for instance, if only
+      a single value is requested in a cache line, the data movement will still be
+      counted as a full cache line.
+    unit: Gbps
+  Cache Hit Rate:
+    rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache
+      over the total number of cache line requests to the :ref:`vL1D Cache RAM <desc-tc>`.
+    unit: Percent
+  Cache Accesses:
+    rst: The total number of cache line lookups in the vL1D.
+    unit: Cache lines
+  Cache Hits:
+    rst: The number of cache accesses minus the number of outgoing requests to the
+      :doc:`L2 cache <l2-cache>`, that is, the number of cache line requests serviced
+      by the :ref:`vL1D Cache RAM <desc-tc>` per :ref:`normalization unit <normalization-units>`.
+    unit: Cache lines per normalization unit
+  Invalidations:
+    rst: The number of times the vL1D was issued a write-back invalidate command during
+      the kernel's execution per :ref:`normalization unit <normalization-units>`.
+      This may be triggered by, for instance, the ``buffer_wbinvl1`` instruction.
+    unit: Invalidations per normalization unit
+  L1-L2 BW:
+    rst: The number of bytes transferred across the vL1D-L2 interface as a result
+      of :ref:`VMEM <desc-vmem>` instructions, divided by total duration. The number
+      of bytes is calculated as the number of cache lines requested multiplied by
+      the cache line size. This value does not consider partial requests, so for instance,
+      if only a single value is requested in a cache line, the data movement will
+      still be counted as a full cache line.
+    unit: Gbps
+  L1-L2 Read:
+    rst: The number of read requests for a vL1D cache line that were not satisfied
+      by the vL1D and must be retrieved from the :doc:`L2 Cache <l2-cache>`
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  L1-L2 Write:
+    rst: The number of write requests to a vL1D cache line that were sent through
+      the vL1D to the :doc:`L2 cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  L1-L2 Atomic:
+    rst: The number of atomic requests that are sent through the vL1D to the :doc:`L2
+      cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`. This
+      includes requests for atomics with, and without return.
+    unit: Requests per normalization unit
+L1D - L2 Transactions:
+  NC - Read:
+    rst: Total read requests with NC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  UC - Read:
+    rst: Total read requests with UC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  CC - Read:
+    rst: Total read requests with CC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  RW - Read:
+    rst: Total read requests with RW mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  RW - Write:
+    rst: Total write requests with RW mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  NC - Write:
+    rst: Total write requests with NC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  UC - Write:
+    rst: Total write requests with UC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  CC - Write:
+    rst: Total write requests with CC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  NC - Atomic:
+    rst: Total atomic requests with NC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  UC - Atomic:
+    rst: Total atomic requests with UC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  CC - Atomic:
+    rst: Total atomic requests with CC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  RW - Atomic:
+    rst: Total atomic requests with RW mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+L1 Unified Translation Cache (UTCL1):
+  Req:
+    rst: The number of translation requests made to the UTCL1 per normalization unit.
+    unit: Requests per normalization unit
+  Hit Ratio:
+    rst: The ratio of the number of translation requests that hit in the UTCL1 divided
+      by the total number of translation requests made to the UTCL1.
+    unit: Percent
+  Hits:
+    rst: The number of translation requests that hit in the UTCL1, and could be reused,
+      per normalization unit.
+    unit: Requests per normalization unit
+  Translation Misses:
+    rst: The total number of translation requests that missed in the UTCL1 due to
+      translation not being present in the cache, per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Permission Misses:
+    rst: |-
+      The total number of translation requests that missed in the UTCL1 due
+      to a permission error, per :ref:`normalization unit <normalization-units>`.
+      This is unused and expected to be zero in most configurations for modern
+      CDNA™ accelerators.
+    unit: Requests per normalization unit
+L1D Addr Translation Stalls: {}
+L2 Speed-of-Light:
+  Utilization:
+    rst: The ratio of the :ref:`number of cycles an L2 channel was active, summed
+      over all L2 channels on the accelerator <total-active-l2-cycles>` over the :ref:`total
+      L2 cycles <total-l2-cycles>`.
+    unit: Percent
+  Peak Bandwidth:
+    rst: The number of bytes looked up in the L2 cache, as a percent of the peak theoretical
+      bandwidth achievable on the specific accelerator. The number of bytes is calculated
+      as the number of cache lines requested multiplied by the cache line size. This
+      value does not consider partial requests, so e.g., if only a single value is
+      requested in a cache line, the data movement will still be counted as a full
+      cache line.
+    unit: Percent
+  Hit Rate:
+    rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
+      over the total number of incoming cache line requests to the L2 cache.
+    unit: Percent
+  L2-Fabric Read BW:
+    rst: The number of bytes read by the L2 over the :ref:`Infinity Fabric interface
+      <l2-fabric>` per unit time.
+    unit: GB/s
+  L2-Fabric Write and Atomic BW:
+    rst: The number of bytes sent by the L2 over the :ref:`Infinity Fabric interface
+      <l2-fabric>` by write and atomic operations per unit time.
+    unit: GB/s
+  HBM Bandwidth:
+    rst: Maximum theoretical bandwidth of the accelerator's local high-bandwidth memory
+      (HBM) per unit time. This value is calculated as the number of HBM channels
+      multiplied by the HBM channel width multiplied by the HBM clock frequency.
+    unit: GB/s
+L2-Fabric interface metrics:
+  Read BW:
+    rst: The total number of bytes read by the L2 cache from Infinity Fabric divided
+      by total duration.
+    unit: Gbps
+  HBM Read Traffic:
+    rst: The percent of read requests generated by the L2 cache that are routed to
+      the accelerator's local high-bandwidth memory (HBM). This breakdown does not
+      consider the *size* of the request (meaning that 32B and 64B requests are both
+      counted as a single request), so this metric only *approximates* the percent
+      of the L2-Fabric Read bandwidth directed to the local HBM.
+    unit: Percent
+  Remote Read Traffic:
+    rst: The percent of read requests generated by the L2 cache that are routed to
+      any memory location other than the accelerator's local high-bandwidth memory
+      (HBM) -- for example, the CPU's DRAM or a remote accelerator's HBM. This breakdown
+      does not consider the *size* of the request (meaning that 32B and 64B requests
+      are both counted as a single request), so this metric only *approximates* the
+      percent of the L2-Fabric Read bandwidth directed to a remote location.
+    unit: Percent
+  Uncached Read Traffic:
+    rst: The percent of read requests generated by the L2 cache that are reading from
+      an :ref:`uncached memory allocation <memory-type>`. Note, as described in the
+      :ref:`request flow <l2-request-flow>` section, a single 64B read request is
+      typically counted as two uncached read requests. So, it is possible for the
+      Uncached Read Traffic to reach up to 200% of the total number of read requests.
+      This breakdown does not consider the *size* of the request (i.e., 32B and 64B
+      requests are both counted as a single request), so this metric only *approximates*
+      the percent of the L2-Fabric read bandwidth directed to an uncached memory location.
+    unit: Percent
+  Write and Atomic BW:
+    rst: The total number of bytes written by the L2 over Infinity Fabric by write
+      and atomic operations divided by total duration. Note that on current CDNA accelerators,
+      such as the :ref:`MI2XX <mixxx-note>`, requests are only considered *atomic*
+      by Infinity Fabric if they are targeted at non-write-cacheable memory, for example,
+      :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached memory
+      <memory-type>` allocations on the MI2XX.
+    unit: Gbps
+  HBM Write and Atomic Traffic:
+    rst: The percent of write and atomic requests generated by the L2 cache that are
+      routed to the accelerator's local high-bandwidth memory (HBM). This breakdown
+      does not consider the *size* of the request (meaning that 32B and 64B requests
+      are both counted as a single request), so this metric only *approximates* the
+      percent of the L2-Fabric Write and Atomic bandwidth directed to the local HBM.
+      Note that on current CDNA accelerators, such as the :ref:`MI2XX <mixxx-note>`,
+      requests are only considered *atomic* by Infinity Fabric if they are targeted
+      at :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached memory
+      <memory-type>` allocations.
+    unit: Percent
+  Remote Write and Atomic Traffic:
+    rst: The percent of write and atomic requests generated by the L2 cache that are
+      routed to any memory location other than the accelerator's local high-bandwidth memory
+      (HBM) -- for example, the CPU's DRAM or a remote accelerator's HBM. This breakdown
+      does not consider the *size* of the request (meaning that 32B and 64B requests
+      are both counted as a single request), so this metric only *approximates* the
+      percent of the L2-Fabric Write and Atomic bandwidth directed to a remote location.
+      Note that on current CDNA accelerators, such as the :ref:`MI2XX <mixxx-note>`, requests
+      are only considered *atomic* by Infinity Fabric if they are targeted at :ref:`fine-grained
+      memory <memory-type>` allocations or :ref:`uncached memory <memory-type>` allocations.
+    unit: Percent
+  Atomic Traffic:
+    rst: The percent of write requests generated by the L2 cache that are atomic requests
+      to *any* memory location. This breakdown does not consider the *size* of the
+      request (meaning that 32B and 64B requests are both counted as a single request),
+      so this metric only *approximates* the percent of the L2-Fabric Write and Atomic
+      bandwidth used by atomic requests. Note that on current CDNA accelerators, such
+      as the :ref:`MI2XX <mixxx-note>`, requests are only considered *atomic* by Infinity
+      Fabric if they are targeted at :ref:`fine-grained memory <memory-type>` allocations
+      or :ref:`uncached memory <memory-type>` allocations.
+    unit: Percent
+  Uncached Write and Atomic Traffic:
+    rst: The percent of write and atomic requests generated by the L2 cache that are
+      targeting :ref:`uncached memory allocations <memory-type>`. This breakdown does
+      not consider the *size* of the request (meaning that 32B and 64B requests are
+      both counted as a single request), so this metric only *approximates* the percent
+      of the L2-Fabric Write and Atomic bandwidth directed to uncached memory allocations.
+    unit: Percent
+  Read Latency:
+    rst: The time-averaged number of cycles read requests spent in Infinity Fabric
+      before data was returned to the L2.
+    unit: Cycles
+  Write and Atomic Latency:
+    rst: The time-averaged number of cycles write requests spent in Infinity Fabric
+      before a completion acknowledgement was returned to the L2.
+    unit: Cycles
+  Atomic Latency:
+    rst: The time-averaged number of cycles atomic requests spent in Infinity Fabric
+      before a completion acknowledgement (atomic without return value) or data (atomic
+      with return value) was returned to the L2.
+    unit: Cycles
+L2 Cache Accesses:
+  Bandwidth:
+    rst: The number of bytes looked up in the L2 cache, divided by total duration.
+      The number of bytes is calculated as the number of cache lines requested multiplied
+      by the cache line size. This value does not consider partial requests, so for
+      example, if only a single value is requested in a cache line, the data movement
+      will still be counted as a full cache line.
+    unit: Gbps
+  Req:
+    rst: The total number of incoming requests to the L2 from all clients for all
+      request types, per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Read Req:
+    rst: The total number of read requests to the L2 from all clients.
+    unit: Requests per normalization unit
+  Write Req:
+    rst: The total number of write requests to the L2 from all clients.
+    unit: Requests per normalization unit
+  Atomic Req:
+    rst: The total number of atomic requests (with and without return) to the L2 from
+      all clients.
+    unit: Requests per normalization unit
+  Streaming Req:
+    rst: The total number of incoming requests to the L2 that are marked as *streaming*.
+      The exact meaning of this may differ depending on the targeted accelerator,
+      however on an :ref:`MI2XX <mixxx-note>` this corresponds to `non-temporal load
+      or stores <https://clang.llvm.org/docs/LanguageExtensions.html#non-temporal-load-store-builtins>`_.
+      The L2 cache attempts to evict *streaming* requests before normal requests when
+      the L2 is at capacity.
+    unit: Requests per normalization unit
+  Probe Req:
+    rst: The number of coherence probe requests made to the L2 cache from outside
+      the accelerator. On an :ref:`MI2XX <mixxx-note>`, probe requests may be generated
+      by, for example, writes to :ref:`fine-grained device <memory-type>` memory or
+      by writes to :ref:`coarse-grained <memory-type>` device memory.
+    unit: Requests per normalization unit
+  Cache Hit:
+    rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
+      over the total number of incoming cache line requests to the L2 cache.
+    unit: Percent
+  Hits:
+    rst: The total number of requests to the L2 from all clients that hit in the cache.
+      As noted in the :ref:`Speed-of-Light <l2-sol>` section, this includes hit-on-miss
+      requests.
+    unit: Requests per normalization unit
+  Misses:
+    rst: The total number of requests to the L2 from all clients that miss in the
+      cache. As noted in the :ref:`Speed-of-Light <l2-sol>` section, these do not
+      include hit-on-miss requests.
+    unit: Requests per normalization unit
+  Writeback:
+    rst: The total number of L2 cache lines written back to memory for any reason.
+      Write-backs may occur due to user code (such as HIP kernel calls to ``__threadfence_system``
+      or atomic built-ins), by the :doc:`command processor <command-processor>`'s memory
+      acquire/release fences, or for other internal hardware reasons.
+    unit: Cache lines per normalization unit
+  Writeback (Internal):
+    rst: The total number of L2 cache lines written back to memory for internal hardware
+      reasons, per :ref:`normalization unit <normalization-units>`.
+    unit: Cache lines per normalization unit
+  Writeback (vL1D Req):
+    rst: The total number of L2 cache lines written back to memory due to requests
+      initiated by the :doc:`vL1D cache <vector-l1-cache>`, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Cache lines per normalization unit
+  Evict (Internal):
+    rst: The total number of L2 cache lines evicted from the cache due to capacity
+      limits, per :ref:`normalization unit <normalization-units>`.
+    unit: Cache lines per normalization unit
+  Evict (vL1D Req):
+    rst: The total number of L2 cache lines evicted from the cache due to invalidation
+      requests initiated by the :doc:`vL1D cache <vector-l1-cache>`, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Cache lines per normalization unit
+  NC Req:
+    rst: The total number of requests to the L2 to Not-hardware-Coherent (NC) memory
+      allocations, per :ref:`normalization unit <normalization-units>`. See the :ref:`memory-type`
+      for more information.
+    unit: Requests per normalization unit
+  UC Req:
+    rst: The total number of requests to the L2 that go to Uncached (UC) memory allocations.
+      See the :ref:`memory-type` for more information.
+    unit: Requests per normalization unit
+  CC Req:
+    rst: The total number of requests to the L2 that go to Coherently Cacheable (CC)
+      memory allocations. See the :ref:`memory-type` for more information.
+    unit: Requests per normalization unit
+  RW Req:
+    rst: The total number of requests to the L2 that go to Read-Write coherent memory
+      (RW) allocations. See the :ref:`memory-type` for more information.
+    unit: Requests per normalization unit
+L2 Cache Stalls: {}
+L2 - Fabric Interface stalls:
+  Write - Credit Starvation:
+    rst: The number of cycles the L2-Fabric interface was stalled on write or atomic
+      requests to any memory location because too many write/atomic requests were
+      currently in flight, as a percent of the :ref:`total active L2 cycles <total-active-l2-cycles>`.
+    unit: Percent
+L2 - Fabric interface detailed metrics:
+  Read (32B):
+    rst: The total number of L2 requests to Infinity Fabric to read 32B of data from
+      any memory location, per :ref:`normalization unit <normalization-units>`. See
+      :ref:`l2-request-flow` for more detail. Typically unused on CDNA accelerators.
+    unit: Requests per normalization unit
+  Read (64B):
+    rst: The total number of L2 requests to Infinity Fabric to read 64B of data from
+      any memory location, per :ref:`normalization unit <normalization-units>`. See
+      :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Read (Uncached):
+    rst: The total number of L2 requests to Infinity Fabric to read :ref:`uncached
+      data <memory-type>` from any memory location, per :ref:`normalization unit <normalization-units>`.
+      64B requests for uncached data are counted as two 32B uncached data requests.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  HBM Read:
+    rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of
+      data from the accelerator's local HBM, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Remote Read:
+    rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of
+      data from any source other than the accelerator's local HBM, per :ref:`normalization
+      unit <normalization-units>`. See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Write and Atomic (32B):
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically
+      update 32B of data to any memory location, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Write and Atomic (Uncached):
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically
+      update 32B or 64B of :ref:`uncached data <memory-type>`, per :ref:`normalization
+      unit <normalization-units>`. See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Write and Atomic (64B):
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically
+      update 64B of data in any memory location, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  HBM Write and Atomic:
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically
+      update 32B or 64B of data in the accelerator's local HBM, per :ref:`normalization
+      unit <normalization-units>`. See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Remote Write and Atomic:
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically
+      update 32B or 64B of data in any memory location other than the accelerator's
+      local HBM, per :ref:`normalization unit <normalization-units>`. See :ref:`l2-request-flow`
+      for more detail.
+    unit: Requests per normalization unit
+  Atomic:
+    rst: The total number of L2 requests to Infinity Fabric to atomically update 32B
+      or 64B of data in any memory location, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail. Note that on current CDNA accelerators,
+      such as the :ref:`MI2XX <mixxx-note>`, requests are only considered *atomic*
+      by Infinity Fabric if they are targeted at non-write-cacheable memory, such
+      as :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached memory
+      <memory-type>` allocations on the MI2XX.
+    unit: Requests per normalization unit
+Aggregate Stats (All channels):
+  L2 Cache Hit Rate:
+    rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
+      over the total number of incoming cache line requests to the L2 cache. As noted
+      in the :ref:`Speed-of-Light <l2-sol>` section, hits include hit-on-miss requests.
+    unit: Percent
+L2 Cache Hit Rate (pct):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2 Requests (per normUnit):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Requests (per normUnit):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Read Latency (Cycles):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Write and Atomic Latency (Cycles):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Atomic Latency (Cycles):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Read Stall (Cycles per normUnit):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Write and Atomic Stall (Cycles per normUnit):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric (128B read requests per normUnit):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
diff --git a/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx941_metrics_description.yaml b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx941_metrics_description.yaml
new file mode 100644
index 0000000000..5a978ccee5
--- /dev/null
+++ b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx941_metrics_description.yaml
@@ -0,0 +1,2040 @@
+System Speed-of-Light:
+  VALU FLOPs:
+    rst: |-
+      The total floating-point operations executed per second on the :ref:`VALU
+      <desc-valu>`. This is also presented as a percent of the peak theoretical
+      FLOPs achievable on the specific accelerator. Note: this does not include
+      any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GFLOPs
+  VALU IOPs:
+    rst: |-
+      The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
+      This is also presented as a percent of the peak theoretical IOPs achievable
+      on the specific accelerator. Note: this does not include any integer operations
+      from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GIOPs
+  MFMA FLOPs (F8):
+    rst: |-
+      The total number of 8-bit floating point :ref:`MFMA <desc-mfma>`
+      operations executed per second. Note: this does not include any 8-bit
+      floating point operations from :ref:`VALU <desc-valu>` instructions. This
+      is also presented as a percent of the peak theoretical F8 MFMA operations
+      achievable on the specific accelerator. It is supported on AMD Instinct MI300
+      series and later only.
+    unit: GFLOPs
+  MFMA FLOPs (BF16):
+    rst: |-
+      The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
+      operations executed per second. Note: this does not include any 16-bit brain
+      floating point operations from :ref:`VALU <desc-valu>` instructions. This
+      is also presented as a percent of the peak theoretical BF16 MFMA operations
+      achievable on the specific accelerator.
+    unit: GFLOPs
+  MFMA FLOPs (F16):
+    rst: |-
+      The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 16-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F16 MFMA operations achievable on the
+      specific accelerator.
+    unit: GFLOPs
+  MFMA FLOPs (F32):
+    rst: |-
+      The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 32-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F32 MFMA operations achievable on the
+      specific accelerator.
+    unit: GFLOPs
+  MFMA FLOPs (F64):
+    rst: |-
+      The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 64-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F64 MFMA operations achievable on the
+      specific accelerator.
+    unit: GFLOPs
+  MFMA IOPs (Int8):
+    rst: |-
+      The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
+      per second. Note: this does not include any 8-bit integer operations from
+      :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
+      of the peak theoretical INT8 MFMA operations achievable on the specific accelerator.
+    unit: GIOPs
+  Active CUs:
+    rst: Total number of active compute units (CUs) on the accelerator during the
+      kernel execution.
+    unit: Number
+  SALU Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`SALU <desc-salu>`
+      was busy executing instructions. Computed as the ratio of the total number of
+      cycles spent by the :ref:`scheduler <desc-scheduler>` issuing SALU / :ref:`SMEM
+      <desc-smem>` instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VALU Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`VALU <desc-valu>`
+      was busy executing instructions. Does not include :ref:`VMEM <desc-vmem>` operations.
+      Computed as the ratio of the total number of cycles spent by the :ref:`scheduler
+      <desc-scheduler>` issuing VALU instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  MFMA Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`MFMA <desc-mfma>`
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles the :ref:`MFMA <desc-mfma>` unit was busy over the :ref:`total
+      CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VMEM Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`VMEM <desc-vmem>`
+      unit was busy executing instructions, including both global/generic and spill/scratch
+      operations (see the :ref:`VMEM instruction count metrics <ta-instruction-counts>`
+      for more detail). Does not include :ref:`VALU <desc-valu>` operations. Computed
+      as the ratio of the total number of cycles spent by the :ref:`scheduler <desc-scheduler>`
+      issuing VMEM instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  Branch Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`branch <desc-branch>`
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles spent by the :ref:`scheduler <desc-scheduler>` issuing branch instructions
+      over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VALU Active Threads:
+    rst: Indicates the average level of :ref:`divergence <desc-divergence>` within
+      a wavefront over the lifetime of the kernel. The number of work-items that were
+      active in a wavefront during execution of each :ref:`VALU <desc-valu>` instruction,
+      time-averaged over all VALU instructions run on all wavefronts in the kernel.
+    unit: Work-items
+  IPC:
+    rst: The ratio of the total number of instructions executed on the :doc:`CU <compute-unit>`
+      over the :ref:`total active CU cycles <total-active-cu-cycles>`.
+    unit: Instructions per-cycle
+  Wavefront Occupancy:
+    rst: |-
+      The time-averaged number of wavefronts resident on the accelerator over
+      the lifetime of the kernel. Note: this metric may be inaccurate for short-running
+      kernels (less than 1ms). This is also presented as a percent of the peak theoretical
+      occupancy achievable on the specific accelerator.
+    unit: Wavefronts
+  Theoretical LDS Bandwidth:
+    rst: Indicates the maximum amount of bytes that could have been loaded from, stored
+      to, or atomically updated in the LDS per unit time (see :ref:`LDS Bandwidth
+      <lds-bandwidth>` example for more detail). This is also presented as a percent
+      of the peak theoretical bandwidth achievable on the specific accelerator.
+    unit: GB/s
+  LDS Bank Conflicts/Access:
+    rst: The ratio of the number of cycles spent in the :doc:`LDS scheduler <local-data-share>`
+      due to bank conflicts (as determined by the conflict resolution hardware) to
+      the base number of cycles that would be spent in the LDS scheduler in a completely
+      uncontended case. This is also presented in normalized form (i.e., the Bank
+      Conflict Rate).
+    unit: Conflicts/Access
+  vL1D Cache Hit Rate:
+    rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache
+      over the total number of cache line requests to the :ref:`vL1D cache RAM <desc-tc>`.
+    unit: Percent
+  vL1D Cache BW:
+    rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
+      <desc-vmem>` instructions per unit time. The number of bytes is calculated as
+      the number of cache lines requested multiplied by the cache line size. This
+      value does not consider partial requests, so e.g., if only a single value is
+      requested in a cache line, the data movement will still be counted as a full
+      cache line. This is also presented as a percent of the peak theoretical bandwidth
+      achievable on the specific accelerator.
+    unit: GB/s
+  L2 Cache Hit Rate:
+    rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
+      over the total number of incoming cache line requests to the L2 cache.
+    unit: Percent
+  L2 Cache BW:
+    rst: The number of bytes looked up in the L2 cache per unit time. The number of
+      bytes is calculated as the number of cache lines requested multiplied by the
+      cache line size. This value does not consider partial requests, so e.g., if
+      only a single value is requested in a cache line, the data movement will still
+      be counted as a full cache line. This is also presented as a percent of the
+      peak theoretical bandwidth achievable on the specific accelerator.
+    unit: GB/s
+  L2-Fabric Read BW:
+    rst: |-
+      The number of bytes read by the L2 over the :ref:`Infinity Fabric™
+      interface <l2-fabric>` per unit time. This is also presented as a percent
+      of the peak theoretical bandwidth achievable on the specific accelerator.
+    unit: GB/s
+  L2-Fabric Write BW:
+    rst: The number of bytes sent by the L2 over the :ref:`Infinity Fabric interface
+      <l2-fabric>` by write and atomic operations per unit time. This is also presented
+      as a percent of the peak theoretical bandwidth achievable on the specific accelerator.
+    unit: GB/s
+  L2-Fabric Read Latency:
+    rst: The time-averaged number of cycles read requests spent in Infinity Fabric
+      before data was returned to the L2.
+    unit: Cycles
+  L2-Fabric Write Latency:
+    rst: The time-averaged number of cycles write requests spent in Infinity Fabric
+      before a completion acknowledgement was returned to the L2.
+    unit: Cycles
+  sL1D Cache Hit Rate:
+    rst: The percent of sL1D requests that hit on a previously loaded line in the cache.
+      Calculated as the ratio of the number of sL1D requests that hit over the number
+      of all sL1D requests.
+    unit: Percent
+  sL1D Cache BW:
+    rst: The number of bytes looked up in the sL1D cache per unit time. This is also
+      presented as a percent of the peak theoretical bandwidth achievable on the specific
+      accelerator.
+    unit: GB/s
+  L1I Hit Rate:
+    rst: The percent of L1I requests that hit on a previously loaded line in the cache.
+      Calculated as the ratio of the number of L1I requests that hit over the number
+      of all L1I requests.
+    unit: Percent
+  L1I BW:
+    rst: The number of bytes looked up in the L1I cache per unit time. This is also
+      presented as a percent of the peak theoretical bandwidth achievable on the specific
+      accelerator.
+    unit: GB/s
+  L1I Fetch Latency:
+    rst: The average number of cycles spent to fetch instructions to a :doc:`CU <compute-unit>`.
+    unit: Cycles
+Memory Chart:
+  Wavefront Occupancy:
+    rst: Wavefronts per active CU.
+    unit: Wavefronts
+  Wave Life:
+    rst: Average number of cycles executing a wave.
+    unit: Cycles per wave
+  SALU:
+    rst: Total Number of SALU (Scalar ALU) instructions issued per normalization unit.
+    unit: Instructions per normalization unit
+  SMEM:
+    rst: Total number of SMEM (Scalar Memory Read) instructions issued per normalization
+      unit.
+    unit: Instructions per normalization unit
+  VALU:
+    rst: The number of VALU (Vector ALU) instructions issued per normalization unit.
+    unit: Instructions per normalization unit
+  MFMA:
+    rst: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued per
+      normalization unit.
+    unit: Instructions per normalization unit
+  VMEM:
+    rst: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch
+      memory) per normalization unit.
+    unit: Instructions per normalization unit
+  LDS:
+    rst: The total number of LDS instructions (including, but not limited to, read/write/atomics
+      and HIP's __shfl instructions) executed per normalization unit.
+    unit: Instructions per normalization unit
+  GWS:
+    rst: Total number of GDS (global data sync) instructions issued per normalization
+      unit.
+    unit: Instructions per normalization unit
+  BR:
+    rst: Total number of BRANCH instructions issued per normalization unit.
+    unit: Instructions per normalization unit
+  Active CUs:
+    rst: Total number of active compute units (CUs) on the accelerator during the
+      kernel execution.
+    unit: CUs
+  Num CUs:
+    rst: Total number of compute units (CUs) on the accelerator.
+    unit: CUs
+  VGPR:
+    rst: |-
+      The number of architected vector general-purpose registers allocated for the
+      kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the
+      number of VGPRs requested by the compiler due to allocation granularity.
+    unit: VGPRs
+  SGPR:
+    rst: |-
+      The number of scalar general-purpose registers allocated for the kernel, see
+      :ref:`SALU <desc-salu>`. Note: this may not exactly match the number of
+      SGPRs requested by the compiler due to allocation granularity.
+    unit: SGPRs
+  LDS Allocation:
+    rst: |-
+      The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory)
+      allocated for this kernel. Note: This may also be larger than what was requested
+      at compile time due to both allocation granularity and dynamic per-dispatch
+      LDS allocations.
+    unit: Bytes per workgroup
+  Scratch Allocation:
+    rst: The number of bytes of :ref:`scratch memory <memory-spaces>` requested per
+      work-item for this kernel. Scratch memory is used for stack memory on the accelerator,
+      as well as for register spills and restores.
+    unit: Bytes per work-item
+  Wavefronts:
+    rst: The total number of wavefronts, summed over all workgroups, forming this
+      kernel launch.
+    unit: Wavefronts
+  Workgroups:
+    rst: The total number of workgroups forming this kernel launch.
+    unit: Workgroups
+  LDS Req:
+    rst: The total number of LDS instructions (including, but not limited to, read/write/atomics
+      and HIP's ``__shfl`` instructions) executed per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  LDS Util:
+    rst: Indicates what percent of the kernel's duration the :ref:`LDS <desc-lds>`
+      was actively executing instructions (including, but not limited to, load, store,
+      atomic and HIP's ``__shfl`` operations). Calculated as the ratio of the total
+      number of cycles LDS was active over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  LDS Latency:
+    rst: The average number of round-trip cycles (i.e., from issue to data-return
+      / acknowledgment) required for an LDS instruction to complete.
+    unit: Cycles
+  VL1 Rd:
+    rst: The total number of incoming read requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  VL1 Wr:
+    rst: The total number of incoming write requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  VL1 Atomic:
+    rst: The total number of incoming atomic requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  VL1 Hit:
+    rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache
+      over the total number of cache line requests to the :ref:`vL1D Cache RAM <desc-tc>`.
+    unit: Percent
+  VL1 Lat:
+    rst: Calculated as the average number of cycles that a vL1D cache line request
+      spent in the vL1D cache pipeline.
+    unit: Cycles
+  VL1 Coalesce:
+    rst: Indicates how well memory instructions were coalesced by the :ref:`address
+      processing unit <desc-ta>`, ranging from uncoalesced (25%) to fully coalesced
+      (100%). Calculated as the average number of :ref:`thread-requests <thread-requests>`
+      generated per instruction divided by the ideal number of thread-requests per
+      instruction.
+    unit: Percent
+  VL1 Stall:
+    rst: The ratio of the number of cycles where the vL1D is stalled waiting to issue
+      a request for data to the :doc:`L2 cache <l2-cache>` divided by the number of
+      cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  VL1_L2 Rd:
+    rst: The number of read requests for a vL1D cache line that were not satisfied
+      by the vL1D and must be retrieved from the :doc:`L2 Cache <l2-cache>`
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  VL1_L2 Wr:
+    rst: The number of write requests to a vL1D cache line that were sent through
+      the vL1D to the :doc:`L2 cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  VL1_L2 Atomic:
+    rst: The number of atomic requests that are sent through the vL1D to the :doc:`L2
+      cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`. This
+      includes requests for atomics with, and without return.
+    unit: Requests per normalization unit
+  sL1D Rd:
+    rst: The total number of requests, of any size or type, made to the sL1D per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Requests per normalization unit
+  sL1D Hit:
+    rst: The total number of sL1D requests that hit on a previously loaded cache line,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  sL1D Lat:
+    rst: ''
+    unit: Unknown
+  sL1D_L2 Rd:
+    rst: The total number of read requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  sL1D_L2 Wr:
+    rst: The total number of write requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`. Typically unused on current
+      CDNA accelerators.
+    unit: Requests per normalization unit
+  sL1D_L2 Atomic:
+    rst: The total number of atomic requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`. Typically unused on current
+      CDNA accelerators.
+    unit: Requests per normalization unit
+  IL1 Fetch:
+    rst: The total number of requests made to the L1I per :ref:`normalization-unit
+      <normalization-units>`.
+    unit: Requests per normalization unit
+  IL1 Hit:
+    rst: The total number of L1I requests that hit on a previously loaded cache line,
+      per :ref:`normalization-unit <normalization-units>`.
+    unit: Percent
+  IL1 Lat:
+    rst: The average number of cycles spent to fetch instructions to a :doc:`CU <compute-unit>`.
+    unit: Cycles
+  IL1_L2 Rd:
+    rst: The total number of requests across the L1I - L2 interface per normalization-unit.
+    unit: Requests per normalization unit
+  L2 Rd:
+    rst: The total number of read requests to the L2 from all clients.
+    unit: Requests per normalization unit
+  L2 Wr:
+    rst: The total number of write requests to the L2 from all clients.
+    unit: Requests per normalization unit
+  L2 Atomic:
+    rst: The total number of atomic requests (with and without return) to the L2 from
+      all clients.
+    unit: Requests per normalization unit
+  L2 Hit:
+    rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
+      over the total number of incoming cache line requests to the L2 cache.
+    unit: Percent
+  Fabric_L2 Rd:
+    rst: Number of L2 cache - Infinity Fabric read requests (either 32-byte or 64-byte)
+      summed over TCC instances per normalization unit.
+    unit: Requests per normalization unit
+  Fabric_L2 Wr:
+    rst: Number of L2 cache - Infinity Fabric write requests (either 32-byte or 64-byte)
+      summed over TCC instances per normalization unit.
+    unit: Requests per normalization unit
+  Fabric_L2 Atomic:
+    rst: Number of L2 cache - Infinity Fabric write requests (either 32-byte or 64-byte)
+      that are actually atomic requests summed over TCC instances per normalization
+      unit.
+    unit: Requests per normalization unit
+  Fabric Rd Lat:
+    rst: The time-averaged number of cycles read requests spent in Infinity Fabric
+      before data was returned to the L2.
+    unit: Cycles
+  Fabric Wr Lat:
+    rst: The time-averaged number of cycles write requests spent in Infinity Fabric
+      before a completion acknowledgement was returned to the L2.
+    unit: Cycles
+  Fabric Atomic Lat:
+    rst: The time-averaged number of cycles atomic requests spent in Infinity Fabric
+      before a completion acknowledgement (atomic without return value) or data (atomic
+      with return value) was returned to the L2.
+    unit: Cycles
+  HBM Rd:
+    rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of
+      data from the accelerator's local HBM, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  HBM Wr:
+    rst: The total number of L2 requests to Infinity Fabric to write 32B or 64B of
+      data to the accelerator's local HBM, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+Roofline Performance Rates:
+  VALU FLOPs (F16):
+    rst: |-
+      The total 16-bit floating-point operations executed per second on the :ref:`VALU
+      <desc-valu>`. This is presented with the value of the peak empirical F16 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F16 operations
+      from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GFLOPs
+  VALU FLOPs (F32):
+    rst: |-
+      The total 32-bit floating-point operations executed per second on the :ref:`VALU
+      <desc-valu>`. This is presented with the value of the peak empirical F32 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F32 operations
+      from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GFLOPs
+  VALU FLOPs (F64):
+    rst: |-
+      The total 64-bit floating-point operations executed per second on the :ref:`VALU
+      <desc-valu>`. This is presented with the value of the peak empirical F64 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F64 operations
+      from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GFLOPs
+  MFMA FLOPs (F64):
+    rst: |-
+      The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 64-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
+      measured F64 MFMA operations achievable on the specific accelerator is
+      displayed alongside for comparison.
+    unit: GFLOPs
+  MFMA FLOPs (F32):
+    rst: |-
+      The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 32-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
+      measured F32 MFMA operations achievable on the specific accelerator is
+      displayed alongside for comparison.
+    unit: GFLOPs
+  MFMA FLOPs (F16):
+    rst: |-
+      The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 16-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
+      measured F16 MFMA operations achievable on the specific accelerator is
+      displayed alongside for comparison.
+    unit: GFLOPs
+  MFMA FLOPs (BF16):
+    rst: |-
+      The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
+      operations executed per second. Note: this does not include any 16-bit brain
+      floating point operations from :ref:`VALU <desc-valu>` instructions. The
+      peak empirically measured BF16 MFMA operations achievable on the specific
+      accelerator is displayed alongside for comparison.
+    unit: GFLOPs
+  MFMA FLOPs (F8):
+    rst: |-
+      The total number of 8-bit floating point :ref:`MFMA <desc-mfma>`
+      operations executed per second. Note: this does not include any 8-bit
+      floating point operations from :ref:`VALU <desc-valu>` instructions. The
+      peak empirically measured F8 MFMA operations achievable on the specific
+      accelerator is displayed alongside for comparison. It is supported on AMD
+      Instinct MI300 series and later only.
+    unit: GFLOPs
+  MFMA IOPs (Int8):
+    rst: |-
+      The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
+      per second. Note: this does not include any 8-bit integer operations from
+      :ref:`VALU <desc-valu>` instructions. The peak empirically measured INT8 MFMA
+      operations achievable on the specific accelerator is displayed alongside
+      for comparison.
+    unit: GIOPs
+  HBM Bandwidth:
+    rst: |-
+      The total number of bytes read from and written to High-Bandwidth
+      Memory (HBM) per second. The peak empirically measured bandwidth achievable
+      on the specific accelerator is displayed alongside for comparison.
+    unit: GB/s
+  L2 Cache Bandwidth:
+    rst: The number of bytes looked up in the L2 cache per unit time. The number of
+      bytes is calculated as the number of cache lines requested multiplied by the
+      cache line size. This value does not consider partial requests, so e.g., if
+      only a single value is requested in a cache line, the data movement will still
+      be counted as a full cache line. The peak empirically measured bandwidth achievable
+      on the specific accelerator is displayed alongside for comparison.
+    unit: GB/s
+  L1 Cache Bandwidth:
+    rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
+      <desc-vmem>` instructions per unit time. The number of bytes is calculated as
+      the number of cache lines requested multiplied by the cache line size. This
+      value does not consider partial requests, so e.g., if only a single value is
+      requested in a cache line, the data movement will still be counted as a full
+      cache line. The peak empirically measured bandwidth achievable on the specific
+      accelerator is displayed alongside for comparison.
+    unit: GB/s
+  LDS Bandwidth:
+    rst: Indicates the maximum amount of bytes that could have been loaded from, stored
+      to, or atomically updated in the LDS per unit time (see :ref:`LDS Bandwidth
+      <lds-bandwidth>` example for more detail). The peak empirically measured LDS
+      bandwidth achievable on the specific accelerator is displayed alongside for
+      comparison.
+    unit: GB/s
+Roofline Plot Points:
+  AI HBM:
+    rst: |-
+      The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
+      It is the ratio of total floating-point operations (FLOPs) to total bytes
+      transferred between HBM and the L2 cache. This value is used as the x-coordinate
+      for the HBM roofline.
+    unit: FLOPs/Byte
+  AI L2:
+    rst: |-
+      The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
+      of total floating-point operations (FLOPs) to total bytes transferred between
+      the L2 cache and the L1 cache. This value is used as the x-coordinate for
+      the L2 roofline.
+    unit: FLOPs/Byte
+  AI L1:
+    rst: |-
+      The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
+      of total floating-point operations (FLOPs) to total bytes transferred between
+      the L1 cache and the processing units. This value is used as the x-coordinate
+      for the L1 roofline.
+    unit: FLOPs/Byte
+  Performance (GFLOPs):
+    rst: |-
+      The overall achieved performance, measured in GigaFLOPs
+      per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
+      operations divided by the total execution time. This value is used as the y-coordinate
+      for the kernel's point on the Roofline plot.
+    unit: GFLOP/s
+Command processor fetcher (CPF):
+  CPF Utilization:
+    rst: Percent of total cycles where the CPF was busy actively doing any work. The
+      ratio of CPF busy cycles over total cycles counted by the CPF.
+    unit: Percent
+  CPF Stall:
+    rst: Percent of CPF busy cycles where the CPF was stalled for any reason.
+    unit: Percent
+  CPF-L2 Utilization:
+    rst: Percent of total cycles counted by the CPF-:doc:`L2 <l2-cache>` interface
+      where the CPF-L2 interface was active doing any work. The ratio of CPF-L2 busy
+      cycles over total cycles counted by the CPF-L2.
+    unit: Percent
+  CPF-L2 Stall:
+    rst: Percent of CPF-:doc:`L2 <l2-cache>` busy cycles where the CPF-L2 interface
+      was stalled for any reason.
+    unit: Percent
+  CPF-UTCL1 Stall:
+    rst: Percent of CPF busy cycles where the CPF was stalled by address translation.
+    unit: Percent
+Command processor packet processor (CPC):
+  CPC Utilization:
+    rst: Percent of total cycles where the CPC was busy actively doing any work. The
+      ratio of CPC busy cycles over total cycles counted by the CPC.
+    unit: Percent
+  CPC Stall Rate:
+    rst: Percent of CPC busy cycles where the CPC was stalled for any reason.
+    unit: Percent
+  CPC Packet Decoding Utilization:
+    rst: Percent of CPC busy cycles spent decoding commands for processing.
+    unit: Percent
+  CPC-Workgroup Manager Utilization:
+    rst: Percent of CPC busy cycles spent dispatching workgroups to the :ref:`workgroup
+      manager <desc-spi>`.
+    unit: Percent
+  CPC-L2 Utilization:
+    rst: Percent of total cycles counted by the CPC-:doc:`L2 <l2-cache>` interface
+      where the CPC-L2 interface was active doing any work.
+    unit: Percent
+  CPC-UTCL1 Stall:
+    rst: Percent of CPC busy cycles where the CPC was stalled by address translation.
+    unit: Percent
+  CPC-UTCL2 Utilization:
+    rst: Percent of total cycles counted by the CPC's :doc:`L2 <l2-cache>` address
+      translation interface where the CPC was busy doing address translation work.
+    unit: Percent
+Workgroup manager utilizations:
+  Accelerator Utilization:
+    rst: The percent of cycles in the kernel where the accelerator was actively doing
+      any work.
+    unit: Percent
+  Scheduler-Pipe Utilization:
+    rst: |-
+      The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
+      in the kernel where the scheduler-pipes were actively doing any work. Note: this
+      value is expected to range between 0% and 25%. See :ref:`desc-spi`.
+    unit: Percent
+  Workgroup Manager Utilization:
+    rst: The percent of cycles in the kernel where the workgroup manager was actively
+      doing any work.
+    unit: Percent
+  Shader Engine Utilization:
+    rst: The percent of :ref:`total shader engine cycles <total-se-cycles>` in the
+      kernel where any CU in a shader-engine was actively doing any work, normalized
+      over all shader-engines. Low values (e.g., << 100%) indicate that the accelerator
+      was not fully saturated by the kernel, or a potential load-imbalance issue.
+    unit: Percent
+  SIMD Utilization:
+    rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel
+      where any :ref:`SIMD <desc-valu>` on a CU was actively doing any work, summed
+      over all CUs. Low values (less than 100%) indicate that the accelerator was
+      not fully saturated by the kernel, or a potential load-imbalance issue.
+    unit: Percent
+  Dispatched Workgroups:
+    rst: The total number of workgroups forming this kernel launch.
+    unit: Workgroups
+  Dispatched Wavefronts:
+    rst: The total number of wavefronts, summed over all workgroups, forming this
+      kernel launch.
+    unit: Wavefronts
+  VGPR Writes:
+    rst: The average number of cycles spent initializing :ref:`VGPRs <desc-valu>`
+      at wave creation.
+    unit: Cycles/wave
+  SGPR Writes:
+    rst: The average number of cycles spent initializing :ref:`SGPRs <desc-salu>`
+      at wave creation.
+    unit: Cycles/wave
+Workgroup Manager - Resource Allocation:
+  Not-scheduled Rate (Workgroup Manager):
+    rst: |-
+      The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
+      in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
+      due to a bottleneck within the workgroup manager rather than a lack of a
+      CU or :ref:`SIMD <desc-valu>` with sufficient resources. Note: this value
+      is expected to range between 0-25%. See note in :ref:`workgroup manager <desc-spi>`
+      description.
+    unit: Percent
+  Not-scheduled Rate (Scheduler-Pipe):
+    rst: |-
+      The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
+      in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
+      due to a bottleneck within the scheduler-pipes rather than a lack of a CU
+      or :ref:`SIMD <desc-valu>` with sufficient resources. Note: this value is
+      expected to range between 0-25%, see note in :ref:`workgroup manager <desc-spi>`
+      description.
+    unit: Percent
+  Scheduler-Pipe Stall Rate:
+    rst: |-
+      The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
+      in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
+      due to occupancy limitations (like a lack of a CU or :ref:`SIMD <desc-valu>`
+      with sufficient resources). Note: this value is expected to range between
+      0-25%, see note in :ref:`workgroup manager <desc-spi>` description.
+    unit: Percent
+  Scratch Stall Rate:
+    rst: The percent of :ref:`total shader-engine cycles <total-se-cycles>` in the
+      kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
+      due to lack of :ref:`private (a.k.a., scratch) memory <memory-type>` slots.
+      While this can reach up to 100%, note that the actual occupancy limitations
+      on a kernel using private memory are typically quite small (for example, less
+      than 1% of the total number of waves that can be scheduled to an accelerator).
+    unit: Percent
+  Insufficient SIMD Waveslots:
+    rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel
+      where a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to
+      lack of available :ref:`waveslots <desc-valu>`.
+    unit: Percent
+  Insufficient SIMD VGPRs:
+    rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel
+      where a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to
+      lack of available :ref:`VGPRs <desc-valu>`.
+    unit: Percent
+  Insufficient SIMD SGPRs:
+    rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel
+      where a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to
+      lack of available :ref:`SGPRs <desc-salu>`.
+    unit: Percent
+  Insufficient CU LDS:
+    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
+      a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to lack
+      of available :doc:`LDS <local-data-share>`.
+    unit: Percent
+  Insufficient CU Barriers:
+    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
+      a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to lack
+      of available :ref:`barriers <desc-barrier>`.
+    unit: Percent
+  Reached CU Workgroup Limit:
+    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
+      a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to limits
+      within the workgroup manager. This is expected to always be zero on CDNA2
+      or newer accelerators (and small for previous accelerators).
+    unit: Percent
+  Reached CU Wavefront Limit:
+    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
+      a wavefront could not be scheduled to a :doc:`CU <compute-unit>` due to limits
+      within the workgroup manager. This is expected to always be zero on CDNA2
+      or newer accelerators (and small for previous accelerators).
+    unit: Percent
+Wavefront Launch Stats:
+  Grid Size:
+    rst: The total number of work-items (or, threads) launched as a part of the kernel
+      dispatch. In HIP, this is equivalent to the total grid size multiplied by the
+      total workgroup (or, block) size.
+    unit: Work-Items
+  Workgroup Size:
+    rst: The total number of work-items (or, threads) in each workgroup (or, block)
+      launched as part of the kernel dispatch. In HIP, this is equivalent to the total
+      block size.
+    unit: Work-Items
+  Total Wavefronts:
+    rst: |-
+      The total number of wavefronts launched as part of the kernel dispatch.
+      On AMD Instinct™ CDNA™ accelerators and GCN™ GPUs, the wavefront
+      size is always 64 work-items. Thus, the total number of wavefronts should
+      be equivalent to the ceiling of grid size divided by 64.
+    unit: Wavefronts
+  Saved Wavefronts:
+    rst: The total number of wavefronts saved at a context-save. See `cwsr_enable
+      <https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr>`_.
+    unit: Wavefronts
+  Restored Wavefronts:
+    rst: The total number of wavefronts restored from a context-save. See `cwsr_enable
+      <https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr>`_.
+    unit: Wavefronts
+  VGPRs:
+    rst: |-
+      The number of architected vector general-purpose registers allocated for the
+      kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the
+      number of VGPRs requested by the compiler due to allocation granularity.
+    unit: VGPRs
+  AGPRs:
+    rst: |-
+      The number of accumulation vector general-purpose registers allocated
+      for the kernel, see :ref:`AGPRs <desc-agprs>`. Note: this may not exactly match
+      the number of AGPRs requested by the compiler due to allocation granularity.
+    unit: AGPRs
+  SGPRs:
+    rst: |-
+      The number of scalar general-purpose registers allocated for the kernel, see
+      :ref:`SALU <desc-salu>`. Note: this may not exactly match the number of
+      SGPRs requested by the compiler due to allocation granularity.
+    unit: SGPRs
+  LDS Allocation:
+    rst: |-
+      The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory)
+      allocated for this kernel. Note: This may also be larger than what was requested
+      at compile time due to both allocation granularity and dynamic per-dispatch
+      LDS allocations.
+    unit: Bytes per workgroup
+  Scratch Allocation:
+    rst: The number of bytes of :ref:`scratch memory <memory-spaces>` requested per
+      work-item for this kernel. Scratch memory is used for stack memory on the accelerator,
+      as well as for register spills and restores.
+    unit: Bytes per work-item
+Wavefront Runtime Stats:
+  Kernel Time:
+    rst: The total duration of the executed kernel.
+    unit: Nanoseconds
+  Kernel Time (Cycles):
+    rst: The total duration of the executed kernel in cycles.
+    unit: Cycles
+  Instructions per wavefront:
+    rst: The average number of instructions (of all types) executed per wavefront.
+      This is averaged over all wavefronts in a kernel dispatch.
+    unit: Instructions per wavefront
+  Wave Cycles:
+    rst: |-
+      The number of cycles a wavefront in the kernel dispatch spent resident
+      on a compute unit per :ref:`normalization unit <normalization-units>`. This is
+      averaged over all wavefronts in a kernel dispatch. Note: this should not
+      be directly compared to the kernel cycles above.
+    unit: Cycles per normalization unit
+  Dependency Wait Cycles:
+    rst: The number of cycles a wavefront in the kernel dispatch stalled waiting on
+      memory of any kind (e.g., instruction fetch, vector or scalar memory, etc.)
+      per :ref:`normalization unit <normalization-units>`. This counter is incremented
+      at every cycle by *all* wavefronts on a CU stalled at a memory operation. As
+      such, it is most useful to get a sense of how waves were spending their time,
+      rather than identification of a precise limiter because another wave could be
+      actively executing while a wave is stalled. The sum of this metric, Issue Wait
+      Cycles and Active Cycles should be equal to the total Wave Cycles metric.
+    unit: Cycles per normalization unit
+  Issue Wait Cycles:
+    rst: The number of cycles a wavefront in the kernel dispatch was unable to issue
+      an instruction for any reason (e.g., execution pipe back-pressure, arbitration
+      loss, etc.) per :ref:`normalization unit <normalization-units>`. This counter
+      is incremented at every cycle by *all* wavefronts on a CU unable to issue an
+      instruction. As such, it is most useful to get a sense of how waves were spending
+      their time, rather than identification of a precise limiter because another
+      wave could be actively executing while a wave is issue stalled. The sum of this
+      metric, Dependency Wait Cycles and Active Cycles should be equal to the total
+      Wave Cycles metric.
+    unit: Cycles per normalization unit
+  Active Cycles:
+    rst: The average number of cycles a wavefront in the kernel dispatch was actively
+      executing instructions per :ref:`normalization unit <normalization-units>`.
+      This measurement is made on a per-wavefront basis, and may include cycles that
+      another wavefront spent actively executing (on another execution unit, for example)
+      or was stalled. As such, it is most useful to get a sense of how waves were
+      spending their time, rather than identification of a precise limiter. The sum
+      of this metric, Issue Wait Cycles and Dependency Wait Cycles should be equal to
+      the total Wave Cycles metric.
+    unit: Cycles per normalization unit
+  Wavefront Occupancy:
+    rst: |-
+      The time-averaged number of wavefronts resident on the accelerator over the
+      lifetime of the kernel. Note: this metric may be inaccurate for short-running
+      kernels (less than 1ms).
+    unit: Wavefronts
+Overall Instruction Mix:
+  VALU:
+    rst: The total number of vector arithmetic logic unit (VALU) operations issued.
+      These are the workhorses of the :doc:`compute unit <compute-unit>`, and are
+      used to execute a wide range of instruction types including floating point operations,
+      non-uniform address calculations, transcendental operations, integer operations,
+      shifts, conditional evaluation, etc.
+    unit: Instructions
+  VMEM:
+    rst: The total number of vector memory operations issued. These include most loads,
+      stores and atomic operations and all accesses to :ref:`generic, global, private
+      and texture <memory-spaces>` memory.
+    unit: Instructions
+  LDS:
+    rst: The total number of LDS (also known as shared memory) operations issued.
+      These include loads, stores, atomics, and HIP's ``__shfl`` operations.
+    unit: Instructions
+  MFMA:
+    rst: The total number of matrix fused multiply-add instructions issued.
+    unit: Instructions
+  SALU:
+    rst: The total number of scalar arithmetic logic unit (SALU) operations issued.
+      Typically these are used for address calculations, literal constants, and other
+      operations that are provably uniform across a wavefront. Although scalar memory
+      (SMEM) operations are issued by the SALU, they are counted separately in this
+      section.
+    unit: Instructions
+  SMEM:
+    rst: The total number of scalar memory (SMEM) operations issued. These are typically
+      used for loading kernel arguments, base-pointers and loads from HIP's ``__constant__``
+      memory.
+    unit: Instructions
+  Branch:
+    rst: The total number of branch operations issued. These typically consist of
+      jump or branch operations and are used to implement control flow.
+    unit: Instructions
+VALU Arithmetic Instruction Mix:
+  INT32:
+    rst: The total number of instructions operating on 32-bit integer operands issued
+      to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  INT64:
+    rst: The total number of instructions operating on 64-bit integer operands issued
+      to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F16-ADD:
+    rst: The total number of addition instructions operating on 16-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F16-MUL:
+    rst: The total number of multiplication instructions operating on 16-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F16-FMA:
+    rst: The total number of fused multiply-add instructions operating on 16-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F16-Trans:
+    rst: The total number of transcendental instructions (e.g., ``sqrt``) operating
+      on 16-bit floating-point operands issued to the VALU per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F32-ADD:
+    rst: The total number of addition instructions operating on 32-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F32-MUL:
+    rst: The total number of multiplication instructions operating on 32-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F32-FMA:
+    rst: The total number of fused multiply-add instructions operating on 32-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F32-Trans:
+    rst: The total number of transcendental instructions (such as ``sqrt``) operating
+      on 32-bit floating-point operands issued to the VALU per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F64-ADD:
+    rst: The total number of addition instructions operating on 64-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F64-MUL:
+    rst: The total number of multiplication instructions operating on 64-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F64-FMA:
+    rst: The total number of fused multiply-add instructions operating on 64-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F64-Trans:
+    rst: The total number of transcendental instructions (such as ``sqrt``) operating
+      on 64-bit floating-point operands issued to the VALU per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Conversion:
+    rst: |-
+      The total number of type conversion instructions (such as converting data
+      to or from F32↔F64) issued to the VALU per :ref:`normalization unit
+      <normalization-units>`.
+    unit: Instructions per normalization unit
+VMEM Instruction Mix:
+  Global/Generic Instr:
+    rst: The total number of global & generic memory instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Read:
+    rst: The total number of global & generic memory read instructions executed on
+      all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Write:
+    rst: The total number of global & generic memory write instructions executed on
+      all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Atomic:
+    rst: The total number of global & generic memory atomic (with and without return)
+      instructions executed on all :doc:`compute units <compute-unit>` on the accelerator,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Instr:
+    rst: The total number of spill/stack memory instructions executed on all :doc:`compute
+      units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Read:
+    rst: The total number of spill/stack memory read instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Write:
+    rst: The total number of spill/stack memory write instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Atomic:
+    rst: The total number of spill/stack memory atomic (with and without return) instructions
+      executed on all :doc:`compute units <compute-unit>` on the accelerator, per
+      :ref:`normalization unit <normalization-units>`. Typically unused as these memory
+      operations are typically used to implement thread-local storage.
+    unit: Instructions per normalization unit
+MFMA Arithmetic Instruction Mix:
+  MFMA-I8:
+    rst: The total number of 8-bit integer :ref:`MFMA <desc-mfma>` instructions issued
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  MFMA-F8:
+    rst: The total number of 8-bit floating point :ref:`MFMA <desc-mfma>` instructions
+      issued per :ref:`normalization unit <normalization-units>`. This is supported
+      in AMD Instinct MI300 series and later only.
+    unit: Instructions per normalization unit
+  MFMA-F16:
+    rst: The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` instructions
+      issued per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  MFMA-BF16:
+    rst: The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>` instructions
+      issued per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  MFMA-F32:
+    rst: The total number of 32-bit floating-point :ref:`MFMA <desc-mfma>` instructions
+      issued per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  MFMA-F64:
+    rst: The total number of 64-bit floating-point :ref:`MFMA <desc-mfma>` instructions
+      issued per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+Compute Speed-of-Light:
+  VALU FLOPs:
+    rst: |-
+      The total floating-point operations executed per second on the :ref:`VALU
+      <desc-valu>`. This is also presented as a percent of the peak theoretical
+      FLOPs achievable on the specific accelerator. Note: this does not include
+      any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GFLOPs
+  VALU IOPs:
+    rst: |-
+      The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
+      This is also presented as a percent of the peak theoretical IOPs achievable
+      on the specific accelerator. Note: this does not include any integer operations
+      from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GIOPs
+  MFMA FLOPs (F8):
+    rst: ''
+    unit: Unknown
+  MFMA FLOPs (BF16):
+    rst: |-
+      The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 16-bit brain floating
+      point operations from :ref:`VALU <desc-valu>` instructions. This is also
+      presented as a percent of the peak theoretical BF16 MFMA operations achievable
+      on the specific accelerator.
+    unit: GFLOPs
+  MFMA FLOPs (F16):
+    rst: |-
+      The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 16-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F16 MFMA operations achievable on the
+      specific accelerator.
+    unit: GFLOPs
+  MFMA FLOPs (F32):
+    rst: |-
+      The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 32-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F32 MFMA operations achievable on the
+      specific accelerator.
+    unit: GFLOPs
+  MFMA FLOPs (F64):
+    rst: |-
+      The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 64-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F64 MFMA operations achievable on the
+      specific accelerator.
+    unit: GFLOPs
+  MFMA IOPs (INT8):
+    rst: |-
+      The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
+      per second. Note: this does not include any 8-bit integer operations from
+      :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
+      of the peak theoretical INT8 MFMA operations achievable on the specific accelerator.
+    unit: GIOPs
+Pipeline Statistics:
+  IPC:
+    rst: The ratio of the total number of instructions executed on the :doc:`CU <compute-unit>`
+      over the :ref:`total active CU cycles <total-active-cu-cycles>`.
+    unit: Instructions per cycle
+  IPC (Issued):
+    rst: The ratio of the total number of (non-:ref:`internal <ipc-internal-instructions>`)
+      instructions issued over the number of cycles where the :ref:`scheduler <desc-scheduler>`
+      was actively working on issuing instructions. Refer to the :ref:`Issued IPC
+      <issued-ipc>` example for further detail.
+    unit: Instructions per cycle
+  SALU Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`SALU <desc-salu>`
+      was busy executing instructions. Computed as the ratio of the total number of
+      cycles spent by the :ref:`scheduler <desc-scheduler>` issuing SALU / :ref:`SMEM
+      <desc-smem>` instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VALU Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`VALU <desc-valu>`
+      was busy executing instructions. Does not include :ref:`VMEM <desc-vmem>` operations.
+      Computed as the ratio of the total number of cycles spent by the :ref:`scheduler
+      <desc-scheduler>` issuing VALU instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VMEM Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`VMEM <desc-vmem>`
+      unit was busy executing instructions, including both global/generic and spill/scratch
+      operations (see the :ref:`VMEM instruction count metrics <ta-instruction-counts>`
+      for more detail). Does not include :ref:`VALU <desc-valu>` operations. Computed
+      as the ratio of the total number of cycles spent by the :ref:`scheduler <desc-scheduler>`
+      issuing VMEM instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  Branch Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`branch <desc-branch>`
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles spent by the :ref:`scheduler <desc-scheduler>` issuing branch instructions
+      over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VALU Active Threads:
+    rst: Indicates the average level of :ref:`divergence <desc-divergence>` within
+      a wavefront over the lifetime of the kernel. The number of work-items that were
+      active in a wavefront during execution of each :ref:`VALU <desc-valu>` instruction,
+      time-averaged over all VALU instructions run on all wavefronts in the kernel.
+    unit: Work-items
+  MFMA Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`MFMA <desc-mfma>`
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles the :ref:`MFMA <desc-mfma>` unit was busy over the :ref:`total
+      CU cycles <total-cu-cycles>`.
+    unit: Percent
+  MFMA Instruction Cycles:
+    rst: The average duration of :ref:`MFMA <desc-mfma>` instructions in this kernel
+      in cycles. Computed as the ratio of the total number of cycles the MFMA unit
+      was busy over the total number of MFMA instructions. Compare to, for example,
+      the `AMD Matrix Instruction Calculator <https://github.com/RadeonOpenCompute/amd_matrix_instruction_calculator>`_.
+    unit: Cycles per instruction
+  VMEM Latency:
+    rst: The average number of round-trip cycles (that is, from issue to data return
+      / acknowledgment) required for a VMEM instruction to complete.
+    unit: Cycles
+  SMEM Latency:
+    rst: The average number of round-trip cycles (that is, from issue to data return
+      / acknowledgment) required for a SMEM instruction to complete.
+    unit: Cycles
+Arithmetic Operations:
+  FLOPs (Total):
+    rst: The total number of floating-point operations executed on either the :ref:`VALU
+      <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
+      <normalization-units>`.
+    unit: FLOP per normalization unit
+  IOPs (Total):
+    rst: The total number of integer operations executed on either the :ref:`VALU
+      <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
+      <normalization-units>`.
+    unit: IOP per normalization unit
+  F8 OPs:
+    rst: ''
+    unit: Unknown
+  F16 OPs:
+    rst: The total number of 16-bit floating-point operations executed on either the
+      :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: FLOP per normalization unit
+  BF16 OPs:
+    rst: |-
+      The total number of 16-bit brain floating-point operations executed on
+      either the :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
+      unit <normalization-units>`. Note: on current CDNA accelerators, the VALU
+      has no native BF16 instructions.
+    unit: FLOP per normalization unit
+  F32 OPs:
+    rst: The total number of 32-bit floating-point operations executed on either the
+      :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: FLOP per normalization unit
+  F64 OPs:
+    rst: The total number of 64-bit floating-point operations executed on either the
+      :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: FLOP per normalization unit
+  INT8 OPs:
+    rst: |-
+      The total number of 8-bit integer operations executed on either the :ref:`VALU
+      <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
+      <normalization-units>`. Note: on current CDNA accelerators, the VALU has
+      no native INT8 instructions.
+    unit: IOP per normalization unit
+LDS Speed-of-Light:
+  Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`LDS <desc-lds>`
+      was actively executing instructions (including, but not limited to, load, store,
+      atomic and HIP's ``__shfl`` operations). Calculated as the ratio of the total
+      number of cycles LDS was active over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  Access Rate:
+    rst: Indicates the percentage of SIMDs in the :ref:`VALU <desc-valu>` [#lds-workload]_
+      actively issuing LDS instructions, averaged over the lifetime of the kernel.
+      Calculated as the ratio of the total number of cycles spent by the :ref:`scheduler
+      <desc-scheduler>` issuing :ref:`LDS <desc-lds>` instructions over the :ref:`total
+      CU cycles <total-cu-cycles>`.
+    unit: Percent
+  Theoretical Bandwidth Utilization:
+    rst: Indicates the maximum amount of bytes that could have been loaded from, stored
+      to, or atomically updated in the LDS, as a percentage of the theoretical peak.
+      Does *not* take into account the execution mask of the wavefront when the instruction
+      was executed. See the :ref:`LDS bandwidth example <lds-bandwidth>` for more
+      detail.
+    unit: Percent
+  Bank Conflict Rate:
+    rst: Indicates the percentage of active LDS cycles that were spent servicing bank
+      conflicts. Calculated as the ratio of LDS cycles spent servicing bank conflicts
+      over the number of LDS cycles that would have been required to move the same
+      amount of data in an uncontended access. [#lds-bank-conflict]_
+    unit: Percent
+LDS Statistics:
+  LDS Instructions:
+    rst: The total number of LDS instructions (including, but not limited to, read/write/atomics
+      and HIP's ``__shfl`` instructions) executed per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Theoretical Bandwidth:
+    rst: Indicates the maximum amount of bytes that could have been loaded from, stored
+      to, or atomically updated in the LDS divided by total duration. Does *not* take
+      into account the execution mask of the wavefront when the instruction was executed.
+      See the :ref:`LDS bandwidth example <lds-bandwidth>` for more detail.
+    unit: Gbps
+  LDS Latency:
+    rst: The average number of round-trip cycles (i.e., from issue to data-return
+      acknowledgment) required for an LDS instruction to complete.
+    unit: Cycles
+  Bank Conflicts/Access:
+    rst: The ratio of the number of cycles spent in the :ref:`LDS scheduler <desc-lds>`
+      due to bank conflicts (as determined by the conflict resolution hardware) to
+      the base number of cycles that would be spent in the LDS scheduler in a completely
+      uncontended case. This is the unnormalized form of the Bank Conflict Rate.
+    unit: Conflicts per Access
+  Index Accesses:
+    rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` over
+      all operations per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Atomic Return Cycles:
+    rst: The total number of cycles spent on LDS atomics with return per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Bank Conflict:
+    rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due
+      to bank conflicts (as determined by the conflict resolution hardware) per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Addr Conflict:
+    rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due
+      to address conflicts (as determined by the conflict resolution hardware) per
+      :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Unaligned Stall:
+    rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due
+      to stalls from non-dword aligned addresses per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Mem Violations:
+    rst: |-
+      The total number of out-of-bounds accesses made to the LDS, per :ref:`normalization
+      unit <normalization-units>`. This is unused and expected to be zero in
+      most configurations for modern CDNA™ accelerators.
+    unit: Accesses per normalization unit
+L1I Speed-of-Light:
+  Bandwidth Utilization:
+    rst: The number of bytes looked up in the L1I cache, as a percent of the peak
+      theoretical bandwidth. Calculated as the ratio of L1I requests over the :ref:`total
+      L1I cycles <total-l1i-cycles>`.
+    unit: Percent
+  Cache Hit Rate:
+    rst: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded
+      line in the cache. Calculated as the ratio of the number of L1I requests that hit
+      over the number of all L1I requests.
+    unit: Percent
+  L1I-L2 Bandwidth Utilization:
+    rst: |-
+      The percent of the peak theoretical L1I → L2 cache request bandwidth
+      achieved. Calculated as the ratio of the total number of requests from
+      the L1I to the L2 cache over the :ref:`total L1I-L2 interface cycles <total-l1i-cycles>`.
+    unit: Percent
+L1I cache accesses:
+  Req:
+    rst: The total number of requests made to the L1I per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Hits:
+    rst: The total number of L1I requests that hit on a previously loaded cache line,
+      per :ref:`normalization-unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Misses - Non Duplicated:
+    rst: The total number of L1I requests that missed on a cache line that *were not*
+      already pending due to another request, per :ref:`normalization-unit <normalization-units>`.
+      See note in :ref:`desc-l1i-sol` for more detail.
+    unit: Requests per normalization unit
+  Misses - Duplicated:
+    rst: The total number of L1I requests that missed on a cache line that *were*
+      already pending due to another request, per :ref:`normalization-unit <normalization-units>`.
+      See note in :ref:`desc-l1i-sol` for more detail.
+    unit: Requests per normalization unit
+  Cache Hit Rate:
+    rst: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded
+      line in the cache. Calculated as the ratio of the number of L1I requests that hit
+      over the number of all L1I requests.
+    unit: Percent
+  Instruction Fetch Latency:
+    rst: The average number of cycles spent to fetch instructions to a :doc:`CU <compute-unit>`.
+    unit: Cycles
+L1I <-> L2 interface:
+  L1I-L2 Bandwidth:
+    rst: Total number of bytes transferred across L1I - L2 interface divided by total
+      duration.
+    unit: Gbps
+Scalar L1D Speed-of-Light:
+  Bandwidth Utilization:
+    rst: The number of bytes looked up in the sL1D cache, as a percent of the peak
+      theoretical bandwidth. Calculated as the ratio of sL1D requests over the :ref:`total
+      sL1D cycles <total-sl1d-cycles>`.
+    unit: Percent
+  Cache Hit Rate:
+    rst: Indicates the percent of sL1D requests that hit on a previously loaded line
+      in the cache. The ratio of the number of sL1D requests that hit [#sl1d-cache]_
+      over the number of all sL1D requests.
+    unit: Percent
+  sL1D-L2 BW Utilization:
+    rst: The percentage of the peak theoretical sL1D - L2 interface bandwidth achieved.
+      Calculated as the total number of bytes read from, written to, or atomically updated
+      across the sL1D - L2 interface.
+    unit: Percent
+Scalar L1D cache accesses:
+  Req:
+    rst: The total number of requests, of any size or type, made to the sL1D per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Hits:
+    rst: The total number of sL1D requests that hit on a previously loaded cache line,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Misses - Non Duplicated:
+    rst: The total number of sL1D requests that missed on a cache line that *was not*
+      already pending due to another request, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`desc-sl1d-sol` for more detail.
+    unit: Requests per normalization unit
+  Misses- Duplicated:
+    rst: The total number of sL1D requests that missed on a cache line that *was*
+      already pending due to another request, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`desc-sl1d-sol` for more detail.
+    unit: Requests per normalization unit
+  Cache Hit Rate:
+    rst: Indicates the percent of sL1D requests that hit on a previously loaded line
+      in the cache. The ratio of the number of sL1D requests that hit [#sl1d-cache]_
+      over the number of all sL1D requests.
+    unit: Percent
+  Read Req (Total):
+    rst: The total number of sL1D read requests of any size, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Atomic Req:
+    rst: The total number of atomic requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`. Typically unused on current
+      CDNA accelerators.
+    unit: Requests per normalization unit
+  Read Req (1 DWord):
+    rst: The total number of sL1D read requests made for a single dword of data (4B),
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Read Req (2 DWord):
+    rst: The total number of sL1D read requests made for two dwords of data (8B),
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Read Req (4 DWord):
+    rst: The total number of sL1D read requests made for four dwords of data (16B),
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Read Req (8 DWord):
+    rst: The total number of sL1D read requests made for eight dwords of data (32B),
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Read Req (16 DWord):
+    rst: The total number of sL1D read requests made for sixteen dwords of data
+      (64B), per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+Scalar L1D Cache - L2 Interface:
+  sL1D-L2 BW:
+    rst: |-
+      The total number of bytes read from, written to, or atomically updated
+      across the sL1D↔:doc:`L2 <l2-cache>` interface, divided by total duration.
+      Note that sL1D writes and atomics are typically
+      unused on current CDNA accelerators, so in the majority of cases this can
+      be interpreted as an sL1D→L2 read bandwidth.
+    unit: Gbps
+  Read Req:
+    rst: The total number of read requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Write Req:
+    rst: The total number of write requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`. Typically unused on current
+      CDNA accelerators.
+    unit: Requests per normalization unit
+  Atomic Req:
+    rst: The total number of atomic requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`. Typically unused on current
+      CDNA accelerators.
+    unit: Requests per normalization unit
+  Stall Cycles:
+    rst: |-
+      The total number of cycles the sL1D↔ :doc:`L2 <l2-cache>` interface
+      was stalled, per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+Busy and stall metrics:
+  Address Processing Unit Busy:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor
+      was busy
+    unit: Percent
+  Address Stall:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor
+      was stalled from sending address requests further into the vL1D pipeline
+    unit: Percent
+  Data Stall:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor
+      was stalled from sending write/atomic data further into the vL1D pipeline
+    unit: Percent
+  Data-Processor → Address Stall:
+    rst: Percent of :ref:`total CU cycles <total-cu-cycles>` the address processor
+      was stalled waiting to send command data to the :ref:`data processor <desc-td>`
+    unit: Percent
+  Sequencer → TA Address Stall:
+    rst: ''
+    unit: Unknown
+  Sequencer → TA Command Stall:
+    rst: ''
+    unit: Unknown
+  Sequencer → TA Data Stall:
+    rst: ''
+    unit: Unknown
+Instruction counts:
+  Total Instructions:
+    rst: The total number of memory instructions executed by the address processor
+      over all compute units on the accelerator, per normalization unit.
+    unit: Instructions per normalization unit
+  Global/Generic Instructions:
+    rst: The total number of global & generic memory instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Read Instructions:
+    rst: The total number of global & generic memory read instructions executed on
+      all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Write Instructions:
+    rst: The total number of global & generic memory write instructions executed on
+      all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Atomic Instructions:
+    rst: The total number of global & generic memory atomic (with and without return)
+      instructions executed on all :doc:`compute units <compute-unit>` on the accelerator,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Instructions:
+    rst: The total number of spill/stack memory instructions executed on all :doc:`compute
+      units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Read Instructions:
+    rst: The total number of spill/stack memory read instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Write Instructions:
+    rst: The total number of spill/stack memory write instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Atomic Instructions:
+    rst: The total number of spill/stack memory atomic (with and without return) instructions
+      executed on all :doc:`compute units <compute-unit>` on the accelerator, per
+      :ref:`normalization unit <normalization-units>`. Typically unused as these memory
+      operations are typically used to implement thread-local storage.
+    unit: Instructions per normalization unit
+Spill and stack metrics:
+  Spill/Stack Total Cycles:
+    rst: The number of cycles the address processing unit spent working on spill/stack
+      instructions, per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Spill/Stack Coalesced Read:
+    rst: The number of cycles the address processing unit spent working on coalesced
+      spill/stack read instructions, per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Spill/Stack Coalesced Write:
+    rst: The number of cycles the address processing unit spent working on coalesced
+      spill/stack write instructions, per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+Vector L1 data-return path or Texture Data (TD):
+  Data-Return Busy:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return unit
+      was busy processing or waiting on data to return to the :doc:`CU <compute-unit>`.
+    unit: Percent
+  Cache RAM → Data-Return Stall:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return unit
+      was stalled on data to be returned from the :ref:`vL1D Cache RAM <desc-tc>`.
+    unit: Percent
+  Workgroup manager → Data-Return Stall:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return unit
+      was stalled by the :ref:`workgroup manager <desc-spi>` due to initialization
+      of registers as a part of launching new workgroups.
+    unit: Percent
+  Coalescable Instructions:
+    rst: The number of instructions submitted to the :ref:`data-return unit <desc-td>`
+      by the :ref:`address processor <desc-ta>` that were found to be coalescable,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Read Instructions:
+    rst: The number of read instructions submitted to the :ref:`data-return unit <desc-td>`
+      by the :ref:`address processor <desc-ta>` summed over all :doc:`compute units
+      <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+      This is expected to be the sum of global/generic and spill/stack reads in the
+      :ref:`address processor <desc-ta>`.
+    unit: Instructions per normalization unit
+  Write Instructions:
+    rst: The number of store instructions submitted to the :ref:`data-return unit
+      <desc-td>` by the :ref:`address processor <desc-ta>` summed over all :doc:`compute
+      units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+      This is expected to be the sum of global/generic and spill/stack stores counted
+      by the :ref:`vL1D cache-front-end <ta-instruction-counts>`.
+    unit: Instructions per normalization unit
+  Atomic Instructions:
+    rst: The number of atomic instructions submitted to the :ref:`data-return unit
+      <desc-td>` by the :ref:`address processor <desc-ta>` summed over all :doc:`compute
+      units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+      This is expected to be the sum of global/generic and spill/stack atomics in
+      the :ref:`address processor <desc-ta>`.
+    unit: Instructions per normalization unit
+vL1D Speed-of-Light:
+  Hit rate:
+    rst: The ratio of the number of vL1D cache line requests that hit [#vl1d-hit]_
+      in vL1D cache over the total number of cache line requests to the :ref:`vL1D
+      Cache RAM <desc-tc>`.
+    unit: Percent
+  Bandwidth Utilization:
+    rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
+      <desc-vmem>` instructions, as a percent of the peak theoretical bandwidth achievable
+      on the specific accelerator. The number of bytes is calculated as the number
+      of cache lines requested multiplied by the cache line size. This value does
+      not consider partial requests, so for instance, if only a single value is requested
+      in a cache line, the data movement will still be counted as a full cache line.
+    unit: Percent
+  Utilization:
+    rst: Indicates how busy the :ref:`vL1D Cache RAM <desc-tc>` was during the kernel
+      execution. The number of cycles where the vL1D Cache RAM is actively processing
+      any request divided by the number of cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  Coalescing:
+    rst: Indicates how well memory instructions were coalesced by the :ref:`address
+      processing unit <desc-ta>`, ranging from uncoalesced (25%) to fully coalesced
+      (100%). Calculated as the average number of :ref:`thread-requests <thread-requests>`
+      generated per instruction divided by the ideal number of thread-requests per
+      instruction.
+    unit: Percent
+vL1D cache stall metrics:
+  Stalled on L2 Data:
+    rst: The ratio of the number of cycles where the vL1D is stalled waiting for requested
+      data to return from the :doc:`L2 cache <l2-cache>` divided by the number of
+      cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  Stalled on L2 Req:
+    rst: The ratio of the number of cycles where the vL1D is stalled waiting to issue
+      a request for data to the :doc:`L2 cache <l2-cache>` divided by the number of
+      cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  Tag RAM Stall (Read):
+    rst: The ratio of the number of cycles where the vL1D is stalled due to Read requests
+      with conflicting tags being looked up concurrently, divided by the number of
+      cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  Tag RAM Stall (Write):
+    rst: The ratio of the number of cycles where the vL1D is stalled due to Write
+      requests with conflicting tags being looked up concurrently, divided by the
+      number of cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  Tag RAM Stall (Atomic):
+    rst: The ratio of the number of cycles where the vL1D is stalled due to Atomic
+      requests with conflicting tags being looked up concurrently, divided by the
+      number of cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+vL1D cache access metrics:
+  Total Req:
+    rst: The total number of incoming requests from the :ref:`address processing unit
+      <desc-ta>` after coalescing.
+    unit: Requests
+  Read Req:
+    rst: The total number of incoming read requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`
+    unit: Requests per normalization unit
+  Write Req:
+    rst: The total number of incoming write requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`
+    unit: Requests per normalization unit
+  Atomic Req:
+    rst: The total number of incoming atomic requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`
+    unit: Requests per normalization unit
+  Cache BW:
+    rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
+      <desc-vmem>` instructions divided by total duration. The number of bytes is
+      calculated as the number of cache lines requested multiplied by the cache line
+      size. This value does not consider partial requests, so for instance, if only
+      a single value is requested in a cache line, the data movement will still be
+      counted as a full cache line.
+    unit: Gbps
+  Cache Hit Rate:
+    rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache
+      over the total number of cache line requests to the :ref:`vL1D Cache RAM <desc-tc>`.
+    unit: Percent
+  Cache Accesses:
+    rst: The total number of cache line lookups in the vL1D.
+    unit: Cache lines
+  Cache Hits:
+    rst: The number of cache accesses minus the number of outgoing requests to the
+      :doc:`L2 cache <l2-cache>`, that is, the number of cache line requests serviced
+      by the :ref:`vL1D Cache RAM <desc-tc>` per :ref:`normalization unit <normalization-units>`.
+    unit: Cache lines per normalization unit
+  Invalidations:
+    rst: The number of times the vL1D was issued a write-back invalidate command during
+      the kernel's execution per :ref:`normalization unit <normalization-units>`.
+      This may be triggered by, for instance, the ``buffer_wbinvl1`` instruction.
+    unit: Invalidations per normalization unit
+  L1-L2 BW:
+    rst: The number of bytes transferred across the vL1D-L2 interface as a result
+      of :ref:`VMEM <desc-vmem>` instructions, divided by total duration. The number
+      of bytes is calculated as the number of cache lines requested multiplied by
+      the cache line size. This value does not consider partial requests, so for instance,
+      if only a single value is requested in a cache line, the data movement will
+      still be counted as a full cache line.
+    unit: Gbps
+  L1-L2 Read:
+    rst: The number of read requests for a vL1D cache line that were not satisfied
+      by the vL1D and must be retrieved from the :doc:`L2 Cache <l2-cache>`
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  L1-L2 Write:
+    rst: The number of write requests to a vL1D cache line that were sent through
+      the vL1D to the :doc:`L2 cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  L1-L2 Atomic:
+    rst: The number of atomic requests that are sent through the vL1D to the :doc:`L2
+      cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`. This
+      includes requests for atomics with, and without return.
+    unit: Requests per normalization unit
+L1D - L2 Transactions:
+  NC - Read:
+    rst: Total read requests with NC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  UC - Read:
+    rst: Total read requests with UC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  CC - Read:
+    rst: Total read requests with CC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  RW - Read:
+    rst: Total read requests with RW mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  RW - Write:
+    rst: Total write requests with RW mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  NC - Write:
+    rst: Total write requests with NC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  UC - Write:
+    rst: Total write requests with UC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  CC - Write:
+    rst: Total write requests with CC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  NC - Atomic:
+    rst: Total atomic requests with NC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  UC - Atomic:
+    rst: Total atomic requests with UC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  CC - Atomic:
+    rst: Total atomic requests with CC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  RW - Atomic:
+    rst: Total atomic requests with RW mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+L1 Unified Translation Cache (UTCL1):
+  Req:
+    rst: The number of translation requests made to the UTCL1 per normalization unit.
+    unit: Requests per normalization unit
+  Hit Ratio:
+    rst: The ratio of the number of translation requests that hit in the UTCL1 divided
+      by the total number of translation requests made to the UTCL1.
+    unit: Percent
+  Hits:
+    rst: The number of translation requests that hit in the UTCL1, and could be reused,
+      per normalization unit.
+    unit: Requests per normalization unit
+  Translation Misses:
+    rst: The total number of translation requests that missed in the UTCL1 due to
+      translation not being present in the cache, per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Permission Misses:
+    rst: |-
+      The total number of translation requests that missed in the UTCL1 due
+      to a permission error, per :ref:`normalization unit <normalization-units>`.
+      This is unused and expected to be zero in most configurations for modern
+      CDNA™ accelerators.
+    unit: Requests per normalization unit
+L1D Addr Translation Stalls: {}
+L2 Speed-of-Light:
+  Utilization:
+    rst: The ratio of the :ref:`number of cycles an L2 channel was active, summed
+      over all L2 channels on the accelerator <total-active-l2-cycles>` over the :ref:`total
+      L2 cycles <total-l2-cycles>`.
+    unit: Percent
+  Peak Bandwidth:
+    rst: The number of bytes looked up in the L2 cache, as a percent of the peak theoretical
+      bandwidth achievable on the specific accelerator. The number of bytes is calculated
+      as the number of cache lines requested multiplied by the cache line size. This
+      value does not consider partial requests, so e.g., if only a single value is
+      requested in a cache line, the data movement will still be counted as a full
+      cache line.
+    unit: Percent
+  Hit Rate:
+    rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
+      over the total number of incoming cache line requests to the L2 cache.
+    unit: Percent
+  L2-Fabric Read BW:
+    rst: The number of bytes read by the L2 over the :ref:`Infinity Fabric interface
+      <l2-fabric>` per unit time.
+    unit: GB/s
+  L2-Fabric Write and Atomic BW:
+    rst: The number of bytes sent by the L2 over the :ref:`Infinity Fabric interface
+      <l2-fabric>` by write and atomic operations per unit time.
+    unit: GB/s
+  HBM Bandwidth:
+    rst: Maximum theoretical bandwidth of the accelerator's local high-bandwidth memory
+      (HBM) per unit time. This value is calculated as the number of HBM channels
+      multiplied by the HBM channel width multiplied by the HBM clock frequency.
+    unit: GB/s
+L2-Fabric interface metrics:
+  Read BW:
+    rst: The total number of bytes read by the L2 cache from Infinity Fabric divided
+      by total duration.
+    unit: Gbps
+  HBM Read Traffic:
+    rst: The percent of read requests generated by the L2 cache that are routed to
+      the accelerator's local high-bandwidth memory (HBM). This breakdown does not
+      consider the *size* of the request (meaning that 32B and 64B requests are both
+      counted as a single request), so this metric only *approximates* the percent
+      of the L2-Fabric Read bandwidth directed to the local HBM.
+    unit: Percent
+  Remote Read Traffic:
+    rst: The percent of read requests generated by the L2 cache that are routed to
+      any memory location other than the accelerator's local high-bandwidth memory
+      (HBM) -- for example, the CPU's DRAM or a remote accelerator's HBM. This breakdown
+      does not consider the *size* of the request (meaning that 32B and 64B requests
+      are both counted as a single request), so this metric only *approximates* the
+      percent of the L2-Fabric Read bandwidth directed to a remote location.
+    unit: Percent
+  Uncached Read Traffic:
+    rst: The percent of read requests generated by the L2 cache that are reading from
+      an :ref:`uncached memory allocation <memory-type>`. Note, as described in the
+      :ref:`request flow <l2-request-flow>` section, a single 64B read request is
+      typically counted as two uncached read requests. So, it is possible for the
+      Uncached Read Traffic to reach up to 200% of the total number of read requests.
+      This breakdown does not consider the *size* of the request (i.e., 32B and 64B
+      requests are both counted as a single request), so this metric only *approximates*
+      the percent of the L2-Fabric read bandwidth directed to an uncached memory location.
+    unit: Percent
+  Write and Atomic BW:
+    rst: The total number of bytes written by the L2 over Infinity Fabric by write
+      and atomic operations divided by total duration. Note that on current CDNA accelerators,
+      such as the :ref:`MI2XX <mixxx-note>`, requests are only considered *atomic*
+      by Infinity Fabric if they are targeted at non-write-cacheable memory, for example,
+      :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached memory
+      <memory-type>` allocations on the MI2XX.
+    unit: Gbps
+  HBM Write and Atomic Traffic:
+    rst: The percent of write and atomic requests generated by the L2 cache that are
+      routed to the accelerator's local high-bandwidth memory (HBM). This breakdown
+      does not consider the *size* of the request (meaning that 32B and 64B requests
+      are both counted as a single request), so this metric only *approximates* the
+      percent of the L2-Fabric Write and Atomic bandwidth directed to the local HBM.
+      Note that on current CDNA accelerators, such as the :ref:`MI2XX <mixxx-note>`,
+      requests are only considered *atomic* by Infinity Fabric if they are targeted
+      at :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached memory
+      <memory-type>` allocations.
+    unit: Percent
+  Remote Write and Atomic Traffic:
+    rst: The percent of write and atomic requests generated by the L2 cache that are
+      routed to any memory location other than the accelerator's local high-bandwidth
+      memory (HBM) -- for example, the CPU's DRAM or a remote accelerator's HBM. This
+      breakdown does not consider the *size* of the request (meaning that 32B and 64B
+      requests are both counted as a single request), so this metric only *approximates*
+      the percent of the L2-Fabric Write and Atomic bandwidth directed to a remote location. Note
+      that on current CDNA accelerators, such as the :ref:`MI2XX <mixxx-note>`, requests
+      are only considered *atomic* by Infinity Fabric if they are targeted at :ref:`fine-grained
+      memory <memory-type>` allocations or :ref:`uncached memory <memory-type>` allocations.
+    unit: Percent
+  Atomic Traffic:
+    rst: The percent of write requests generated by the L2 cache that are atomic requests
+      to *any* memory location. This breakdown does not consider the *size* of the
+      request (meaning that 32B and 64B requests are both counted as a single request),
+      so this metric only *approximates* the percent of the L2-Fabric Write and Atomic
+      bandwidth consumed by atomic requests. Note that on current CDNA accelerators, such
+      as the :ref:`MI2XX <mixxx-note>`, requests are only considered *atomic* by Infinity
+      Fabric if they are targeted at :ref:`fine-grained memory <memory-type>` allocations
+      or :ref:`uncached memory <memory-type>` allocations.
+    unit: Percent
+  Uncached Write and Atomic Traffic:
+    rst: The percent of write and atomic requests generated by the L2 cache that are
+      targeting :ref:`uncached memory allocations <memory-type>`. This breakdown does
+      not consider the *size* of the request (meaning that 32B and 64B requests are
+      both counted as a single request), so this metric only *approximates* the percent
+      of the L2-Fabric Write and Atomic bandwidth directed to uncached memory allocations.
+    unit: Percent
+  Read Latency:
+    rst: The time-averaged number of cycles read requests spent in Infinity Fabric
+      before data was returned to the L2.
+    unit: Cycles
+  Write and Atomic Latency:
+    rst: The time-averaged number of cycles write requests spent in Infinity Fabric
+      before a completion acknowledgement was returned to the L2.
+    unit: Cycles
+  Atomic Latency:
+    rst: The time-averaged number of cycles atomic requests spent in Infinity Fabric
+      before a completion acknowledgement (atomic without return value) or data (atomic
+      with return value) was returned to the L2.
+    unit: Cycles
+L2 Cache Accesses:
+  Bandwidth:
+    rst: The number of bytes looked up in the L2 cache, divided by total duration.
+      The number of bytes is calculated as the number of cache lines requested multiplied
+      by the cache line size. This value does not consider partial requests, so for
+      example, if only a single value is requested in a cache line, the data movement
+      will still be counted as a full cache line.
+    unit: Gbps
+  Req:
+    rst: The total number of incoming requests to the L2 from all clients for all
+      request types, per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Read Req:
+    rst: The total number of read requests to the L2 from all clients.
+    unit: Requests per normalization unit
+  Write Req:
+    rst: The total number of write requests to the L2 from all clients.
+    unit: Requests per normalization unit
+  Atomic Req:
+    rst: The total number of atomic requests (with and without return) to the L2 from
+      all clients.
+    unit: Requests per normalization unit
+  Streaming Req:
+    rst: The total number of incoming requests to the L2 that are marked as *streaming*.
+      The exact meaning of this may differ depending on the targeted accelerator,
+      however on an :ref:`MI2XX <mixxx-note>` this corresponds to `non-temporal load
+      or stores <https://clang.llvm.org/docs/LanguageExtensions.html#non-temporal-load-store-builtins>`_.
+      The L2 cache attempts to evict *streaming* requests before normal requests when
+      the L2 is at capacity.
+    unit: Requests per normalization unit
+  Probe Req:
+    rst: The number of coherence probe requests made to the L2 cache from outside
+      the accelerator. On an :ref:`MI2XX <mixxx-note>`, probe requests may be generated
+      by, for example, writes to :ref:`fine-grained device <memory-type>` memory or
+      by writes to :ref:`coarse-grained <memory-type>` device memory.
+    unit: Requests per normalization unit
+  Cache Hit:
+    rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
+      over the total number of incoming cache line requests to the L2 cache.
+    unit: Percent
+  Hits:
+    rst: The total number of requests to the L2 from all clients that hit in the cache.
+      As noted in the :ref:`Speed-of-Light <l2-sol>` section, this includes hit-on-miss
+      requests.
+    unit: Requests per normalization unit
+  Misses:
+    rst: The total number of requests to the L2 from all clients that miss in the
+      cache. As noted in the :ref:`Speed-of-Light <l2-sol>` section, these do not
+      include hit-on-miss requests.
+    unit: Requests per normalization unit
+  Writeback:
+    rst: The total number of L2 cache lines written back to memory for any reason.
+      Write-backs may occur due to user code (such as HIP kernel calls to ``__threadfence_system``
+      or atomic built-ins), by the :doc:`command processor <command-processor>`'s memory
+      acquire/release fences, or for other internal hardware reasons.
+    unit: Cache lines per normalization unit
+  Writeback (Internal):
+    rst: The total number of L2 cache lines written back to memory for internal hardware
+      reasons, per :ref:`normalization unit <normalization-units>`.
+    unit: Cache lines per normalization unit
+  Writeback (vL1D Req):
+    rst: The total number of L2 cache lines written back to memory due to requests
+      initiated by the :doc:`vL1D cache <vector-l1-cache>`, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Cache lines per normalization unit
+  Evict (Internal):
+    rst: The total number of L2 cache lines evicted from the cache due to capacity
+      limits, per :ref:`normalization unit <normalization-units>`.
+    unit: Cache lines per normalization unit
+  Evict (vL1D Req):
+    rst: The total number of L2 cache lines evicted from the cache due to invalidation
+      requests initiated by the :doc:`vL1D cache <vector-l1-cache>`, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Cache lines per normalization unit
+  NC Req:
+    rst: The total number of requests to the L2 to Not-hardware-Coherent (NC) memory
+      allocations, per :ref:`normalization unit <normalization-units>`. See the :ref:`memory-type`
+      for more information.
+    unit: Requests per normalization unit
+  UC Req:
+    rst: The total number of requests to the L2 that go to Uncached (UC) memory allocations.
+      See the :ref:`memory-type` for more information.
+    unit: Requests per normalization unit
+  CC Req:
+    rst: The total number of requests to the L2 that go to Coherently Cacheable (CC)
+      memory allocations. See the :ref:`memory-type` for more information.
+    unit: Requests per normalization unit
+  RW Req:
+    rst: The total number of requests to the L2 that go to Read-Write coherent memory
+      (RW) allocations. See the :ref:`memory-type` for more information.
+    unit: Requests per normalization unit
+L2 Cache Stalls: {}
+L2 - Fabric Interface stalls:
+  Write - Credit Starvation:
+    rst: The number of cycles the L2-Fabric interface was stalled on write or atomic
+      requests to any memory location because too many write/atomic requests were
+      currently in flight, as a percent of the :ref:`total active L2 cycles <total-active-l2-cycles>`.
+    unit: Percent
+L2 - Fabric interface detailed metrics:
+  Read (32B):
+    rst: The total number of L2 requests to Infinity Fabric to read 32B of data from
+      any memory location, per :ref:`normalization unit <normalization-units>`. See
+      :ref:`l2-request-flow` for more detail. Typically unused on CDNA accelerators.
+    unit: Requests per normalization unit
+  Read (64B):
+    rst: The total number of L2 requests to Infinity Fabric to read 64B of data from
+      any memory location, per :ref:`normalization unit <normalization-units>`. See
+      :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Read (Uncached):
+    rst: The total number of L2 requests to Infinity Fabric to read :ref:`uncached
+      data <memory-type>` from any memory location, per :ref:`normalization unit <normalization-units>`.
+      64B requests for uncached data are counted as two 32B uncached data requests.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  HBM Read:
+    rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of
+      data from the accelerator's local HBM, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Remote Read:
+    rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of
+      data from any source other than the accelerator's local HBM, per :ref:`normalization
+      unit <normalization-units>`. See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Write and Atomic (32B):
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically
+      update 32B of data to any memory location, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Write and Atomic (Uncached):
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically
+      update 32B or 64B of :ref:`uncached data <memory-type>`, per :ref:`normalization
+      unit <normalization-units>`. See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Write and Atomic (64B):
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically
+      update 64B of data in any memory location, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  HBM Write and Atomic:
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically
+      update 32B or 64B of data in the accelerator's local HBM, per :ref:`normalization
+      unit <normalization-units>`. See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Remote Write and Atomic:
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically
+      update 32B or 64B of data in any memory location other than the accelerator's
+      local HBM, per :ref:`normalization unit <normalization-units>`. See :ref:`l2-request-flow`
+      for more detail.
+    unit: Requests per normalization unit
+  Atomic:
+    rst: The total number of L2 requests to Infinity Fabric to atomically update 32B
+      or 64B of data in any memory location, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail. Note that on current CDNA accelerators,
+      such as the :ref:`MI2XX <mixxx-note>`, requests are only considered *atomic*
+      by Infinity Fabric if they are targeted at non-write-cacheable memory, such
+      as :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached memory
+      <memory-type>` allocations on the MI2XX.
+    unit: Requests per normalization unit
+Aggregate Stats (All channels):
+  L2 Cache Hit Rate:
+    rst: The total number of requests to the L2 from all clients that hit in the cache.
+      As noted in the :ref:`Speed-of-Light <l2-sol>` section, this includes hit-on-miss
+      requests.
+    unit: Percent
+L2 Cache Hit Rate (pct):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2 Requests (per normUnit):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Requests (per normUnit):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Read Latency (Cycles):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Write and Atomic Latency (Cycles):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Atomic Latency (Cycles):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Read Stall (Cycles per normUnit):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Write and Atomic Stall (Cycles per normUnit):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric (128B read requests per normUnit):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
diff --git a/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx942_metrics_description.yaml b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx942_metrics_description.yaml
new file mode 100644
index 0000000000..4d27ec667a
--- /dev/null
+++ b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx942_metrics_description.yaml
@@ -0,0 +1,2043 @@
+System Speed-of-Light:
+  VALU FLOPs:
+    rst: |-
+      The total floating-point operations executed per second on the :ref:`VALU
+      <desc-valu>`. This is also presented as a percent of the peak theoretical
+      FLOPs achievable on the specific accelerator. Note: this does not include
+      any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GFLOPs
+  VALU IOPs:
+    rst: |-
+      The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
+      This is also presented as a percent of the peak theoretical IOPs achievable
+      on the specific accelerator. Note: this does not include any integer operations
+      from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GIOPs
+  MFMA FLOPs (F8):
+    rst: |-
+      The total number of 8-bit floating point :ref:`MFMA <desc-mfma>`
+      operations executed per second. Note: this does not include any 8-bit
+      floating point operations from :ref:`VALU <desc-valu>` instructions. This
+      is also presented as a percent of the peak theoretical F8 MFMA operations
+      achievable on the specific accelerator. It is supported on AMD Instinct MI300
+      series and later only.
+    unit: GFLOPs
+  MFMA FLOPs (BF16):
+    rst: |-
+      The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
+      operations executed per second. Note: this does not include any 16-bit brain
+      floating point operations from :ref:`VALU <desc-valu>` instructions. This
+      is also presented as a percent of the peak theoretical BF16 MFMA operations
+      achievable on the specific accelerator.
+    unit: GFLOPs
+  MFMA FLOPs (F16):
+    rst: |-
+      The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 16-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F16 MFMA operations achievable on the
+      specific accelerator.
+    unit: GFLOPs
+  MFMA FLOPs (F32):
+    rst: |-
+      The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 32-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F32 MFMA operations achievable on the
+      specific accelerator.
+    unit: GFLOPs
+  MFMA FLOPs (F64):
+    rst: |-
+      The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 64-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F64 MFMA operations achievable on the
+      specific accelerator.
+    unit: GFLOPs
+  MFMA IOPs (Int8):
+    rst: |-
+      The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
+      per second. Note: this does not include any 8-bit integer operations from
+      :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
+      of the peak theoretical INT8 MFMA operations achievable on the specific accelerator.
+    unit: GIOPs
+  Active CUs:
+    rst: Total number of active compute units (CUs) on the accelerator during the
+      kernel execution.
+    unit: Number
+  SALU Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`SALU <desc-salu>`
+      was busy executing instructions. Computed as the ratio of the total number of
+      cycles spent by the :ref:`scheduler <desc-scheduler>` issuing SALU / :ref:`SMEM
+      <desc-smem>` instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VALU Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`VALU <desc-valu>`
+      was busy executing instructions. Does not include :ref:`VMEM <desc-vmem>` operations.
+      Computed as the ratio of the total number of cycles spent by the :ref:`scheduler
+      <desc-scheduler>` issuing VALU instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  MFMA Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`MFMA <desc-mfma>`
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles the :ref:`MFMA <desc-mfma>` unit was busy over the :ref:`total
+      CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VMEM Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`VMEM <desc-vmem>`
+      unit was busy executing instructions, including both global/generic and spill/scratch
+      operations (see the :ref:`VMEM instruction count metrics <ta-instruction-counts>`
+      for more detail). Does not include :ref:`VALU <desc-valu>` operations. Computed
+      as the ratio of the total number of cycles spent by the :ref:`scheduler <desc-scheduler>`
+      issuing VMEM instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  Branch Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`branch <desc-branch>`
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles spent by the :ref:`scheduler <desc-scheduler>` issuing branch instructions
+      over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VALU Active Threads:
+    rst: Indicates the average level of :ref:`divergence <desc-divergence>` within
+      a wavefront over the lifetime of the kernel. The number of work-items that were
+      active in a wavefront during execution of each :ref:`VALU <desc-valu>` instruction,
+      time-averaged over all VALU instructions run on all wavefronts in the kernel.
+    unit: Work-items
+  IPC:
+    rst: The ratio of the total number of instructions executed on the :doc:`CU <compute-unit>`
+      over the :ref:`total active CU cycles <total-active-cu-cycles>`.
+    unit: Instructions per-cycle
+  Wavefront Occupancy:
+    rst: |-
+      The time-averaged number of wavefronts resident on the accelerator over
+      the lifetime of the kernel. Note: this metric may be inaccurate for short-running
+      kernels (less than 1ms). This is also presented as a percent of the peak theoretical
+      occupancy achievable on the specific accelerator.
+    unit: Wavefronts
+  Theoretical LDS Bandwidth:
+    rst: Indicates the maximum amount of bytes that could have been loaded from, stored
+      to, or atomically updated in the LDS per unit time (see :ref:`LDS Bandwidth
+      <lds-bandwidth>` example for more detail). This is also presented as a percent
+      of the peak theoretical bandwidth achievable on the specific accelerator.
+    unit: GB/s
+  LDS Bank Conflicts/Access:
+    rst: The ratio of the number of cycles spent in the :doc:`LDS scheduler <local-data-share>`
+      due to bank conflicts (as determined by the conflict resolution hardware) to
+      the base number of cycles that would be spent in the LDS scheduler in a completely
+      uncontended case. This is also presented in normalized form (i.e., the Bank
+      Conflict Rate).
+    unit: Conflicts/Access
+  vL1D Cache Hit Rate:
+    rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache
+      over the total number of cache line requests to the :ref:`vL1D cache RAM <desc-tc>`.
+    unit: Percent
+  vL1D Cache BW:
+    rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
+      <desc-vmem>` instructions per unit time. The number of bytes is calculated as
+      the number of cache lines requested multiplied by the cache line size. This
+      value does not consider partial requests, so e.g., if only a single value is
+      requested in a cache line, the data movement will still be counted as a full
+      cache line. This is also presented as a percent of the peak theoretical bandwidth
+      achievable on the specific accelerator.
+    unit: GB/s
+  L2 Cache Hit Rate:
+    rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
+      over the total number of incoming cache line requests to the L2 cache.
+    unit: Percent
+  L2 Cache BW:
+    rst: The number of bytes looked up in the L2 cache per unit time. The number of
+      bytes is calculated as the number of cache lines requested multiplied by the
+      cache line size. This value does not consider partial requests, so e.g., if
+      only a single value is requested in a cache line, the data movement will still
+      be counted as a full cache line. This is also presented as a percent of the
+      peak theoretical bandwidth achievable on the specific accelerator.
+    unit: GB/s
+  L2-Fabric Read BW:
+    rst: |-
+      The number of bytes read by the L2 over the :ref:`Infinity Fabric™
+      interface <l2-fabric>` per unit time. This is also presented as a percent
+      of the peak theoretical bandwidth achievable on the specific accelerator.
+    unit: GB/s
+  L2-Fabric Write BW:
+    rst: The number of bytes sent by the L2 over the :ref:`Infinity Fabric interface
+      <l2-fabric>` by write and atomic operations per unit time. This is also presented
+      as a percent of the peak theoretical bandwidth achievable on the specific accelerator.
+    unit: GB/s
+  L2-Fabric Read Latency:
+    rst: The time-averaged number of cycles read requests spent in Infinity Fabric
+      before data was returned to the L2.
+    unit: Cycles
+  L2-Fabric Write Latency:
+    rst: The time-averaged number of cycles write requests spent in Infinity Fabric
+      before a completion acknowledgement was returned to the L2.
+    unit: Cycles
+  sL1D Cache Hit Rate:
+    rst: The percent of sL1D requests that hit on a previously loaded line in the cache.
+      Calculated as the ratio of the number of sL1D requests that hit over the number
+      of all sL1D requests.
+    unit: Percent
+  sL1D Cache BW:
+    rst: The number of bytes looked up in the sL1D cache per unit time. This is also
+      presented as a percent of the peak theoretical bandwidth achievable on the specific
+      accelerator.
+    unit: GB/s
+  L1I Hit Rate:
+    rst: The percent of L1I requests that hit on a previously loaded line in the cache.
+      Calculated as the ratio of the number of L1I requests that hit over the number
+      of all L1I requests.
+    unit: Percent
+  L1I BW:
+    rst: The number of bytes looked up in the L1I cache per unit time. This is also
+      presented as a percent of the peak theoretical bandwidth achievable on the specific
+      accelerator.
+    unit: GB/s
+  L1I Fetch Latency:
+    rst: The average number of cycles spent to fetch instructions to a :doc:`CU <compute-unit>`.
+    unit: Cycles
+Memory Chart:
+  Wavefront Occupancy:
+    rst: Wavefronts per active CU.
+    unit: Wavefronts
+  Wave Life:
+    rst: Average number of cycles executing a wave.
+    unit: Cycles per wave
+  SALU:
+    rst: Total Number of SALU (Scalar ALU) instructions issued per normalization unit.
+    unit: Instructions per normalization unit
+  SMEM:
+    rst: Total number of SMEM (Scalar Memory Read) instructions issued per normalization
+      unit.
+    unit: Instructions per normalization unit
+  VALU:
+    rst: The number of VALU (Vector ALU) instructions issued per normalization unit.
+    unit: Instructions per normalization unit
+  MFMA:
+    rst: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued per
+      normalization unit.
+    unit: Instructions per normalization unit
+  VMEM:
+    rst: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch
+      memory) per normalization unit.
+    unit: Instructions per normalization unit
+  LDS:
+    rst: The total number of LDS instructions (including, but not limited to, read/write/atomics
+      and HIP's __shfl instructions) executed per normalization unit.
+    unit: Instructions per normalization unit
+  GWS:
+    rst: Total number of GDS (global data sync) instructions issued per normalization
+      unit.
+    unit: Instructions per normalization unit
+  BR:
+    rst: Total number of BRANCH instructions issued per normalization unit.
+    unit: Instructions per normalization unit
+  Active CUs:
+    rst: Total number of active compute units (CUs) on the accelerator during the
+      kernel execution.
+    unit: CUs
+  Num CUs:
+    rst: Total number of compute units (CUs) on the accelerator.
+    unit: CUs
+  VGPR:
+    rst: |-
+      The number of architected vector general-purpose registers allocated for the
+      kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the
+      number of VGPRs requested by the compiler due to allocation granularity.
+    unit: VGPRs
+  SGPR:
+    rst: |-
+      The number of scalar general-purpose registers allocated for the kernel, see
+      :ref:`SALU <desc-salu>`. Note: this may not exactly match the number of
+      SGPRs requested by the compiler due to allocation granularity.
+    unit: SGPRs
+  LDS Allocation:
+    rst: |-
+      The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory)
+      allocated for this kernel. Note: This may also be larger than what was requested
+      at compile time due to both allocation granularity and dynamic per-dispatch
+      LDS allocations.
+    unit: Bytes per workgroup
+  Scratch Allocation:
+    rst: The number of bytes of :ref:`scratch memory <memory-spaces>` requested per
+      work-item for this kernel. Scratch memory is used for stack memory on the accelerator,
+      as well as for register spills and restores.
+    unit: Bytes per work-item
+  Wavefronts:
+    rst: The total number of wavefronts, summed over all workgroups, forming this
+      kernel launch.
+    unit: Wavefronts
+  Workgroups:
+    rst: The total number of workgroups forming this kernel launch.
+    unit: Workgroups
+  LDS Req:
+    rst: The total number of LDS instructions (including, but not limited to, read/write/atomics
+      and HIP's ``__shfl`` instructions) executed per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  LDS Util:
+    rst: Indicates what percent of the kernel's duration the :ref:`LDS <desc-lds>`
+      was actively executing instructions (including, but not limited to, load, store,
+      atomic and HIP's ``__shfl`` operations). Calculated as the ratio of the total
+      number of cycles LDS was active over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  LDS Latency:
+    rst: The average number of round-trip cycles (i.e., from issue to data-return
+      / acknowledgment) required for an LDS instruction to complete.
+    unit: Cycles
+  VL1 Rd:
+    rst: The total number of incoming read requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  VL1 Wr:
+    rst: The total number of incoming write requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  VL1 Atomic:
+    rst: The total number of incoming atomic requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  VL1 Hit:
+    rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache
+      over the total number of cache line requests to the :ref:`vL1D Cache RAM <desc-tc>`.
+    unit: Percent
+  VL1 Lat:
+    rst: Calculated as the average number of cycles that a vL1D cache line request
+      spent in the vL1D cache pipeline.
+    unit: Cycles
+  VL1 Coalesce:
+    rst: Indicates how well memory instructions were coalesced by the :ref:`address
+      processing unit <desc-ta>`, ranging from uncoalesced (25%) to fully coalesced
+      (100%). Calculated as the average number of :ref:`thread-requests <thread-requests>`
+      generated per instruction divided by the ideal number of thread-requests per
+      instruction.
+    unit: Percent
+  VL1 Stall:
+    rst: The ratio of the number of cycles where the vL1D is stalled waiting to issue
+      a request for data to the :doc:`L2 cache <l2-cache>` divided by the number of
+      cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  VL1_L2 Rd:
+    rst: The number of read requests for a vL1D cache line that were not satisfied
+      by the vL1D and must be retrieved from the :doc:`L2 Cache <l2-cache>`
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  VL1_L2 Wr:
+    rst: The number of write requests to a vL1D cache line that were sent through
+      the vL1D to the :doc:`L2 cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  VL1_L2 Atomic:
+    rst: The number of atomic requests that are sent through the vL1D to the :doc:`L2
+      cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`. This
+      includes requests for atomics with, and without return.
+    unit: Requests per normalization unit
+  sL1D Rd:
+    rst: The total number of requests, of any size or type, made to the sL1D per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Requests per normalization unit
+  sL1D Hit:
+    rst: The total number of sL1D requests that hit on a previously loaded cache line,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  sL1D Lat:
+    rst: ''
+    unit: Unknown
+  sL1D_L2 Rd:
+    rst: The total number of read requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  sL1D_L2 Wr:
+    rst: The total number of write requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`. Typically unused on current
+      CDNA accelerators.
+    unit: Requests per normalization unit
+  sL1D_L2 Atomic:
+    rst: The total number of atomic requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`. Typically unused on current
+      CDNA accelerators.
+    unit: Requests per normalization unit
+  IL1 Fetch:
+    rst: The total number of requests made to the L1I per :ref:`normalization-unit
+      <normalization-units>`.
+    unit: Requests per normalization unit
+  IL1 Hit:
+    rst: The total number of L1I requests that hit on a previously loaded cache line,
+      per :ref:`normalization-unit <normalization-units>`.
+    unit: Percent
+  IL1 Lat:
+    rst: The average number of cycles spent to fetch instructions to a :doc:`CU <compute-unit>`.
+    unit: Cycles
+  IL1_L2 Rd:
+    rst: The total number of requests across the L1I - L2 interface per normalization-unit.
+    unit: Requests per normalization unit
+  L2 Rd:
+    rst: The total number of read requests to the L2 from all clients.
+    unit: Requests per normalization unit
+  L2 Wr:
+    rst: The total number of write requests to the L2 from all clients.
+    unit: Requests per normalization unit
+  L2 Atomic:
+    rst: The total number of atomic requests (with and without return) to the L2 from
+      all clients.
+    unit: Requests per normalization unit
+  L2 Hit:
+    rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
+      over the total number of incoming cache line requests to the L2 cache.
+    unit: Percent
+  Fabric_L2 Rd:
+    rst: Number of L2 cache - Infinity Fabric read requests (either 32-byte or 64-byte)
+      summed over TCC instances per normalization unit.
+    unit: Requests per normalization unit
+  Fabric_L2 Wr:
+    rst: Number of L2 cache - Infinity Fabric write requests (either 32-byte or 64-byte)
+      summed over TCC instances per normalization unit.
+    unit: Requests per normalization unit
+  Fabric_L2 Atomic:
+    rst: Number of L2 cache - Infinity Fabric write requests (either 32-byte or 64-byte)
+      that are actually atomic requests summed over TCC instances per normalization
+      unit.
+    unit: Requests per normalization unit
+  Fabric Rd Lat:
+    rst: The time-averaged number of cycles read requests spent in Infinity Fabric
+      before data was returned to the L2.
+    unit: Cycles
+  Fabric Wr Lat:
+    rst: The time-averaged number of cycles write requests spent in Infinity Fabric
+      before a completion acknowledgement was returned to the L2.
+    unit: Cycles
+  Fabric Atomic Lat:
+    rst: The time-averaged number of cycles atomic requests spent in Infinity Fabric
+      before a completion acknowledgement (atomic without return value) or data (atomic
+      with return value) was returned to the L2.
+    unit: Cycles
+  HBM Rd:
+    rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of
+      data from the accelerator's local HBM, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  HBM Wr:
+    rst: The total number of L2 requests to Infinity Fabric to write 32B or 64B of
+      data to the accelerator's local HBM, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+Roofline Performance Rates:
+  VALU FLOPs (F16):
+    rst: |-
+      The total 16-bit floating-point operations executed per second on the :ref:`VALU
+      <desc-valu>`. This is presented with the value of the peak empirical F16 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F16 operations
+      from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GFLOPs
+  VALU FLOPs (F32):
+    rst: |-
+      The total 32-bit floating-point operations executed per second on the :ref:`VALU
+      <desc-valu>`. This is presented with the value of the peak empirical F32 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F32 operations
+      from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GFLOPs
+  VALU FLOPs (F64):
+    rst: |-
+      The total 64-bit floating-point operations executed per second on the :ref:`VALU
+      <desc-valu>`. This is presented with the value of the peak empirical F64 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F64 operations
+      from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GFLOPs
+  MFMA FLOPs (F64):
+    rst: |-
+      The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 64-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
+      measured F64 MFMA operations achievable on the specific accelerator is
+      displayed alongside for comparison.
+    unit: GFLOPs
+  MFMA FLOPs (F32):
+    rst: |-
+      The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 32-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
+      measured F32 MFMA operations achievable on the specific accelerator is
+      displayed alongside for comparison.
+    unit: GFLOPs
+  MFMA FLOPs (F16):
+    rst: |-
+      The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 16-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
+      measured F16 MFMA operations achievable on the specific accelerator is
+      displayed alongside for comparison.
+    unit: GFLOPs
+  MFMA FLOPs (BF16):
+    rst: |-
+      The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
+      operations executed per second. Note: this does not include any 16-bit brain
+      floating point operations from :ref:`VALU <desc-valu>` instructions. The
+      peak empirically measured BF16 MFMA operations achievable on the specific
+      accelerator is displayed alongside for comparison.
+    unit: GFLOPs
+  MFMA FLOPs (F8):
+    rst: |-
+      The total number of 8-bit floating point :ref:`MFMA <desc-mfma>`
+      operations executed per second. Note: this does not include any 8-bit
+      floating point operations from :ref:`VALU <desc-valu>` instructions. The
+      peak empirically measured F8 MFMA operations achievable on the specific
+      accelerator is displayed alongside for comparison. It is supported on AMD
+      Instinct MI300 series and later only.
+    unit: GFLOPs
+  MFMA IOPs (Int8):
+    rst: |-
+      The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
+      per second. Note: this does not include any 8-bit integer operations from
+      :ref:`VALU <desc-valu>` instructions. The peak empirically measured INT8 MFMA
+      operations achievable on the specific accelerator is displayed alongside
+      for comparison.
+    unit: GIOPs
+  HBM Bandwidth:
+    rst: |-
+      The total number of bytes read from and written to High-Bandwidth
+      Memory (HBM) per second. The peak empirically measured bandwidth achievable
+      on the specific accelerator is displayed alongside for comparison.
+    unit: GB/s
+  L2 Cache Bandwidth:
+    rst: The number of bytes looked up in the L2 cache per unit time. The number of
+      bytes is calculated as the number of cache lines requested multiplied by the
+      cache line size. This value does not consider partial requests, so e.g., if
+      only a single value is requested in a cache line, the data movement will still
+      be counted as a full cache line. The peak empirically measured bandwidth achievable
+      on the specific accelerator is displayed alongside for comparison.
+    unit: GB/s
+  L1 Cache Bandwidth:
+    rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
+      <desc-vmem>` instructions per unit time. The number of bytes is calculated as
+      the number of cache lines requested multiplied by the cache line size. This
+      value does not consider partial requests, so e.g., if only a single value is
+      requested in a cache line, the data movement will still be counted as a full
+      cache line. The peak empirically measured bandwidth achievable on the specific
+      accelerator is displayed alongside for comparison.
+    unit: GB/s
+  LDS Bandwidth:
+    rst: Indicates the maximum amount of bytes that could have been loaded from, stored
+      to, or atomically updated in the LDS per unit time (see :ref:`LDS Bandwidth
+      <lds-bandwidth>` example for more detail). The peak empirically measured LDS
+      bandwidth achievable on the specific accelerator is displayed alongside for
+      comparison.
+    unit: GB/s
+Roofline Plot Points:
+  AI HBM:
+    rst: |-
+      The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
+      It is the ratio of total floating-point operations (FLOPs) to total bytes
+      transferred between HBM and the L2 cache. This value is used as the x-coordinate
+      for the HBM roofline.
+    unit: FLOPs/Byte
+  AI L2:
+    rst: |-
+      The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
+      of total floating-point operations (FLOPs) to total bytes transferred between
+      the L2 cache and the L1 cache. This value is used as the x-coordinate for
+      the L2 roofline.
+    unit: FLOPs/Byte
+  AI L1:
+    rst: |-
+      The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
+      of total floating-point operations (FLOPs) to total bytes transferred between
+      the L1 cache and the processing units. This value is used as the x-coordinate
+      for the L1 roofline.
+    unit: FLOPs/Byte
+  Performance (GFLOPs):
+    rst: |-
+      The overall achieved performance, measured in GigaFLOPs
+      per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
+      operations divided by the total execution time. This value is used as the y-coordinate
+      for the kernel's point on the Roofline plot.
+    unit: GFLOP/s
+Command processor fetcher (CPF):
+  CPF Utilization:
+    rst: Percent of total cycles where the CPF was busy actively doing any work. The
+      ratio of CPF busy cycles over total cycles counted by the CPF.
+    unit: Percent
+  CPF Stall:
+    rst: Percent of CPF busy cycles where the CPF was stalled for any reason.
+    unit: Percent
+  CPF-L2 Utilization:
+    rst: Percent of total cycles counted by the CPF-:doc:`L2 <l2-cache>` interface
+      where the CPF-L2 interface was active doing any work. The ratio of CPF-L2 busy
+      cycles over total cycles counted by the CPF-L2.
+    unit: Percent
+  CPF-L2 Stall:
+    rst: Percent of CPF-:doc:`L2 <l2-cache>` busy cycles where the CPF-L2 interface
+      was stalled for any reason.
+    unit: Percent
+  CPF-UTCL1 Stall:
+    rst: Percent of CPF busy cycles where the CPF was stalled by address translation.
+    unit: Percent
+Command processor packet processor (CPC):
+  CPC Utilization:
+    rst: Percent of total cycles where the CPC was busy actively doing any work. The
+      ratio of CPC busy cycles over total cycles counted by the CPC.
+    unit: Percent
+  CPC Stall Rate:
+    rst: Percent of CPC busy cycles where the CPC was stalled for any reason.
+    unit: Percent
+  CPC Packet Decoding Utilization:
+    rst: Percent of CPC busy cycles spent decoding commands for processing.
+    unit: Percent
+  CPC-Workgroup Manager Utilization:
+    rst: Percent of CPC busy cycles spent dispatching workgroups to the :ref:`workgroup
+      manager <desc-spi>`.
+    unit: Percent
+  CPC-L2 Utilization:
+    rst: Percent of total cycles counted by the CPC-:doc:`L2 <l2-cache>` interface
+      where the CPC-L2 interface was active doing any work.
+    unit: Percent
+  CPC-UTCL1 Stall:
+    rst: Percent of CPC busy cycles where the CPC was stalled by address translation.
+    unit: Percent
+  CPC-UTCL2 Utilization:
+    rst: Percent of total cycles counted by the CPC's :doc:`L2 <l2-cache>` address
+      translation interface where the CPC was busy doing address translation work.
+    unit: Percent
+Workgroup manager utilizations:
+  Accelerator Utilization:
+    rst: The percent of cycles in the kernel where the accelerator was actively doing
+      any work.
+    unit: Percent
+  Scheduler-Pipe Utilization:
+    rst: |-
+      The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
+      in the kernel where the scheduler-pipes were actively doing any work. Note: this
+      value is expected to range between 0% and 25%. See :ref:`desc-spi`.
+    unit: Percent
+  Workgroup Manager Utilization:
+    rst: The percent of cycles in the kernel where the workgroup manager was actively
+      doing any work.
+    unit: Percent
+  Shader Engine Utilization:
+    rst: The percent of :ref:`total shader engine cycles <total-se-cycles>` in the
+      kernel where any CU in a shader-engine was actively doing any work, normalized
+      over all shader-engines. Low values (e.g., << 100%) indicate that the accelerator
+      was not fully saturated by the kernel, or a potential load-imbalance issue.
+    unit: Percent
+  SIMD Utilization:
+    rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel
+      where any :ref:`SIMD <desc-valu>` on a CU was actively doing any work, summed
+      over all CUs. Low values (less than 100%) indicate that the accelerator was
+      not fully saturated by the kernel, or a potential load-imbalance issue.
+    unit: Percent
+  Dispatched Workgroups:
+    rst: The total number of workgroups forming this kernel launch.
+    unit: Workgroups
+  Dispatched Wavefronts:
+    rst: The total number of wavefronts, summed over all workgroups, forming this
+      kernel launch.
+    unit: Wavefronts
+  VGPR Writes:
+    rst: The average number of cycles spent initializing :ref:`VGPRs <desc-valu>`
+      at wave creation.
+    unit: Cycles/wave
+  SGPR Writes:
+    rst: The average number of cycles spent initializing :ref:`SGPRs <desc-salu>`
+      at wave creation.
+    unit: Cycles/wave
+Workgroup Manager - Resource Allocation:
+  Not-scheduled Rate (Workgroup Manager):
+    rst: |-
+      The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
+      in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
+      due to a bottleneck within the workgroup manager rather than a lack of a
+      CU or :ref:`SIMD <desc-valu>` with sufficient resources. Note: this value
+      is expected to range between 0-25%. See note in :ref:`workgroup manager <desc-spi>`
+      description.
+    unit: Percent
+  Not-scheduled Rate (Scheduler-Pipe):
+    rst: |-
+      The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
+      in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
+      due to a bottleneck within the scheduler-pipes rather than a lack of a CU
+      or :ref:`SIMD <desc-valu>` with sufficient resources. Note: this value is
+      expected to range between 0-25%, see note in :ref:`workgroup manager <desc-spi>`
+      description.
+    unit: Percent
+  Scheduler-Pipe Stall Rate:
+    rst: |-
+      The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
+      in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
+      due to occupancy limitations (like a lack of a CU or :ref:`SIMD <desc-valu>`
+      with sufficient resources). Note: this value is expected to range between
+      0-25%, see note in :ref:`workgroup manager <desc-spi>` description.
+    unit: Percent
+  Scratch Stall Rate:
+    rst: The percent of :ref:`total shader-engine cycles <total-se-cycles>` in the
+      kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
+      due to lack of :ref:`private (a.k.a., scratch) memory <memory-type>` slots.
+      While this can reach up to 100%, note that the actual occupancy limitations
+      on a kernel using private memory are typically quite small (for example, less
+      than 1% of the total number of waves that can be scheduled to an accelerator).
+    unit: Percent
+  Insufficient SIMD Waveslots:
+    rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel
+      where a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to
+      lack of available :ref:`waveslots <desc-valu>`.
+    unit: Percent
+  Insufficient SIMD VGPRs:
+    rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel
+      where a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to
+      lack of available :ref:`VGPRs <desc-valu>`.
+    unit: Percent
+  Insufficient SIMD SGPRs:
+    rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel
+      where a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to
+      lack of available :ref:`SGPRs <desc-salu>`.
+    unit: Percent
+  Insufficient CU LDS:
+    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
+      a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to lack
+      of available :doc:`LDS <local-data-share>`.
+    unit: Percent
+  Insufficient CU Barriers:
+    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
+      a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to lack
+      of available :ref:`barriers <desc-barrier>`.
+    unit: Percent
+  Reached CU Workgroup Limit:
+    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
+      a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to limits
+      within the workgroup manager. This is expected to always be zero on CDNA2
+      or newer accelerators (and small for previous accelerators).
+    unit: Percent
+  Reached CU Wavefront Limit:
+    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
+      a wavefront could not be scheduled to a :doc:`CU <compute-unit>` due to limits
+      within the workgroup manager. This is expected to always be zero on CDNA2
+      or newer accelerators (and small for previous accelerators).
+    unit: Percent
+Wavefront Launch Stats:
+  Grid Size:
+    rst: The total number of work-items (or, threads) launched as a part of the kernel
+      dispatch. In HIP, this is equivalent to the total grid size multiplied by the
+      total workgroup (or, block) size.
+    unit: Work-Items
+  Workgroup Size:
+    rst: The total number of work-items (or, threads) in each workgroup (or, block)
+      launched as part of the kernel dispatch. In HIP, this is equivalent to the total
+      block size.
+    unit: Work-Items
+  Total Wavefronts:
+    rst: |-
+      The total number of wavefronts launched as part of the kernel dispatch.
+      On AMD Instinct™ CDNA™ accelerators and GCN™ GPUs, the wavefront
+      size is always 64 work-items. Thus, the total number of wavefronts should
+      be equivalent to the ceiling of grid size divided by 64.
+    unit: Wavefronts
+  Saved Wavefronts:
+    rst: The total number of wavefronts saved at a context-save. See `cwsr_enable
+      <https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr>`_.
+    unit: Wavefronts
+  Restored Wavefronts:
+    rst: The total number of wavefronts restored from a context-save. See `cwsr_enable
+      <https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr>`_.
+    unit: Wavefronts
+  VGPRs:
+    rst: |-
+      The number of architected vector general-purpose registers allocated for the
+      kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the
+      number of VGPRs requested by the compiler due to allocation granularity.
+    unit: VGPRs
+  AGPRs:
+    rst: |-
+      The number of accumulation vector general-purpose registers allocated
+      for the kernel, see :ref:`AGPRs <desc-agprs>`. Note: this may not exactly match
+      the number of AGPRs requested by the compiler due to allocation granularity.
+    unit: AGPRs
+  SGPRs:
+    rst: |-
+      The number of scalar general-purpose registers allocated for the kernel, see
+      :ref:`SALU <desc-salu>`. Note: this may not exactly match the number of
+      SGPRs requested by the compiler due to allocation granularity.
+    unit: SGPRs
+  LDS Allocation:
+    rst: |-
+      The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory)
+      allocated for this kernel. Note: This may also be larger than what was requested
+      at compile time due to both allocation granularity and dynamic per-dispatch
+      LDS allocations.
+    unit: Bytes per workgroup
+  Scratch Allocation:
+    rst: The number of bytes of :ref:`scratch memory <memory-spaces>` requested per
+      work-item for this kernel. Scratch memory is used for stack memory on the accelerator,
+      as well as for register spills and restores.
+    unit: Bytes per work-item
+Wavefront Runtime Stats:
+  Kernel Time:
+    rst: The total duration of the executed kernel.
+    unit: Nanoseconds
+  Kernel Time (Cycles):
+    rst: The total duration of the executed kernel in cycles.
+    unit: Cycles
+  Instructions per wavefront:
+    rst: The average number of instructions (of all types) executed per wavefront.
+      This is averaged over all wavefronts in a kernel dispatch.
+    unit: Instructions per wavefront
+  Wave Cycles:
+    rst: |-
+      The number of cycles a wavefront in the kernel dispatch spent resident
+      on a compute unit per :ref:`normalization unit <normalization-units>`. This is
+      averaged over all wavefronts in a kernel dispatch. Note: this should not
+      be directly compared to the kernel cycles above.
+    unit: Cycles per normalization unit
+  Dependency Wait Cycles:
+    rst: The number of cycles a wavefront in the kernel dispatch stalled waiting on
+      memory of any kind (e.g., instruction fetch, vector or scalar memory, etc.)
+      per :ref:`normalization unit <normalization-units>`. This counter is incremented
+      at every cycle by *all* wavefronts on a CU stalled at a memory operation. As
+      such, it is most useful to get a sense of how waves were spending their time,
+      rather than identification of a precise limiter because another wave could be
+      actively executing while a wave is stalled. The sum of this metric, Issue Wait
+      Cycles and Active Cycles should be equal to the total Wave Cycles metric.
+    unit: Cycles per normalization unit
+  Issue Wait Cycles:
+    rst: The number of cycles a wavefront in the kernel dispatch was unable to issue
+      an instruction for any reason (e.g., execution pipe back-pressure, arbitration
+      loss, etc.) per :ref:`normalization unit <normalization-units>`. This counter
+      is incremented at every cycle by *all* wavefronts on a CU unable to issue an
+      instruction. As such, it is most useful to get a sense of how waves were spending
+      their time, rather than identification of a precise limiter because another
+      wave could be actively executing while a wave is issue stalled. The sum of this
+      metric, Dependency Wait Cycles and Active Cycles should be equal to the total
+      Wave Cycles metric.
+    unit: Cycles per normalization unit
+  Active Cycles:
+    rst: The average number of cycles a wavefront in the kernel dispatch was actively
+      executing instructions per :ref:`normalization unit <normalization-units>`.
+      This measurement is made on a per-wavefront basis, and may include cycles that
+      another wavefront spent actively executing (on another execution unit, for example)
+      or was stalled. As such, it is most useful to get a sense of how waves were
+      spending their time, rather than identification of a precise limiter. The sum
+      of this metric, Issue Wait Cycles and Dependency Wait Cycles should be equal to
+      the total Wave Cycles metric.
+    unit: Cycles per normalization unit
+  Wavefront Occupancy:
+    rst: |-
+      The time-averaged number of wavefronts resident on the accelerator over the
+      lifetime of the kernel. Note: this metric may be inaccurate for short-running
+      kernels (less than 1ms).
+    unit: Wavefronts
+Overall Instruction Mix:
+  VALU:
+    rst: The total number of vector arithmetic logic unit (VALU) operations issued.
+      These are the workhorses of the :doc:`compute unit <compute-unit>`, and are
+      used to execute a wide range of instruction types including floating point operations,
+      non-uniform address calculations, transcendental operations, integer operations,
+      shifts, conditional evaluation, etc.
+    unit: Instructions
+  VMEM:
+    rst: The total number of vector memory operations issued. These include most loads,
+      stores and atomic operations and all accesses to :ref:`generic, global, private
+      and texture <memory-spaces>` memory.
+    unit: Instructions
+  LDS:
+    rst: The total number of LDS (also known as shared memory) operations issued.
+      These include loads, stores, atomics, and HIP's ``__shfl`` operations.
+    unit: Instructions
+  MFMA:
+    rst: The total number of matrix fused multiply-add instructions issued.
+    unit: Instructions
+  SALU:
+    rst: The total number of scalar arithmetic logic unit (SALU) operations issued.
+      Typically these are used for address calculations, literal constants, and other
+      operations that are provably uniform across a wavefront. Although scalar memory
+      (SMEM) operations are issued by the SALU, they are counted separately in this
+      section.
+    unit: Instructions
+  SMEM:
+    rst: The total number of scalar memory (SMEM) operations issued. These are typically
+      used for loading kernel arguments, base-pointers and loads from HIP's ``__constant__``
+      memory.
+    unit: Instructions
+  Branch:
+    rst: The total number of branch operations issued. These typically consist of
+      jump or branch operations and are used to implement control flow.
+    unit: Instructions
+VALU Arithmetic Instruction Mix:
+  INT32:
+    rst: The total number of instructions operating on 32-bit integer operands issued
+      to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  INT64:
+    rst: The total number of instructions operating on 64-bit integer operands issued
+      to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F16-ADD:
+    rst: The total number of addition instructions operating on 16-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F16-MUL:
+    rst: The total number of multiplication instructions operating on 16-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F16-FMA:
+    rst: The total number of fused multiply-add instructions operating on 16-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F16-Trans:
+    rst: The total number of transcendental instructions (such as ``sqrt``) operating
+      on 16-bit floating-point operands issued to the VALU per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F32-ADD:
+    rst: The total number of addition instructions operating on 32-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F32-MUL:
+    rst: The total number of multiplication instructions operating on 32-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F32-FMA:
+    rst: The total number of fused multiply-add instructions operating on 32-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F32-Trans:
+    rst: The total number of transcendental instructions (such as ``sqrt``) operating
+      on 32-bit floating-point operands issued to the VALU per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F64-ADD:
+    rst: The total number of addition instructions operating on 64-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F64-MUL:
+    rst: The total number of multiplication instructions operating on 64-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F64-FMA:
+    rst: The total number of fused multiply-add instructions operating on 64-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F64-Trans:
+    rst: The total number of transcendental instructions (such as ``sqrt``) operating
+      on 64-bit floating-point operands issued to the VALU per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Conversion:
+    rst: |-
+      The total number of type conversion instructions (such as converting data
+      to or from F32↔F64) issued to the VALU per :ref:`normalization unit
+      <normalization-units>`.
+    unit: Instructions per normalization unit
+VMEM Instruction Mix:
+  Global/Generic Instr:
+    rst: The total number of global & generic memory instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Read:
+    rst: The total number of global & generic memory read instructions executed on
+      all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Write:
+    rst: The total number of global & generic memory write instructions executed on
+      all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Atomic:
+    rst: The total number of global & generic memory atomic (with and without return)
+      instructions executed on all :doc:`compute units <compute-unit>` on the accelerator,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Instr:
+    rst: The total number of spill/stack memory instructions executed on all :doc:`compute
+      units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Read:
+    rst: The total number of spill/stack memory read instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Write:
+    rst: The total number of spill/stack memory write instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Atomic:
+    rst: The total number of spill/stack memory atomic (with and without return) instructions
+      executed on all :doc:`compute units <compute-unit>` on the accelerator, per
+      :ref:`normalization unit <normalization-units>`. Typically unused, as these memory
+      operations are generally used to implement thread-local storage.
+    unit: Instructions per normalization unit
+MFMA Arithmetic Instruction Mix:
+  MFMA-I8:
+    rst: The total number of 8-bit integer :ref:`MFMA <desc-mfma>` instructions issued
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  MFMA-F8:
+    rst: The total number of 8-bit floating point :ref:`MFMA <desc-mfma>` instructions
+      issued per :ref:`normalization unit <normalization-units>`. This is supported
+      in AMD Instinct MI300 series and later only.
+    unit: Instructions per normalization unit
+  MFMA-F16:
+    rst: The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` instructions
+      issued per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  MFMA-BF16:
+    rst: The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>` instructions
+      issued per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  MFMA-F32:
+    rst: The total number of 32-bit floating-point :ref:`MFMA <desc-mfma>` instructions
+      issued per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  MFMA-F64:
+    rst: The total number of 64-bit floating-point :ref:`MFMA <desc-mfma>` instructions
+      issued per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+Compute Speed-of-Light:
+  VALU FLOPs:
+    rst: |-
+      The total floating-point operations executed per second on the :ref:`VALU
+      <desc-valu>`. This is also presented as a percent of the peak theoretical
+      FLOPs achievable on the specific accelerator. Note: this does not include
+      any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GFLOPs
+  VALU IOPs:
+    rst: |-
+      The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
+      This is also presented as a percent of the peak theoretical IOPs achievable
+      on the specific accelerator. Note: this does not include any integer operations
+      from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GIOPs
+  MFMA FLOPs (F8):
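+    rst: |-
+      The total number of 8-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 8-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F8 MFMA operations achievable on the
+      specific accelerator. It is supported on AMD Instinct MI300 series and
+      later only.
+    unit: GFLOPs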
+  MFMA FLOPs (BF16):
+    rst: |-
+      The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 16-bit brain floating
+      point operations from :ref:`VALU <desc-valu>` instructions. This is also
+      presented as a percent of the peak theoretical BF16 MFMA operations achievable
+      on the specific accelerator.
+    unit: GFLOPs
+  MFMA FLOPs (F16):
+    rst: |-
+      The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 16-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F16 MFMA operations achievable on the
+      specific accelerator.
+    unit: GFLOPs
+  MFMA FLOPs (F32):
+    rst: |-
+      The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 32-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F32 MFMA operations achievable on the
+      specific accelerator.
+    unit: GFLOPs
+  MFMA FLOPs (F64):
+    rst: |-
+      The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 64-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F64 MFMA operations achievable on the
+      specific accelerator.
+    unit: GFLOPs
+  MFMA IOPs (INT8):
+    rst: |-
+      The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
+      per second. Note: this does not include any 8-bit integer operations from
+      :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
+      of the peak theoretical INT8 MFMA operations achievable on the specific accelerator.
+    unit: GIOPs
+Pipeline Statistics:
+  IPC:
+    rst: The ratio of the total number of instructions executed on the :doc:`CU <compute-unit>`
+      over the :ref:`total active CU cycles <total-active-cu-cycles>`.
+    unit: Instructions per cycle
+  IPC (Issued):
+    rst: The ratio of the total number of (non-:ref:`internal <ipc-internal-instructions>`)
+      instructions issued over the number of cycles where the :ref:`scheduler <desc-scheduler>`
+      was actively working on issuing instructions. Refer to the :ref:`Issued IPC
+      <issued-ipc>` example for further detail.
+    unit: Instructions per cycle
+  SALU Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`SALU <desc-salu>`
+      was busy executing instructions. Computed as the ratio of the total number of
+      cycles spent by the :ref:`scheduler <desc-scheduler>` issuing SALU / :ref:`SMEM
+      <desc-smem>` instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VALU Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`VALU <desc-valu>`
+      was busy executing instructions. Does not include :ref:`VMEM <desc-vmem>` operations.
+      Computed as the ratio of the total number of cycles spent by the :ref:`scheduler
+      <desc-scheduler>` issuing VALU instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VMEM Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`VMEM <desc-vmem>`
+      unit was busy executing instructions, including both global/generic and spill/scratch
+      operations (see the :ref:`VMEM instruction count metrics <ta-instruction-counts>`
+      for more detail). Does not include :ref:`VALU <desc-valu>` operations. Computed
+      as the ratio of the total number of cycles spent by the :ref:`scheduler <desc-scheduler>`
+      issuing VMEM instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  Branch Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`branch <desc-branch>`
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles spent by the :ref:`scheduler <desc-scheduler>` issuing branch instructions
+      over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VALU Active Threads:
+    rst: Indicates the average level of :ref:`divergence <desc-divergence>` within
+      a wavefront over the lifetime of the kernel. The number of work-items that were
+      active in a wavefront during execution of each :ref:`VALU <desc-valu>` instruction,
+      time-averaged over all VALU instructions run on all wavefronts in the kernel.
+    unit: Work-items
+  MFMA Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`MFMA <desc-mfma>`
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles the :ref:`MFMA <desc-mfma>` unit was busy over the :ref:`total
+      CU cycles <total-cu-cycles>`.
+    unit: Percent
+  MFMA Instruction Cycles:
+    rst: The average duration of :ref:`MFMA <desc-mfma>` instructions in this kernel
+      in cycles. Computed as the ratio of the total number of cycles the MFMA unit
+      was busy over the total number of MFMA instructions. Compare to, for example,
+      the `AMD Matrix Instruction Calculator <https://github.com/RadeonOpenCompute/amd_matrix_instruction_calculator>`_.
+    unit: Cycles per instruction
+  VMEM Latency:
+    rst: The average number of round-trip cycles (that is, from issue to data return
+      / acknowledgment) required for a VMEM instruction to complete.
+    unit: Cycles
+  SMEM Latency:
+    rst: The average number of round-trip cycles (that is, from issue to data return
+      / acknowledgment) required for a SMEM instruction to complete.
+    unit: Cycles
+Arithmetic Operations:
+  FLOPs (Total):
+    rst: The total number of floating-point operations executed on either the :ref:`VALU
+      <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
+      <normalization-units>`.
+    unit: FLOP per normalization unit
+  IOPs (Total):
+    rst: The total number of integer operations executed on either the :ref:`VALU
+      <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
+      <normalization-units>`.
+    unit: IOP per normalization unit
+  F8 OPs:
+    rst: ''
+    unit: Unknown
+  F16 OPs:
+    rst: The total number of 16-bit floating-point operations executed on either the
+      :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: FLOP per normalization unit
+  BF16 OPs:
+    rst: |-
+      The total number of 16-bit brain floating-point operations executed on
+      either the :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
+      unit <normalization-units>`. Note: on current CDNA accelerators, the VALU
+      has no native BF16 instructions.
+    unit: FLOP per normalization unit
+  F32 OPs:
+    rst: The total number of 32-bit floating-point operations executed on either the
+      :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: FLOP per normalization unit
+  F64 OPs:
+    rst: The total number of 64-bit floating-point operations executed on either the
+      :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: FLOP per normalization unit
+  INT8 OPs:
+    rst: |-
+      The total number of 8-bit integer operations executed on either the :ref:`VALU
+      <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
+      <normalization-units>`. Note: on current CDNA accelerators, the VALU has
+      no native INT8 instructions.
+    unit: IOP per normalization unit
+LDS Speed-of-Light:
+  Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`LDS <desc-lds>`
+      was actively executing instructions (including, but not limited to, load, store,
+      atomic and HIP's ``__shfl`` operations). Calculated as the ratio of the total
+      number of cycles LDS was active over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  Access Rate:
+    rst: Indicates the percentage of SIMDs in the :ref:`VALU <desc-valu>` [#lds-workload]_
+      actively issuing LDS instructions, averaged over the lifetime of the kernel.
+      Calculated as the ratio of the total number of cycles spent by the :ref:`scheduler
+      <desc-scheduler>` issuing :ref:`LDS <desc-lds>` instructions over the :ref:`total
+      CU cycles <total-cu-cycles>`.
+    unit: Percent
+  Theoretical Bandwidth Utilization:
+    rst: Indicates the maximum amount of bytes that could have been loaded from, stored
+      to, or atomically updated in the LDS, expressed as a percentage of the theoretical peak.
+      Does *not* take into account the execution mask of the wavefront when the instruction
+      was executed. See the :ref:`LDS bandwidth example <lds-bandwidth>` for more
+      detail.
+    unit: Percent
+  Bank Conflict Rate:
+    rst: Indicates the percentage of active LDS cycles that were spent servicing bank
+      conflicts. Calculated as the ratio of LDS cycles spent servicing bank conflicts
+      over the number of LDS cycles that would have been required to move the same
+      amount of data in an uncontended access. [#lds-bank-conflict]_
+    unit: Percent
+LDS Statistics:
+  LDS Instructions:
+    rst: The total number of LDS instructions (including, but not limited to, read/write/atomics
+      and HIP's ``__shfl`` instructions) executed per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Theoretical Bandwidth:
+    rst: Indicates the maximum amount of bytes that could have been loaded from, stored
+      to, or atomically updated in the LDS divided by total duration. Does *not* take
+      into account the execution mask of the wavefront when the instruction was executed.
+      See the :ref:`LDS bandwidth example <lds-bandwidth>` for more detail.
+    unit: Gbps
+  LDS Latency:
+    rst: The average number of round-trip cycles (i.e., from issue to data-return
+      acknowledgment) required for an LDS instruction to complete.
+    unit: Cycles
+  Bank Conflicts/Access:
+    rst: The ratio of the number of cycles spent in the :ref:`LDS scheduler <desc-lds>`
+      due to bank conflicts (as determined by the conflict resolution hardware) to
+      the base number of cycles that would be spent in the LDS scheduler in a completely
+      uncontended case. This is the unnormalized form of the Bank Conflict Rate.
+    unit: Conflicts per Access
+  Index Accesses:
+    rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` over
+      all operations per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Atomic Return Cycles:
+    rst: The total number of cycles spent on LDS atomics with return per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Bank Conflict:
+    rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due
+      to bank conflicts (as determined by the conflict resolution hardware) per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Addr Conflict:
+    rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due
+      to address conflicts (as determined by the conflict resolution hardware) per
+      :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Unaligned Stall:
+    rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due
+      to stalls from non-dword aligned addresses per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Mem Violations:
+    rst: |-
+      The total number of out-of-bounds accesses made to the LDS, per :ref:`normalization
+      unit <normalization-units>`. This is unused and expected to be zero in
+      most configurations for modern CDNA™ accelerators.
+    unit: Accesses per normalization unit
+L1I Speed-of-Light:
+  Bandwidth Utilization:
+    rst: The number of bytes looked up in the L1I cache, as a percent of the peak
+      theoretical bandwidth. Calculated as the ratio of L1I requests over the :ref:`total
+      L1I cycles <total-l1i-cycles>`.
+    unit: Percent
+  Cache Hit Rate:
+    rst: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded
+      line in the cache. Calculated as the ratio of the number of L1I requests that hit
+      over the number of all L1I requests.
+    unit: Percent
+  L1I-L2 Bandwidth Utilization:
+    rst: |-
+      The percent of the peak theoretical L1I → L2 cache request bandwidth
+      achieved. Calculated as the ratio of the total number of requests from
+      the L1I to the L2 cache over the :ref:`total L1I-L2 interface cycles <total-l1i-cycles>`.
+    unit: Percent
+L1I cache accesses:
+  Req:
+    rst: The total number of requests made to the L1I per :ref:`normalization-unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Hits:
+    rst: The total number of L1I requests that hit on a previously loaded cache line,
+      per :ref:`normalization-unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Misses - Non Duplicated:
+    rst: The total number of L1I requests that missed on a cache line that *were not*
+      already pending due to another request, per :ref:`normalization-unit <normalization-units>`.
+      See note in :ref:`desc-l1i-sol` for more detail.
+    unit: Requests per normalization unit
+  Misses - Duplicated:
+    rst: The total number of L1I requests that missed on a cache line that *were*
+      already pending due to another request, per :ref:`normalization-unit <normalization-units>`.
+      See note in :ref:`desc-l1i-sol` for more detail.
+    unit: Requests per normalization unit
+  Cache Hit Rate:
+    rst: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded
+      line in the cache. Calculated as the ratio of the number of L1I requests that hit
+      over the number of all L1I requests.
+    unit: Percent
+  Instruction Fetch Latency:
+    rst: The average number of cycles spent to fetch instructions to a :doc:`CU <compute-unit>`.
+    unit: Cycles
+L1I <-> L2 interface:
+  L1I-L2 Bandwidth:
+    rst: Total number of bytes transferred across L1I - L2 interface divided by total
+      duration.
+    unit: Gbps
+Scalar L1D Speed-of-Light:
+  Bandwidth Utilization:
+    rst: The number of bytes looked up in the sL1D cache, as a percent of the peak
+      theoretical bandwidth. Calculated as the ratio of sL1D requests over the :ref:`total
+      sL1D cycles <total-sl1d-cycles>`.
+    unit: Percent
+  Cache Hit Rate:
+    rst: Indicates the percent of sL1D requests that hit on a previously loaded line
+      in the cache. The ratio of the number of sL1D requests that hit [#sl1d-cache]_
+      over the number of all sL1D requests.
+    unit: Percent
+  sL1D-L2 BW Utilization:
+    rst: The percentage of the peak theoretical sL1D - L2 interface bandwidth achieved.
+      Calculated as total number of bytes read from, written to, or atomically updated
+      across the sL1D - L2 interface.
+    unit: Percent
+Scalar L1D cache accesses:
+  Req:
+    rst: The total number of requests, of any size or type, made to the sL1D per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Hits:
+    rst: The total number of sL1D requests that hit on a previously loaded cache line,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Misses - Non Duplicated:
+    rst: The total number of sL1D requests that missed on a cache line that *was not*
+      already pending due to another request, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`desc-sl1d-sol` for more detail.
+    unit: Requests per normalization unit
+  Misses- Duplicated:
+    rst: The total number of sL1D requests that missed on a cache line that *was*
+      already pending due to another request, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`desc-sl1d-sol` for more detail.
+    unit: Requests per normalization unit
+  Cache Hit Rate:
+    rst: Indicates the percent of sL1D requests that hit on a previously loaded line
+      in the cache. The ratio of the number of sL1D requests that hit [#sl1d-cache]_
+      over the number of all sL1D requests.
+    unit: Percent
+  Read Req (Total):
+    rst: The total number of sL1D read requests of any size, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Atomic Req:
+    rst: The total number of atomic requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`. Typically unused on current
+      CDNA accelerators.
+    unit: Requests per normalization unit
+  Read Req (1 DWord):
+    rst: The total number of sL1D read requests made for a single dword of data (4B),
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Read Req (2 DWord):
+    rst: The total number of sL1D read requests made for two dwords of data (8B),
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Read Req (4 DWord):
+    rst: The total number of sL1D read requests made for four dwords of data (16B),
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Read Req (8 DWord):
+    rst: The total number of sL1D read requests made for eight dwords of data (32B),
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Read Req (16 DWord):
+    rst: The total number of sL1D read requests made for sixteen dwords of data
+      (64B), per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+Scalar L1D Cache - L2 Interface:
+  sL1D-L2 BW:
+    rst: |-
+      The total number of bytes read from, written to, or atomically updated
+      across the sL1D ↔ :doc:`L2 <l2-cache>` interface, divided by total duration.
+      Note that sL1D writes and atomics are typically
+      unused on current CDNA accelerators, so in the majority of cases this can
+      be interpreted as an sL1D → L2 read bandwidth.
+    unit: Gbps
+  Read Req:
+    rst: The total number of read requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Write Req:
+    rst: The total number of write requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`. Typically unused on current
+      CDNA accelerators.
+    unit: Requests per normalization unit
+  Atomic Req:
+    rst: The total number of atomic requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`. Typically unused on current
+      CDNA accelerators.
+    unit: Requests per normalization unit
+  Stall Cycles:
+    rst: |-
+      The total number of cycles the sL1D ↔ :doc:`L2 <l2-cache>` interface
+      was stalled, per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+Busy and stall metrics:
+  Address Processing Unit Busy:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor
+      was busy
+    unit: Percent
+  Address Stall:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor
+      was stalled from sending address requests further into the vL1D pipeline
+    unit: Percent
+  Data Stall:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor
+      was stalled from sending write/atomic data further into the vL1D pipeline
+    unit: Percent
+  Data-Processor → Address Stall:
+    rst: Percent of :ref:`total CU cycles <total-cu-cycles>` the address processor
+      was stalled waiting to send command data to the :ref:`data processor <desc-td>`
+    unit: Percent
+  Sequencer → TA Address Stall:
+    rst: ''
+    unit: Unknown
+  Sequencer → TA Command Stall:
+    rst: ''
+    unit: Unknown
+  Sequencer → TA Data Stall:
+    rst: ''
+    unit: Unknown
+Instruction counts:
+  Total Instructions:
+    rst: The total number of memory instructions executed by the address processor
+      over all compute units on the accelerator, per normalization unit.
+    unit: Instructions per normalization unit
+  Global/Generic Instructions:
+    rst: The total number of global & generic memory instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Read Instructions:
+    rst: The total number of global & generic memory read instructions executed on
+      all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Write Instructions:
+    rst: The total number of global & generic memory write instructions executed on
+      all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Atomic Instructions:
+    rst: The total number of global & generic memory atomic (with and without return)
+      instructions executed on all :doc:`compute units <compute-unit>` on the accelerator,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Instructions:
+    rst: The total number of spill/stack memory instructions executed on all :doc:`compute
+      units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Read Instructions:
+    rst: The total number of spill/stack memory read instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Write Instructions:
+    rst: The total number of spill/stack memory write instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Atomic Instructions:
+    rst: The total number of spill/stack memory atomic (with and without return) instructions
+      executed on all :doc:`compute units <compute-unit>` on the accelerator, per
+      :ref:`normalization unit <normalization-units>`. Typically unused as these memory
+      operations are typically used to implement thread-local storage.
+    unit: Instructions per normalization unit
+Spill and stack metrics:
+  Spill/Stack Total Cycles:
+    rst: The number of cycles the address processing unit spent working on spill/stack
+      instructions, per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Spill/Stack Coalesced Read:
+    rst: The number of cycles the address processing unit spent working on coalesced
+      spill/stack read instructions, per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Spill/Stack Coalesced Write:
+    rst: The number of cycles the address processing unit spent working on coalesced
+      spill/stack write instructions, per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+Vector L1 data-return path or Texture Data (TD):
+  Data-Return Busy:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return unit
+      was busy processing or waiting on data to return to the :doc:`CU <compute-unit>`.
+    unit: Percent
+  Cache RAM → Data-Return Stall:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return unit
+      was stalled on data to be returned from the :ref:`vL1D Cache RAM <desc-tc>`.
+    unit: Percent
+  Workgroup manager → Data-Return Stall:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return unit
+      was stalled by the :ref:`workgroup manager <desc-spi>` due to initialization
+      of registers as a part of launching new workgroups.
+    unit: Percent
+  Coalescable Instructions:
+    rst: The number of instructions submitted to the :ref:`data-return unit <desc-td>`
+      by the :ref:`address processor <desc-ta>` that were found to be coalescable,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Read Instructions:
+    rst: The number of read instructions submitted to the :ref:`data-return unit <desc-td>`
+      by the :ref:`address processor <desc-ta>` summed over all :doc:`compute units
+      <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+      This is expected to be the sum of global/generic and spill/stack reads in the
+      :ref:`address processor <desc-ta>`.
+    unit: Instructions per normalization unit
+  Write Instructions:
+    rst: The number of store instructions submitted to the :ref:`data-return unit
+      <desc-td>` by the :ref:`address processor <desc-ta>` summed over all :doc:`compute
+      units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+      This is expected to be the sum of global/generic and spill/stack stores counted
+      by the :ref:`vL1D cache-front-end <ta-instruction-counts>`.
+    unit: Instructions per normalization unit
+  Atomic Instructions:
+    rst: The number of atomic instructions submitted to the :ref:`data-return unit
+      <desc-td>` by the :ref:`address processor <desc-ta>` summed over all :doc:`compute
+      units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+      This is expected to be the sum of global/generic and spill/stack atomics in
+      the :ref:`address processor <desc-ta>`.
+    unit: Instructions per normalization unit
+vL1D Speed-of-Light:
+  Hit rate:
+    rst: The ratio of the number of vL1D cache line requests that hit [#vl1d-hit]_
+      in vL1D cache over the total number of cache line requests to the :ref:`vL1D
+      Cache RAM <desc-tc>`.
+    unit: Percent
+  Bandwidth Utilization:
+    rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
+      <desc-vmem>` instructions, as a percent of the peak theoretical bandwidth achievable
+      on the specific accelerator. The number of bytes is calculated as the number
+      of cache lines requested multiplied by the cache line size. This value does
+      not consider partial requests, so for instance, if only a single value is requested
+      in a cache line, the data movement will still be counted as a full cache line.
+    unit: Percent
+  Utilization:
+    rst: Indicates how busy the :ref:`vL1D Cache RAM <desc-tc>` was during the kernel
+      execution. The number of cycles where the vL1D Cache RAM is actively processing
+      any request divided by the number of cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  Coalescing:
+    rst: Indicates how well memory instructions were coalesced by the :ref:`address
+      processing unit <desc-ta>`, ranging from uncoalesced (25%) to fully coalesced
+      (100%). Calculated as the average number of :ref:`thread-requests <thread-requests>`
+      generated per instruction divided by the ideal number of thread-requests per
+      instruction.
+    unit: Percent
+vL1D cache stall metrics:
+  Stalled on L2 Data:
+    rst: The ratio of the number of cycles where the vL1D is stalled waiting for requested
+      data to return from the :doc:`L2 cache <l2-cache>` divided by the number of
+      cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  Stalled on L2 Req:
+    rst: The ratio of the number of cycles where the vL1D is stalled waiting to issue
+      a request for data to the :doc:`L2 cache <l2-cache>` divided by the number of
+      cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  Tag RAM Stall (Read):
+    rst: The ratio of the number of cycles where the vL1D is stalled due to Read requests
+      with conflicting tags being looked up concurrently, divided by the number of
+      cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  Tag RAM Stall (Write):
+    rst: The ratio of the number of cycles where the vL1D is stalled due to Write
+      requests with conflicting tags being looked up concurrently, divided by the
+      number of cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  Tag RAM Stall (Atomic):
+    rst: The ratio of the number of cycles where the vL1D is stalled due to Atomic
+      requests with conflicting tags being looked up concurrently, divided by the
+      number of cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+vL1D cache access metrics:
+  Total Req:
+    rst: The total number of incoming requests from the :ref:`address processing unit
+      <desc-ta>` after coalescing.
+    unit: Requests
+  Read Req:
+    rst: The total number of incoming read requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`
+    unit: Requests per normalization unit
+  Write Req:
+    rst: The total number of incoming write requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`
+    unit: Requests per normalization unit
+  Atomic Req:
+    rst: The total number of incoming atomic requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`
+    unit: Requests per normalization unit
+  Cache BW:
+    rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
+      <desc-vmem>` instructions divided by total duration. The number of bytes is
+      calculated as the number of cache lines requested multiplied by the cache line
+      size. This value does not consider partial requests, so for instance, if only
+      a single value is requested in a cache line, the data movement will still be
+      counted as a full cache line.
+    unit: Gbps
+  Cache Hit Rate:
+    rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache
+      over the total number of cache line requests to the :ref:`vL1D Cache RAM <desc-tc>`.
+    unit: Percent
+  Cache Accesses:
+    rst: The total number of cache line lookups in the vL1D.
+    unit: Cache lines
+  Cache Hits:
+    rst: The number of cache accesses minus the number of outgoing requests to the
+      :doc:`L2 cache <l2-cache>`, that is, the number of cache line requests serviced
+      by the :ref:`vL1D Cache RAM <desc-tc>` per :ref:`normalization unit <normalization-units>`.
+    unit: Cache lines per normalization unit
+  Invalidations:
+    rst: The number of times the vL1D was issued a write-back invalidate command during
+      the kernel's execution per :ref:`normalization unit <normalization-units>`.
+      This may be triggered by, for instance, the ``buffer_wbinvl1`` instruction.
+    unit: Invalidations per normalization unit
+  L1-L2 BW:
+    rst: The number of bytes transferred across the vL1D-L2 interface as a result
+      of :ref:`VMEM <desc-vmem>` instructions, divided by total duration. The number
+      of bytes is calculated as the number of cache lines requested multiplied by
+      the cache line size. This value does not consider partial requests, so for instance,
+      if only a single value is requested in a cache line, the data movement will
+      still be counted as a full cache line.
+    unit: Gbps
+  L1-L2 Read:
+    rst: The number of read requests for a vL1D cache line that were not satisfied
+      by the vL1D and must be retrieved from the :doc:`L2 cache <l2-cache>`
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  L1-L2 Write:
+    rst: The number of write requests to a vL1D cache line that were sent through
+      the vL1D to the :doc:`L2 cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  L1-L2 Atomic:
+    rst: The number of atomic requests that are sent through the vL1D to the :doc:`L2
+      cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`. This
+      includes requests for atomics with, and without return.
+    unit: Requests per normalization unit
+L1D - L2 Transactions:
+  NC - Read:
+    rst: Total read requests with NC mtype from this TCP to all TCCs, summed over TCP
+      instances, per normalization unit.
+    unit: Requests per normalization unit
+  UC - Read:
+    rst: Total read requests with UC mtype from this TCP to all TCCs, summed over TCP
+      instances, per normalization unit.
+    unit: Requests per normalization unit
+  CC - Read:
+    rst: Total read requests with CC mtype from this TCP to all TCCs, summed over TCP
+      instances, per normalization unit.
+    unit: Requests per normalization unit
+  RW - Read:
+    rst: Total read requests with RW mtype from this TCP to all TCCs, summed over TCP
+      instances, per normalization unit.
+    unit: Requests per normalization unit
+  RW - Write:
+    rst: Total write requests with RW mtype from this TCP to all TCCs, summed over TCP
+      instances, per normalization unit.
+    unit: Requests per normalization unit
+  NC - Write:
+    rst: Total write requests with NC mtype from this TCP to all TCCs, summed over TCP
+      instances, per normalization unit.
+    unit: Requests per normalization unit
+  UC - Write:
+    rst: Total write requests with UC mtype from this TCP to all TCCs, summed over TCP
+      instances, per normalization unit.
+    unit: Requests per normalization unit
+  CC - Write:
+    rst: Total write requests with CC mtype from this TCP to all TCCs, summed over TCP
+      instances, per normalization unit.
+    unit: Requests per normalization unit
+  NC - Atomic:
+    rst: Total atomic requests with NC mtype from this TCP to all TCCs, summed over TCP
+      instances, per normalization unit.
+    unit: Requests per normalization unit
+  UC - Atomic:
+    rst: Total atomic requests with UC mtype from this TCP to all TCCs, summed over TCP
+      instances, per normalization unit.
+    unit: Requests per normalization unit
+  CC - Atomic:
+    rst: Total atomic requests with CC mtype from this TCP to all TCCs, summed over TCP
+      instances, per normalization unit.
+    unit: Requests per normalization unit
+  RW - Atomic:
+    rst: Total atomic requests with RW mtype from this TCP to all TCCs, summed over TCP
+      instances, per normalization unit.
+    unit: Requests per normalization unit
+L1 Unified Translation Cache (UTCL1):
+  Req:
+    rst: The number of translation requests made to the UTCL1 per normalization unit.
+    unit: Requests per normalization unit
+  Hit Ratio:
+    rst: The ratio of the number of translation requests that hit in the UTCL1 divided
+      by the total number of translation requests made to the UTCL1.
+    unit: Percent
+  Hits:
+    rst: The number of translation requests that hit in the UTCL1, and could be reused,
+      per normalization unit.
+    unit: Requests per normalization unit
+  Translation Misses:
+    rst: The total number of translation requests that missed in the UTCL1 due to
+      translation not being present in the cache, per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Permission Misses:
+    rst: |-
+      The total number of translation requests that missed in the UTCL1 due
+      to a permission error, per :ref:`normalization unit <normalization-units>`.
+      This is unused and expected to be zero in most configurations for modern
+      CDNA™ accelerators.
+    unit: Requests per normalization unit
+L1D Addr Translation Stalls: {}
+L2 Speed-of-Light:
+  Utilization:
+    rst: The ratio of the :ref:`number of cycles an L2 channel was active, summed
+      over all L2 channels on the accelerator <total-active-l2-cycles>` over the :ref:`total
+      L2 cycles <total-l2-cycles>`.
+    unit: Percent
+  Peak Bandwidth:
+    rst: The number of bytes looked up in the L2 cache, as a percent of the peak theoretical
+      bandwidth achievable on the specific accelerator. The number of bytes is calculated
+      as the number of cache lines requested multiplied by the cache line size. This
+      value does not consider partial requests, so e.g., if only a single value is
+      requested in a cache line, the data movement will still be counted as a full
+      cache line.
+    unit: Percent
+  Hit Rate:
+    rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
+      over the total number of incoming cache line requests to the L2 cache.
+    unit: Percent
+  L2-Fabric Read BW:
+    rst: The number of bytes read by the L2 over the :ref:`Infinity Fabric interface
+      <l2-fabric>` per unit time.
+    unit: GB/s
+  L2-Fabric Write and Atomic BW:
+    rst: The number of bytes sent by the L2 over the :ref:`Infinity Fabric interface
+      <l2-fabric>` by write and atomic operations per unit time.
+    unit: GB/s
+  HBM Bandwidth:
+    rst: Maximum theoretical bandwidth of the accelerator's local high-bandwidth memory
+      (HBM) per unit time. This value is calculated as the number of HBM channels
+      multiplied by the HBM channel width multiplied by the HBM clock frequency.
+    unit: GB/s
+L2-Fabric interface metrics:
+  Read BW:
+    rst: The total number of bytes read by the L2 cache from Infinity Fabric divided
+      by total duration.
+    unit: Gbps
+  HBM Read Traffic:
+    rst: The percent of read requests generated by the L2 cache that are routed to
+      the accelerator's local high-bandwidth memory (HBM). This breakdown does not
+      consider the *size* of the request (meaning that 32B and 64B requests are both
+      counted as a single request), so this metric only *approximates* the percent
+      of the L2-Fabric Read bandwidth directed to the local HBM.
+    unit: Percent
+  Remote Read Traffic:
+    rst: The percent of read requests generated by the L2 cache that are routed to
+      any memory location other than the accelerator's local high-bandwidth memory
+      (HBM) -- for example, the CPU's DRAM or a remote accelerator's HBM. This breakdown
+      does not consider the *size* of the request (meaning that 32B and 64B requests
+      are both counted as a single request), so this metric only *approximates* the
+      percent of the L2-Fabric Read bandwidth directed to a remote location.
+    unit: Percent
+  Uncached Read Traffic:
+    rst: The percent of read requests generated by the L2 cache that are reading from
+      an :ref:`uncached memory allocation <memory-type>`. Note, as described in the
+      :ref:`request flow <l2-request-flow>` section, a single 64B read request is
+      typically counted as two uncached read requests. So, it is possible for the
+      Uncached Read Traffic to reach up to 200% of the total number of read requests.
+      This breakdown does not consider the *size* of the request (i.e., 32B and 64B
+      requests are both counted as a single request), so this metric only *approximates*
+      the percent of the L2-Fabric read bandwidth directed to an uncached memory location.
+    unit: Percent
+  Write and Atomic BW:
+    rst: The total number of bytes written by the L2 over Infinity Fabric by write
+      and atomic operations divided by total duration. Note that on current CDNA accelerators,
+      such as the :ref:`MI2XX <mixxx-note>`, requests are only considered *atomic*
+      by Infinity Fabric if they are targeted at non-write-cacheable memory, for example,
+      :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached memory
+      <memory-type>` allocations on the MI2XX.
+    unit: Gbps
+  HBM Write and Atomic Traffic:
+    rst: The percent of write and atomic requests generated by the L2 cache that are
+      routed to the accelerator's local high-bandwidth memory (HBM). This breakdown
+      does not consider the *size* of the request (meaning that 32B and 64B requests
+      are both counted as a single request), so this metric only *approximates* the
+      percent of the L2-Fabric Write and Atomic bandwidth directed to the local HBM.
+      Note that on current CDNA accelerators, such as the :ref:`MI2XX <mixxx-note>`,
+      requests are only considered *atomic* by Infinity Fabric if they are targeted
+      at :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached memory
+      <memory-type>` allocations.
+    unit: Percent
+  Remote Write and Atomic Traffic:
+    rst: The percent of write and atomic requests generated by the L2 cache that are
+      routed to any memory location other than the accelerator's local high-bandwidth
+      memory (HBM) -- for example, the CPU's DRAM or a remote accelerator's HBM. This
+      breakdown does not consider the *size* of the request (meaning that 32B and 64B
+      requests are both counted as a single request), so this metric only *approximates*
+      the percent of the L2-Fabric Write and Atomic bandwidth directed to a remote location.
+      Note that on current CDNA accelerators, such as the :ref:`MI2XX <mixxx-note>`, requests
+      are only considered *atomic* by Infinity Fabric if they are targeted at :ref:`fine-grained
+      memory <memory-type>` allocations or :ref:`uncached memory <memory-type>` allocations.
+    unit: Percent
+  Atomic Traffic:
+    rst: The percent of write requests generated by the L2 cache that are atomic requests
+      to *any* memory location. This breakdown does not consider the *size* of the
+      request (meaning that 32B and 64B requests are both counted as a single request),
+      so this metric only *approximates* the percent of the L2-Fabric Write and Atomic
+      bandwidth consumed by atomic requests. Note that on current CDNA accelerators, such
+      as the :ref:`MI2XX <mixxx-note>`, requests are only considered *atomic* by Infinity
+      Fabric if they are targeted at :ref:`fine-grained memory <memory-type>` allocations
+      or :ref:`uncached memory <memory-type>` allocations.
+    unit: Percent
+  Uncached Write and Atomic Traffic:
+    rst: The percent of write and atomic requests generated by the L2 cache that are
+      targeting :ref:`uncached memory allocations <memory-type>`. This breakdown does
+      not consider the *size* of the request (meaning that 32B and 64B requests are
+      both counted as a single request), so this metric only *approximates* the percent
+      of the L2-Fabric Write and Atomic bandwidth directed to uncached memory allocations.
+    unit: Percent
+  Read Latency:
+    rst: The time-averaged number of cycles read requests spent in Infinity Fabric
+      before data was returned to the L2.
+    unit: Cycles
+  Write and Atomic Latency:
+    rst: The time-averaged number of cycles write requests spent in Infinity Fabric
+      before a completion acknowledgement was returned to the L2.
+    unit: Cycles
+  Atomic Latency:
+    rst: The time-averaged number of cycles atomic requests spent in Infinity Fabric
+      before a completion acknowledgement (atomic without return value) or data (atomic
+      with return value) was returned to the L2.
+    unit: Cycles
+L2 Cache Accesses:
+  Bandwidth:
+    rst: The number of bytes looked up in the L2 cache, divided by total duration.
+      The number of bytes is calculated as the number of cache lines requested multiplied
+      by the cache line size. This value does not consider partial requests, so for
+      example, if only a single value is requested in a cache line, the data movement
+      will still be counted as a full cache line.
+    unit: Gbps
+  Req:
+    rst: The total number of incoming requests to the L2 from all clients for all
+      request types, per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Read Req:
+    rst: The total number of read requests to the L2 from all clients.
+    unit: Requests per normalization unit
+  Write Req:
+    rst: The total number of write requests to the L2 from all clients.
+    unit: Requests per normalization unit
+  Atomic Req:
+    rst: The total number of atomic requests (with and without return) to the L2 from
+      all clients.
+    unit: Requests per normalization unit
+  Streaming Req:
+    rst: The total number of incoming requests to the L2 that are marked as *streaming*.
+      The exact meaning of this may differ depending on the targeted accelerator,
+      however on an :ref:`MI2XX <mixxx-note>` this corresponds to `non-temporal load
+      or stores <https://clang.llvm.org/docs/LanguageExtensions.html#non-temporal-load-store-builtins>`_.
+      The L2 cache attempts to evict *streaming* requests before normal requests when
+      the L2 is at capacity.
+    unit: Requests per normalization unit
+  Probe Req:
+    rst: The number of coherence probe requests made to the L2 cache from outside
+      the accelerator. On an :ref:`MI2XX <mixxx-note>`, probe requests may be generated
+      by, for example, writes to :ref:`fine-grained device <memory-type>` memory or
+      by writes to :ref:`coarse-grained <memory-type>` device memory.
+    unit: Requests per normalization unit
+  Cache Hit:
+    rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
+      over the total number of incoming cache line requests to the L2 cache.
+    unit: Percent
+  Hits:
+    rst: The total number of requests to the L2 from all clients that hit in the cache.
+      As noted in the :ref:`Speed-of-Light <l2-sol>` section, this includes hit-on-miss
+      requests.
+    unit: Requests per normalization unit
+  Misses:
+    rst: The total number of requests to the L2 from all clients that miss in the
+      cache. As noted in the :ref:`Speed-of-Light <l2-sol>` section, these do not
+      include hit-on-miss requests.
+    unit: Requests per normalization unit
+  Writeback:
+    rst: The total number of L2 cache lines written back to memory for any reason.
+      Write-backs may occur due to user code (such as HIP kernel calls to ``__threadfence_system``
+      or atomic built-ins), by the :doc:`command processor <command-processor>`'s memory
+      acquire/release fences, or for other internal hardware reasons.
+    unit: Cache lines per normalization unit
+  Writeback (Internal):
+    rst: The total number of L2 cache lines written back to memory for internal hardware
+      reasons, per :ref:`normalization unit <normalization-units>`.
+    unit: Cache lines per normalization unit
+  Writeback (vL1D Req):
+    rst: The total number of L2 cache lines written back to memory due to requests
+      initiated by the :doc:`vL1D cache <vector-l1-cache>`, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Cache lines per normalization unit
+  Evict (Internal):
+    rst: The total number of L2 cache lines evicted from the cache due to capacity
+      limits, per :ref:`normalization unit <normalization-units>`.
+    unit: Cache lines per normalization unit
+  Evict (vL1D Req):
+    rst: The total number of L2 cache lines evicted from the cache due to invalidation
+      requests initiated by the :doc:`vL1D cache <vector-l1-cache>`, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Cache lines per normalization unit
+  NC Req:
+    rst: The total number of requests to the L2 to Not-hardware-Coherent (NC) memory
+      allocations, per :ref:`normalization unit <normalization-units>`. See the :ref:`memory-type`
+      for more information.
+    unit: Requests per normalization unit
+  UC Req:
+    rst: The total number of requests to the L2 that go to Uncached (UC) memory allocations.
+      See the :ref:`memory-type` for more information.
+    unit: Requests per normalization unit
+  CC Req:
+    rst: The total number of requests to the L2 that go to Coherently Cacheable (CC)
+      memory allocations. See the :ref:`memory-type` for more information.
+    unit: Requests per normalization unit
+  RW Req:
+    rst: The total number of requests to the L2 that go to Read-Write coherent memory
+      (RW) allocations. See the :ref:`memory-type` for more information.
+    unit: Requests per normalization unit
+L2 Cache Stalls: {}
+L2 - Fabric Interface stalls:
+  Write - Credit Starvation:
+    rst: The number of cycles the L2-Fabric interface was stalled on write or atomic
+      requests to any memory location because too many write/atomic requests were
+      currently in flight, as a percent of the :ref:`total active L2 cycles <total-active-l2-cycles>`.
+    unit: Percent
+L2 - Fabric interface detailed metrics:
+  Read (32B):
+    rst: The total number of L2 requests to Infinity Fabric to read 32B of data from
+      any memory location, per :ref:`normalization unit <normalization-units>`. See
+      :ref:`l2-request-flow` for more detail. Typically unused on CDNA accelerators.
+    unit: Requests per normalization unit
+  Read (64B):
+    rst: The total number of L2 requests to Infinity Fabric to read 64B of data from
+      any memory location, per :ref:`normalization unit <normalization-units>`. See
+      :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Read (128B):
+    rst: ''
+    unit: Unknown
+  Read (Uncached):
+    rst: The total number of L2 requests to Infinity Fabric to read :ref:`uncached
+      data <memory-type>` from any memory location, per :ref:`normalization unit <normalization-units>`.
+      64B requests for uncached data are counted as two 32B uncached data requests.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  HBM Read:
+    rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of
+      data from the accelerator's local HBM, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Remote Read:
+    rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of
+      data from any source other than the accelerator's local HBM, per :ref:`normalization
+      unit <normalization-units>`. See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Write and Atomic (32B):
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically
+      update 32B of data to any memory location, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Write and Atomic (Uncached):
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically
+      update 32B or 64B of :ref:`uncached data <memory-type>`, per :ref:`normalization
+      unit <normalization-units>`. See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Write and Atomic (64B):
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically
+      update 64B of data in any memory location, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  HBM Write and Atomic:
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically
+      update 32B or 64B of data in the accelerator's local HBM, per :ref:`normalization
+      unit <normalization-units>`. See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Remote Write and Atomic:
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically
+      update 32B or 64B of data in any memory location other than the accelerator's
+      local HBM, per :ref:`normalization unit <normalization-units>`. See :ref:`l2-request-flow`
+      for more detail.
+    unit: Requests per normalization unit
+  Atomic:
+    rst: The total number of L2 requests to Infinity Fabric to atomically update 32B
+      or 64B of data in any memory location, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail. Note that on current CDNA accelerators,
+      such as the :ref:`MI2XX <mixxx-note>`, requests are only considered *atomic*
+      by Infinity Fabric if they are targeted at non-write-cacheable memory, such
+      as :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached memory
+      <memory-type>` allocations on the MI2XX.
+    unit: Requests per normalization unit
+Aggregate Stats (All channels):
+  L2 Cache Hit Rate:
+    rst: The percent of requests to the L2 from all clients that hit in the cache.
+      As noted in the :ref:`Speed-of-Light <l2-sol>` section, this includes hit-on-miss
+      requests.
+    unit: Percent
+L2 Cache Hit Rate (pct):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2 Requests (per normUnit):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Requests (per normUnit):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Read Latency (Cycles):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Write and Atomic Latency (Cycles):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Atomic Latency (Cycles):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Read Stall (Cycles per normUnit):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Write and Atomic Stall (Cycles per normUnit):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric (128B read requests per normUnit):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
diff --git a/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx950_metrics_description.yaml b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx950_metrics_description.yaml
new file mode 100644
index 0000000000..d0a1898da5
--- /dev/null
+++ b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx950_metrics_description.yaml
@@ -0,0 +1,2309 @@
+System Speed-of-Light:
+  VALU FLOPs:
+    rst: |-
+      The total floating-point operations executed per second on the :ref:`VALU
+      <desc-valu>`. This is also presented as a percent of the peak theoretical
+      FLOPs achievable on the specific accelerator. Note: this does not include
+      any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GFLOPs
+  VALU IOPs:
+    rst: |-
+      The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
+      This is also presented as a percent of the peak theoretical IOPs achievable
+      on the specific accelerator. Note: this does not include any integer operations
+      from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GIOPs
+  MFMA FLOPs (F8):
+    rst: |-
+      The total number of 8-bit floating point :ref:`MFMA <desc-mfma>`
+      operations executed per second. Note: this does not include any 8-bit
+      floating point operations from :ref:`VALU <desc-valu>` instructions. This
+      is also presented as a percent of the peak theoretical F8 MFMA operations
+      achievable on the specific accelerator. It is supported on AMD Instinct MI300
+      series and later only.
+    unit: GFLOPs
+  MFMA FLOPs (BF16):
+    rst: |-
+      The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
+      operations executed per second. Note: this does not include any 16-bit brain
+      floating point operations from :ref:`VALU <desc-valu>` instructions. This
+      is also presented as a percent of the peak theoretical BF16 MFMA operations
+      achievable on the specific accelerator.
+    unit: GFLOPs
+  MFMA FLOPs (F16):
+    rst: |-
+      The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 16-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F16 MFMA operations achievable on the
+      specific accelerator.
+    unit: GFLOPs
+  MFMA FLOPs (F32):
+    rst: |-
+      The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 32-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F32 MFMA operations achievable on the
+      specific accelerator.
+    unit: GFLOPs
+  MFMA FLOPs (F64):
+    rst: |-
+      The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 64-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F64 MFMA operations achievable on the
+      specific accelerator.
+    unit: GFLOPs
+  MFMA FLOPs (F6F4):
+    rst: ''
+    unit: Unknown
+  MFMA IOPs (Int8):
+    rst: |-
+      The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
+      per second. Note: this does not include any 8-bit integer operations from
+      :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
+      of the peak theoretical INT8 MFMA operations achievable on the specific accelerator.
+    unit: GIOPs
+  Active CUs:
+    rst: Total number of active compute units (CUs) on the accelerator during the
+      kernel execution.
+    unit: Number
+  SALU Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`SALU <desc-salu>`
+      was busy executing instructions. Computed as the ratio of the total number of
+      cycles spent by the :ref:`scheduler <desc-scheduler>` issuing SALU / :ref:`SMEM
+      <desc-smem>` instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VALU Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`VALU <desc-valu>`
+      was busy executing instructions. Does not include :ref:`VMEM <desc-vmem>` operations.
+      Computed as the ratio of the total number of cycles spent by the :ref:`scheduler
+      <desc-scheduler>` issuing VALU instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  MFMA Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`MFMA <desc-mfma>`
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles the :ref:`MFMA <desc-mfma>` unit was busy over the :ref:`total
+      CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VMEM Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`VMEM <desc-vmem>`
+      unit was busy executing instructions, including both global/generic and spill/scratch
+      operations (see the :ref:`VMEM instruction count metrics <ta-instruction-counts>`
+      for more detail). Does not include :ref:`VALU <desc-valu>` operations. Computed
+      as the ratio of the total number of cycles spent by the :ref:`scheduler <desc-scheduler>`
+      issuing VMEM instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  Branch Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`branch <desc-branch>`
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles spent by the :ref:`scheduler <desc-scheduler>` issuing branch instructions
+      over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VALU Active Threads:
+    rst: Indicates the average level of :ref:`divergence <desc-divergence>` within
+      a wavefront over the lifetime of the kernel. The number of work-items that were
+      active in a wavefront during execution of each :ref:`VALU <desc-valu>` instruction,
+      time-averaged over all VALU instructions run on all wavefronts in the kernel.
+    unit: Work-items
+  IPC:
+    rst: The ratio of the total number of instructions executed on the :doc:`CU <compute-unit>`
+      over the :ref:`total active CU cycles <total-active-cu-cycles>`.
+    unit: Instructions per-cycle
+  Wavefront Occupancy:
+    rst: |-
+      The time-averaged number of wavefronts resident on the accelerator over
+      the lifetime of the kernel. Note: this metric may be inaccurate for short-running
+      kernels (less than 1ms). This is also presented as a percent of the peak theoretical
+      occupancy achievable on the specific accelerator.
+    unit: Wavefronts
+  Theoretical LDS Bandwidth:
+    rst: Indicates the maximum amount of bytes that could have been loaded from, stored
+      to, or atomically updated in the LDS per unit time (see :ref:`LDS Bandwidth
+      <lds-bandwidth>` example for more detail). This is also presented as a percent
+      of the peak theoretical bandwidth achievable on the specific accelerator.
+    unit: GB/s
+  LDS Bank Conflicts/Access:
+    rst: The ratio of the number of cycles spent in the :doc:`LDS scheduler <local-data-share>`
+      due to bank conflicts (as determined by the conflict resolution hardware) to
+      the base number of cycles that would be spent in the LDS scheduler in a completely
+      uncontended case. This is also presented in normalized form (i.e., the Bank
+      Conflict Rate).
+    unit: Conflicts/Access
+  vL1D Cache Hit Rate:
+    rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache
+      over the total number of cache line requests to the :ref:`vL1D cache RAM <desc-tc>`.
+    unit: Percent
+  vL1D Cache BW:
+    rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
+      <desc-vmem>` instructions per unit time. The number of bytes is calculated as
+      the number of cache lines requested multiplied by the cache line size. This
+      value does not consider partial requests, so e.g., if only a single value is
+      requested in a cache line, the data movement will still be counted as a full
+      cache line. This is also presented as a percent of the peak theoretical bandwidth
+      achievable on the specific accelerator.
+    unit: GB/s
+  L2 Cache Hit Rate:
+    rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
+      over the total number of incoming cache line requests to the L2 cache.
+    unit: Percent
+  L2 Cache BW:
+    rst: The number of bytes looked up in the L2 cache per unit time. The number of
+      bytes is calculated as the number of cache lines requested multiplied by the
+      cache line size. This value does not consider partial requests, so e.g., if
+      only a single value is requested in a cache line, the data movement will still
+      be counted as a full cache line. This is also presented as a percent of the
+      peak theoretical bandwidth achievable on the specific accelerator.
+    unit: GB/s
+  L2-Fabric Read BW:
+    rst: |-
+      The number of bytes read by the L2 over the :ref:`Infinity Fabric™
+      interface <l2-fabric>` per unit time. This is also presented as a percent
+      of the peak theoretical bandwidth achievable on the specific accelerator.
+    unit: GB/s
+  L2-Fabric Write BW:
+    rst: The number of bytes sent by the L2 over the :ref:`Infinity Fabric interface
+      <l2-fabric>` by write and atomic operations per unit time. This is also presented
+      as a percent of the peak theoretical bandwidth achievable on the specific accelerator.
+    unit: GB/s
+  L2-Fabric Read Latency:
+    rst: The time-averaged number of cycles read requests spent in Infinity Fabric
+      before data was returned to the L2.
+    unit: Cycles
+  L2-Fabric Write Latency:
+    rst: The time-averaged number of cycles write requests spent in Infinity Fabric
+      before a completion acknowledgement was returned to the L2.
+    unit: Cycles
+  sL1D Cache Hit Rate:
+    rst: The percent of sL1D requests that hit on a previously loaded line in the cache.
+      Calculated as the ratio of the number of sL1D requests that hit over the number
+      of all sL1D requests.
+    unit: Percent
+  sL1D Cache BW:
+    rst: The number of bytes looked up in the sL1D cache per unit time. This is also
+      presented as a percent of the peak theoretical bandwidth achievable on the specific
+      accelerator.
+    unit: GB/s
+  L1I Hit Rate:
+    rst: The percent of L1I requests that hit on a previously loaded line in the cache.
+      Calculated as the ratio of the number of L1I requests that hit over the number
+      of all L1I requests.
+    unit: Percent
+  L1I BW:
+    rst: The number of bytes looked up in the L1I cache per unit time. This is also
+      presented as a percent of the peak theoretical bandwidth achievable on the specific
+      accelerator.
+    unit: GB/s
+  L1I Fetch Latency:
+    rst: The average number of cycles spent to fetch instructions to a :doc:`CU <compute-unit>`.
+    unit: Cycles
+Memory Chart:
+  Wavefront Occupancy:
+    rst: Wavefronts per active CU.
+    unit: Wavefronts
+  Wave Life:
+    rst: Average number of cycles executing a wave.
+    unit: Cycles per wave
+  SALU:
+    rst: Total Number of SALU (Scalar ALU) instructions issued per normalization unit.
+    unit: Instructions per normalization unit
+  SMEM:
+    rst: Total number of SMEM (Scalar Memory Read) instructions issued per normalization
+      unit.
+    unit: Instructions per normalization unit
+  VALU:
+    rst: The number of VALU (Vector ALU) instructions issued per normalization unit.
+    unit: Instructions per normalization unit
+  MFMA:
+    rst: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued per
+      normalization unit.
+    unit: Instructions per normalization unit
+  VMEM:
+    rst: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch
+      memory) per normalization unit.
+    unit: Instructions per normalization unit
+  LDS:
+    rst: The total number of LDS instructions (including, but not limited to, read/write/atomics
+      and HIP's __shfl instructions) executed per normalization unit.
+    unit: Instructions per normalization unit
+  GWS:
+    rst: Total number of GDS (global data sync) instructions issued per normalization
+      unit.
+    unit: Instructions per normalization unit
+  BR:
+    rst: Total number of BRANCH instructions issued per normalization unit.
+    unit: Instructions per normalization unit
+  Active CUs:
+    rst: Total number of active compute units (CUs) on the accelerator during the
+      kernel execution.
+    unit: CUs
+  Num CUs:
+    rst: Total number of compute units (CUs) on the accelerator.
+    unit: CUs
+  VGPR:
+    rst: |-
+      The number of architected vector general-purpose registers allocated for the
+      kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the
+      number of VGPRs requested by the compiler due to allocation granularity.
+    unit: VGPRs
+  SGPR:
+    rst: |-
+      The number of scalar general-purpose registers allocated for the kernel, see
+      :ref:`SALU <desc-salu>`. Note: this may not exactly match the number of
+      SGPRs requested by the compiler due to allocation granularity.
+    unit: SGPRs
+  LDS Allocation:
+    rst: |-
+      The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory)
+      allocated for this kernel. Note: This may also be larger than what was requested
+      at compile time due to both allocation granularity and dynamic per-dispatch
+      LDS allocations.
+    unit: Bytes per workgroup
+  Scratch Allocation:
+    rst: The number of bytes of :ref:`scratch memory <memory-spaces>` requested per
+      work-item for this kernel. Scratch memory is used for stack memory on the accelerator,
+      as well as for register spills and restores.
+    unit: Bytes per work-item
+  Wavefronts:
+    rst: The total number of wavefronts, summed over all workgroups, forming this
+      kernel launch.
+    unit: Wavefronts
+  Workgroups:
+    rst: The total number of workgroups forming this kernel launch.
+    unit: Workgroups
+  LDS Req:
+    rst: The total number of LDS instructions (including, but not limited to, read/write/atomics
+      and HIP's ``__shfl`` instructions) executed per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  LDS Util:
+    rst: Indicates what percent of the kernel's duration the :ref:`LDS <desc-lds>`
+      was actively executing instructions (including, but not limited to, load, store,
+      atomic and HIP's ``__shfl`` operations). Calculated as the ratio of the total
+      number of cycles LDS was active over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  LDS Latency:
+    rst: The average number of round-trip cycles (i.e., from issue to data-return
+      / acknowledgment) required for an LDS instruction to complete.
+    unit: Cycles
+  VL1 Rd:
+    rst: The total number of incoming read requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  VL1 Wr:
+    rst: The total number of incoming write requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  VL1 Atomic:
+    rst: The total number of incoming atomic requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  VL1 Hit:
+    rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache
+      over the total number of cache line requests to the :ref:`vL1D Cache RAM <desc-tc>`.
+    unit: Percent
+  VL1 Lat:
+    rst: Calculated as the average number of cycles that a vL1D cache line request
+      spent in the vL1D cache pipeline.
+    unit: Cycles
+  VL1 Coalesce:
+    rst: Indicates how well memory instructions were coalesced by the :ref:`address
+      processing unit <desc-ta>`, ranging from uncoalesced (25%) to fully coalesced
+      (100%). Calculated as the average number of :ref:`thread-requests <thread-requests>`
+      generated per instruction divided by the ideal number of thread-requests per
+      instruction.
+    unit: Percent
+  VL1 Stall:
+    rst: The ratio of the number of cycles where the vL1D is stalled waiting to issue
+      a request for data to the :doc:`L2 cache <l2-cache>` divided by the number of
+      cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  VL1_L2 Rd:
+    rst: The number of read requests for a vL1D cache line that were not satisfied
+      by the vL1D and must be retrieved from the :doc:`L2 Cache <l2-cache>`
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  VL1_L2 Wr:
+    rst: The number of write requests to a vL1D cache line that were sent through
+      the vL1D to the :doc:`L2 cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  VL1_L2 Atomic:
+    rst: The number of atomic requests that are sent through the vL1D to the :doc:`L2
+      cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`. This
+      includes requests for atomics with, and without return.
+    unit: Requests per normalization unit
+  sL1D Rd:
+    rst: The total number of requests, of any size or type, made to the sL1D per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Requests per normalization unit
+  sL1D Hit:
+    rst: The total number of sL1D requests that hit on a previously loaded cache line,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  sL1D Lat:
+    rst: ''
+    unit: Unknown
+  sL1D_L2 Rd:
+    rst: The total number of read requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  sL1D_L2 Wr:
+    rst: The total number of write requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`. Typically unused on current
+      CDNA accelerators.
+    unit: Requests per normalization unit
+  sL1D_L2 Atomic:
+    rst: The total number of atomic requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`. Typically unused on current
+      CDNA accelerators.
+    unit: Requests per normalization unit
+  IL1 Fetch:
+    rst: The total number of requests made to the L1I per :ref:`normalization-unit
+      <normalization-units>`.
+    unit: Requests per normalization unit
+  IL1 Hit:
+    rst: The total number of L1I requests that hit on a previously loaded cache line,
+      per :ref:`normalization-unit <normalization-units>`.
+    unit: Percent
+  IL1 Lat:
+    rst: The average number of cycles spent to fetch instructions to a :doc:`CU <compute-unit>`.
+    unit: Cycles
+  IL1_L2 Rd:
+    rst: The total number of requests across the L1I - L2 interface per normalization-unit.
+    unit: Requests per normalization unit
+  L2 Rd:
+    rst: The total number of read requests to the L2 from all clients.
+    unit: Requests per normalization unit
+  L2 Wr:
+    rst: The total number of write requests to the L2 from all clients.
+    unit: Requests per normalization unit
+  L2 Atomic:
+    rst: The total number of atomic requests (with and without return) to the L2 from
+      all clients.
+    unit: Requests per normalization unit
+  L2 Hit:
+    rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
+      over the total number of incoming cache line requests to the L2 cache.
+    unit: Percent
+  L2 Rd Lat:
+    rst: Calculated as the average number of cycles that the vL1D cache took to issue
+      and receive read requests from the :doc:`L2 Cache <l2-cache>`. This number also
+      includes requests for atomics with return values.
+    unit: Cycles
+  L2 Wr Lat:
+    rst: Calculated as the average number of cycles that the vL1D cache took to issue
+      and receive acknowledgement of a write request to the :doc:`L2 Cache <l2-cache>`.
+      This number also includes requests for atomics without return values.
+    unit: Cycles
+  Fabric_L2 Rd:
+    rst: Number of L2 cache - Infinity Fabric read requests (either 32-byte or 64-byte)
+      summed over TCC instances per normalization unit.
+    unit: Requests per normalization unit
+  Fabric_L2 Wr:
+    rst: Number of L2 cache - Infinity Fabric write requests (either 32-byte or 64-byte)
+      summed over TCC instances per normalization unit.
+    unit: Requests per normalization unit
+  Fabric_L2 Atomic:
+    rst: Number of L2 cache - Infinity Fabric write requests (either 32-byte or 64-byte)
+      that are actually atomic requests summed over TCC instances per normalization
+      unit.
+    unit: Requests per normalization unit
+  Fabric Rd Lat:
+    rst: The time-averaged number of cycles read requests spent in Infinity Fabric
+      before data was returned to the L2.
+    unit: Cycles
+  Fabric Wr Lat:
+    rst: The time-averaged number of cycles write requests spent in Infinity Fabric
+      before a completion acknowledgement was returned to the L2.
+    unit: Cycles
+  Fabric Atomic Lat:
+    rst: The time-averaged number of cycles atomic requests spent in Infinity Fabric
+      before a completion acknowledgement (atomic without return value) or data (atomic
+      with return value) was returned to the L2.
+    unit: Cycles
+  HBM Rd:
+    rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of
+      data from the accelerator's local HBM, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  HBM Wr:
+    rst: The total number of L2 requests to Infinity Fabric to write 32B or 64B of
+      data to the accelerator's local HBM, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+Roofline Performance Rates:
+  VALU FLOPs (F16):
+    rst: |-
+      The total 16-bit floating-point operations executed per second on the :ref:`VALU
+      <desc-valu>`. This is presented with the value of the peak empirical F16 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F16 operations
+      from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GFLOPs
+  VALU FLOPs (F32):
+    rst: |-
+      The total 32-bit floating-point operations executed per second on the :ref:`VALU
+      <desc-valu>`. This is presented with the value of the peak empirical F32 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F32 operations
+      from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GFLOPs
+  VALU FLOPs (F64):
+    rst: |-
+      The total 64-bit floating-point operations executed per second on the :ref:`VALU
+      <desc-valu>`. This is presented with the value of the peak empirical F64 FLOPs achievable
+      on the specific accelerator. Note: this does not include any F64 operations
+      from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GFLOPs
+  MFMA FLOPs (F64):
+    rst: |-
+      The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 64-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
+      measured F64 MFMA operations achievable on the specific accelerator is
+      displayed alongside for comparison.
+    unit: GFLOPs
+  MFMA FLOPs (F32):
+    rst: |-
+      The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 32-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
+      measured F32 MFMA operations achievable on the specific accelerator is
+      displayed alongside for comparison.
+    unit: GFLOPs
+  MFMA FLOPs (F16):
+    rst: |-
+      The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 16-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
+      measured F16 MFMA operations achievable on the specific accelerator is
+      displayed alongside for comparison.
+    unit: GFLOPs
+  MFMA FLOPs (BF16):
+    rst: |-
+      The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
+      operations executed per second. Note: this does not include any 16-bit brain
+      floating point operations from :ref:`VALU <desc-valu>` instructions. The
+      peak empirically measured BF16 MFMA operations achievable on the specific
+      accelerator is displayed alongside for comparison.
+    unit: GFLOPs
+  MFMA FLOPs (F8):
+    rst: |-
+      The total number of 8-bit floating point :ref:`MFMA <desc-mfma>`
+      operations executed per second. Note: this does not include any 8-bit
+      floating point operations from :ref:`VALU <desc-valu>` instructions. The
+      peak empirically measured F8 MFMA operations achievable on the specific
+      accelerator is displayed alongside for comparison. It is supported on AMD
+      Instinct MI300 series and later only.
+    unit: GFLOPs
+  MFMA FLOPs (F6F4):
+    rst: |-
+      The total number of 4-bit and 6-bit floating point :ref:`MFMA <desc-mfma>`
+      operations executed per second. Note: this does not include any floating point
+      operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
+      measured F6F4 MFMA operations achievable on the specific accelerator is
+      displayed alongside for comparison. It is supported on AMD Instinct MI350
+      series (gfx950) and later only.
+    unit: GFLOPs
+  MFMA IOPs (Int8):
+    rst: |-
+      The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
+      per second. Note: this does not include any 8-bit integer operations from
+      :ref:`VALU <desc-valu>` instructions. The peak empirically measured INT8 MFMA
+      operations achievable on the specific accelerator is displayed alongside
+      for comparison.
+    unit: GIOPs
+  HBM Bandwidth:
+    rst: |-
+      The total number of bytes read from and written to High-Bandwidth
+      Memory (HBM) per second. The peak empirically measured bandwidth achievable
+      on the specific accelerator is displayed alongside for comparison.
+    unit: GB/s
+  L2 Cache Bandwidth:
+    rst: The number of bytes looked up in the L2 cache per unit time. The number of
+      bytes is calculated as the number of cache lines requested multiplied by the
+      cache line size. This value does not consider partial requests, so e.g., if
+      only a single value is requested in a cache line, the data movement will still
+      be counted as a full cache line. The peak empirically measured bandwidth achievable
+      on the specific accelerator is displayed alongside for comparison.
+    unit: GB/s
+  L1 Cache Bandwidth:
+    rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
+      <desc-vmem>` instructions per unit time. The number of bytes is calculated as
+      the number of cache lines requested multiplied by the cache line size. This
+      value does not consider partial requests, so e.g., if only a single value is
+      requested in a cache line, the data movement will still be counted as a full
+      cache line. The peak empirically measured bandwidth achievable on the specific
+      accelerator is displayed alongside for comparison.
+    unit: GB/s
+  LDS Bandwidth:
+    rst: Indicates the maximum amount of bytes that could have been loaded from, stored
+      to, or atomically updated in the LDS per unit time (see :ref:`LDS Bandwidth
+      <lds-bandwidth>` example for more detail). The peak empirically measured LDS
+      bandwidth achievable on the specific accelerator is displayed alongside for
+      comparison.
+    unit: GB/s
+Roofline Plot Points:
+  AI HBM:
+    rst: |-
+      The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
+      It is the ratio of total floating-point operations (FLOPs) to total bytes
+      transferred between HBM and the L2 cache. This value is used as the x-coordinate
+      for the HBM roofline.
+    unit: FLOPs/Byte
+  AI L2:
+    rst: |-
+      The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
+      of total floating-point operations (FLOPs) to total bytes transferred between
+      the L2 cache and the L1 cache. This value is used as the x-coordinate for
+      the L2 roofline.
+    unit: FLOPs/Byte
+  AI L1:
+    rst: |-
+      The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
+      of total floating-point operations (FLOPs) to total bytes transferred between
+      the L1 cache and the processing units. This value is used as the x-coordinate
+      for the L1 roofline.
+    unit: FLOPs/Byte
+  Performance (GFLOPs):
+    rst: |-
+      The overall achieved performance, measured in GigaFLOPs
+      per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
+      operations divided by the total execution time. This value is used as the y-coordinate
+      for the kernel's point on the Roofline plot.
+    unit: GFLOP/s
+Command processor fetcher (CPF):
+  CPF Utilization:
+    rst: Percent of total cycles where the CPF was busy actively doing any work. The
+      ratio of CPF busy cycles over total cycles counted by the CPF.
+    unit: Percent
+  CPF Stall:
+    rst: Percent of CPF busy cycles where the CPF was stalled for any reason.
+    unit: Percent
+  CPF-L2 Utilization:
+    rst: Percent of total cycles counted by the CPF-:doc:`L2 <l2-cache>` interface
+      where the CPF-L2 interface was active doing any work. The ratio of CPF-L2 busy
+      cycles over total cycles counted by the CPF-L2.
+    unit: Percent
+  CPF-L2 Stall:
+    rst: Percent of CPF-:doc:`L2 <l2-cache>` busy cycles where the CPF-L2 interface
+      was stalled for any reason.
+    unit: Percent
+  CPF-UTCL1 Stall:
+    rst: Percent of CPF busy cycles where the CPF was stalled by address translation.
+    unit: Percent
+Command processor packet processor (CPC):
+  CPC SYNC FIFO Full Rate:
+    rst: ''
+    unit: Unknown
+  CPC CANE Stall Rate:
+    rst: ''
+    unit: Unknown
+  CPC ADC Utilization:
+    rst: ''
+    unit: Unknown
+  CPC Utilization:
+    rst: Percent of total cycles where the CPC was busy actively doing any work. The
+      ratio of CPC busy cycles over total cycles counted by the CPC.
+    unit: Percent
+  CPC Stall Rate:
+    rst: Percent of CPC busy cycles where the CPC was stalled for any reason.
+    unit: Percent
+  CPC Packet Decoding Utilization:
+    rst: Percent of CPC busy cycles spent decoding commands for processing.
+    unit: Percent
+  CPC-Workgroup Manager Utilization:
+    rst: Percent of CPC busy cycles spent dispatching workgroups to the :ref:`workgroup
+      manager <desc-spi>`.
+    unit: Percent
+  CPC-L2 Utilization:
+    rst: Percent of total cycles counted by the CPC-:doc:`L2 <l2-cache>` interface
+      where the CPC-L2 interface was active doing any work.
+    unit: Percent
+  CPC-UTCL1 Stall:
+    rst: Percent of CPC busy cycles where the CPC was stalled by address translation.
+    unit: Percent
+  CPC-UTCL2 Utilization:
+    rst: Percent of total cycles counted by the CPC's :doc:`L2 <l2-cache>` address
+      translation interface where the CPC was busy doing address translation work.
+    unit: Percent
+Workgroup manager utilizations:
+  Schedule-Pipe Wave Occupancy:
+    rst: ''
+    unit: Unknown
+  Accelerator Utilization:
+    rst: The percent of cycles in the kernel where the accelerator was actively doing
+      any work.
+    unit: Percent
+  Scheduler-Pipe Utilization:
+    rst: |-
+      The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
+      in the kernel where the scheduler-pipes were actively doing any work. Note: this
+      value is expected to range between 0% and 25%. See :ref:`desc-spi`.
+    unit: Percent
+  Scheduler-Pipe Wave Utilization:
+    rst: ''
+    unit: Unknown
+  Workgroup Manager Utilization:
+    rst: The percent of cycles in the kernel where the workgroup manager was actively
+      doing any work.
+    unit: Percent
+  Shader Engine Utilization:
+    rst: The percent of :ref:`total shader engine cycles <total-se-cycles>` in the
+      kernel where any CU in a shader-engine was actively doing any work, normalized
+      over all shader-engines. Low values (e.g., << 100%) indicate that the accelerator
+      was not fully saturated by the kernel, or a potential load-imbalance issue.
+    unit: Percent
+  SIMD Utilization:
+    rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel
+      where any :ref:`SIMD <desc-valu>` on a CU was actively doing any work, summed
+      over all CUs. Low values (less than 100%) indicate that the accelerator was
+      not fully saturated by the kernel, or a potential load-imbalance issue.
+    unit: Percent
+  Dispatched Workgroups:
+    rst: The total number of workgroups forming this kernel launch.
+    unit: Workgroups
+  Dispatched Wavefronts:
+    rst: The total number of wavefronts, summed over all workgroups, forming this
+      kernel launch.
+    unit: Wavefronts
+  VGPR Writes:
+    rst: The average number of cycles spent initializing :ref:`VGPRs <desc-valu>`
+      at wave creation.
+    unit: Cycles/wave
+  SGPR Writes:
+    rst: The average number of cycles spent initializing :ref:`SGPRs <desc-salu>`
+      at wave creation.
+    unit: Cycles/wave
+Workgroup Manager - Resource Allocation:
+  Not-scheduled Rate (Workgroup Manager):
+    rst: |-
+      The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
+      in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
+      due to a bottleneck within the workgroup manager rather than a lack of a
+      CU or :ref:`SIMD <desc-valu>` with sufficient resources. Note: this value
+      is expected to range between 0-25%. See note in :ref:`workgroup manager <desc-spi>`
+      description.
+    unit: Percent
+  Not-scheduled Rate (Scheduler-Pipe):
+    rst: |-
+      The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
+      in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
+      due to a bottleneck within the scheduler-pipes rather than a lack of a CU
+      or :ref:`SIMD <desc-valu>` with sufficient resources. Note: this value is
+      expected to range between 0-25%, see note in :ref:`workgroup manager <desc-spi>`
+      description.
+    unit: Percent
+  Scheduler-Pipe FIFO Full Rate:
+    rst: ''
+    unit: Unknown
+  Scheduler-Pipe Stall Rate:
+    rst: |-
+      The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
+      in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
+      due to occupancy limitations (like a lack of a CU or :ref:`SIMD <desc-valu>`
+      with sufficient resources). Note: this value is expected to range between
+      0-25%, see note in :ref:`workgroup manager <desc-spi>` description.
+    unit: Percent
+  Scratch Stall Rate:
+    rst: The percent of :ref:`total shader-engine cycles <total-se-cycles>` in the
+      kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
+      due to lack of :ref:`private (a.k.a., scratch) memory <memory-type>` slots.
+      While this can reach up to 100%, note that the actual occupancy limitations
+      on a kernel using private memory are typically quite small (for example, less
+      than 1% of the total number of waves that can be scheduled to an accelerator).
+    unit: Percent
+  Insufficient SIMD Waveslots:
+    rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel
+      where a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to
+      lack of available :ref:`waveslots <desc-valu>`.
+    unit: Percent
+  Insufficient SIMD VGPRs:
+    rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel
+      where a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to
+      lack of available :ref:`VGPRs <desc-valu>`.
+    unit: Percent
+  Insufficient SIMD SGPRs:
+    rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel
+      where a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to
+      lack of available :ref:`SGPRs <desc-salu>`.
+    unit: Percent
+  Insufficient CU LDS:
+    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
+      a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to lack
+      of available :doc:`LDS <local-data-share>`.
+    unit: Percent
+  Insufficient CU Barriers:
+    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
+      a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to lack
+      of available :ref:`barriers <desc-barrier>`.
+    unit: Percent
+  Reached CU Workgroup Limit:
+    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
+      a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to limits
+      within the workgroup manager. This is expected to always be zero on CDNA2
+      or newer accelerators (and small for previous accelerators).
+    unit: Percent
+  Reached CU Wavefront Limit:
+    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
+      a wavefront could not be scheduled to a :doc:`CU <compute-unit>` due to limits
+      within the workgroup manager. This is expected to always be zero on CDNA2
+      or newer accelerators (and small for previous accelerators).
+    unit: Percent
+Wavefront Launch Stats:
+  Grid Size:
+    rst: The total number of work-items (or, threads) launched as a part of the kernel
+      dispatch. In HIP, this is equivalent to the total grid size multiplied by the
+      total workgroup (or, block) size.
+    unit: Work-Items
+  Workgroup Size:
+    rst: The total number of work-items (or, threads) in each workgroup (or, block)
+      launched as part of the kernel dispatch. In HIP, this is equivalent to the total
+      block size.
+    unit: Work-Items
+  Total Wavefronts:
+    rst: |-
+      The total number of wavefronts launched as part of the kernel dispatch.
+      On AMD Instinct™ CDNA™ accelerators and GCN™ GPUs, the wavefront
+      size is always 64 work-items. Thus, the total number of wavefronts should
+      be equivalent to the ceiling of grid size divided by 64.
+    unit: Wavefronts
+  Saved Wavefronts:
+    rst: The total number of wavefronts saved at a context-save. See `cwsr_enable
+      <https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr>`_.
+    unit: Wavefronts
+  Restored Wavefronts:
+    rst: The total number of wavefronts restored from a context-save. See `cwsr_enable
+      <https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr>`_.
+    unit: Wavefronts
+  VGPRs:
+    rst: |-
+      The number of architected vector general-purpose registers allocated for the
+      kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the
+      number of VGPRs requested by the compiler due to allocation granularity.
+    unit: VGPRs
+  AGPRs:
+    rst: |-
+      The number of accumulation vector general-purpose registers allocated
+      for the kernel, see :ref:`AGPRs <desc-agprs>`. Note: this may not exactly match
+      the number of AGPRs requested by the compiler due to allocation granularity.
+    unit: AGPRs
+  SGPRs:
+    rst: |-
+      The number of scalar general-purpose registers allocated for the kernel, see
+      :ref:`SALU <desc-salu>`. Note: this may not exactly match the number of
+      SGPRs requested by the compiler due to allocation granularity.
+    unit: SGPRs
+  LDS Allocation:
+    rst: |-
+      The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory)
+      allocated for this kernel. Note: This may also be larger than what was requested
+      at compile time due to both allocation granularity and dynamic per-dispatch
+      LDS allocations.
+    unit: Bytes per workgroup
+  Scratch Allocation:
+    rst: The number of bytes of :ref:`scratch memory <memory-spaces>` requested per
+      work-item for this kernel. Scratch memory is used for stack memory on the accelerator,
+      as well as for register spills and restores.
+    unit: Bytes per work-item
+Wavefront Runtime Stats:
+  Kernel Time:
+    rst: The total duration of the executed kernel.
+    unit: Nanoseconds
+  Kernel Time (Cycles):
+    rst: The total duration of the executed kernel in cycles.
+    unit: Cycles
+  Instructions per wavefront:
+    rst: The average number of instructions (of all types) executed per wavefront.
+      This is averaged over all wavefronts in a kernel dispatch.
+    unit: Instructions per wavefront
+  Wave Cycles:
+    rst: |-
+      The number of cycles a wavefront in the kernel dispatch spent resident
+      on a compute unit per :ref:`normalization unit <normalization-units>`. This is
+      averaged over all wavefronts in a kernel dispatch. Note: this should not
+      be directly compared to the kernel cycles above.
+    unit: Cycles per normalization unit
+  Dependency Wait Cycles:
+    rst: The number of cycles a wavefront in the kernel dispatch stalled waiting on
+      memory of any kind (e.g., instruction fetch, vector or scalar memory, etc.)
+      per :ref:`normalization unit <normalization-units>`. This counter is incremented
+      at every cycle by *all* wavefronts on a CU stalled at a memory operation. As
+      such, it is most useful to get a sense of how waves were spending their time,
+      rather than identification of a precise limiter because another wave could be
+      actively executing while a wave is stalled. The sum of this metric, Issue Wait
+      Cycles and Active Cycles should be equal to the total Wave Cycles metric.
+    unit: Cycles per normalization unit
+  Issue Wait Cycles:
+    rst: The number of cycles a wavefront in the kernel dispatch was unable to issue
+      an instruction for any reason (e.g., execution pipe back-pressure, arbitration
+      loss, etc.) per :ref:`normalization unit <normalization-units>`. This counter
+      is incremented at every cycle by *all* wavefronts on a CU unable to issue an
+      instruction. As such, it is most useful to get a sense of how waves were spending
+      their time, rather than identification of a precise limiter because another
+      wave could be actively executing while a wave is issue stalled. The sum of this
+      metric, Dependency Wait Cycles and Active Cycles should be equal to the total
+      Wave Cycles metric.
+    unit: Cycles per normalization unit
+  Active Cycles:
+    rst: The average number of cycles a wavefront in the kernel dispatch was actively
+      executing instructions per :ref:`normalization unit <normalization-units>`.
+      This measurement is made on a per-wavefront basis, and may include cycles that
+      another wavefront spent actively executing (on another execution unit, for example)
+      or was stalled. As such, it is most useful to get a sense of how waves were
+      spending their time, rather than identification of a precise limiter. The sum
+      of this metric, Issue Wait Cycles and Dependency Wait Cycles should be equal to
+      the total Wave Cycles metric.
+    unit: Cycles per normalization unit
+  Wavefront Occupancy:
+    rst: |-
+      The time-averaged number of wavefronts resident on the accelerator over the
+      lifetime of the kernel. Note: this metric may be inaccurate for short-running
+      kernels (less than 1ms).
+    unit: Wavefronts
+Overall Instruction Mix:
+  VALU:
+    rst: The total number of vector arithmetic logic unit (VALU) operations issued.
+      These are the workhorses of the :doc:`compute unit <compute-unit>`, and are
+      used to execute a wide range of instruction types including floating point operations,
+      non-uniform address calculations, transcendental operations, integer operations,
+      shifts, conditional evaluation, etc.
+    unit: Instructions
+  VMEM:
+    rst: The total number of vector memory operations issued. These include most loads,
+      stores and atomic operations and all accesses to :ref:`generic, global, private
+      and texture <memory-spaces>` memory.
+    unit: Instructions
+  LDS:
+    rst: The total number of LDS (also known as shared memory) operations issued.
+      These include loads, stores, atomics, and HIP's ``__shfl`` operations.
+    unit: Instructions
+  MFMA:
+    rst: The total number of matrix fused multiply-add instructions issued.
+    unit: Instructions
+  SALU:
+    rst: The total number of scalar arithmetic logic unit (SALU) operations issued.
+      Typically these are used for address calculations, literal constants, and other
+      operations that are provably uniform across a wavefront. Although scalar memory
+      (SMEM) operations are issued by the SALU, they are counted separately in this
+      section.
+    unit: Instructions
+  SMEM:
+    rst: The total number of scalar memory (SMEM) operations issued. These are typically
+      used for loading kernel arguments, base-pointers and loads from HIP's ``__constant__``
+      memory.
+    unit: Instructions
+  Branch:
+    rst: The total number of branch operations issued. These typically consist of
+      jump or branch operations and are used to implement control flow.
+    unit: Instructions
+VALU Arithmetic Instruction Mix:
+  INT32:
+    rst: The total number of instructions operating on 32-bit integer operands issued
+      to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  INT64:
+    rst: The total number of instructions operating on 64-bit integer operands issued
+      to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F16-ADD:
+    rst: The total number of addition instructions operating on 16-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F16-MUL:
+    rst: The total number of multiplication instructions operating on 16-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F16-FMA:
+    rst: The total number of fused multiply-add instructions operating on 16-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F16-Trans:
+    rst: The total number of transcendental instructions (such as ``sqrt``) operating
+      on 16-bit floating-point operands issued to the VALU per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F32-ADD:
+    rst: The total number of addition instructions operating on 32-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F32-MUL:
+    rst: The total number of multiplication instructions operating on 32-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F32-FMA:
+    rst: The total number of fused multiply-add instructions operating on 32-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F32-Trans:
+    rst: The total number of transcendental instructions (such as ``sqrt``) operating
+      on 32-bit floating-point operands issued to the VALU per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F64-ADD:
+    rst: The total number of addition instructions operating on 64-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F64-MUL:
+    rst: The total number of multiplication instructions operating on 64-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F64-FMA:
+    rst: The total number of fused multiply-add instructions operating on 64-bit floating-point
+      operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F64-Trans:
+    rst: The total number of transcendental instructions (such as ``sqrt``) operating
+      on 64-bit floating-point operands issued to the VALU per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Conversion:
+    rst: |-
+      The total number of type conversion instructions (such as converting data
+      to or from F32↔F64) issued to the VALU per :ref:`normalization unit
+      <normalization-units>`.
+    unit: Instructions per normalization unit
+VMEM Instruction Mix:
+  Global/Generic Instr:
+    rst: The total number of global & generic memory instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Read:
+    rst: The total number of global & generic memory read instructions executed on
+      all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Write:
+    rst: The total number of global & generic memory write instructions executed on
+      all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Atomic:
+    rst: The total number of global & generic memory atomic (with and without return)
+      instructions executed on all :doc:`compute units <compute-unit>` on the accelerator,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Instr:
+    rst: The total number of spill/stack memory instructions executed on all :doc:`compute
+      units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Coalesceable Instr:
+    rst: ''
+    unit: Unknown
+  Spill/Stack Read:
+    rst: The total number of spill/stack memory read instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Write:
+    rst: The total number of spill/stack memory write instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Atomic:
+    rst: The total number of spill/stack memory atomic (with and without return) instructions
+      executed on all :doc:`compute units <compute-unit>` on the accelerator, per
+      :ref:`normalization unit <normalization-units>`. Typically unused, as these memory
+      operations are generally used to implement thread-local storage.
+    unit: Instructions per normalization unit
+MFMA Arithmetic Instruction Mix:
+  MFMA-I8:
+    rst: The total number of 8-bit integer :ref:`MFMA <desc-mfma>` instructions issued
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  MFMA-F8:
+    rst: The total number of 8-bit floating point :ref:`MFMA <desc-mfma>` instructions
+      issued per :ref:`normalization unit <normalization-units>`. This is supported
+      in AMD Instinct MI300 series and later only.
+    unit: Instructions per normalization unit
+  MFMA-F16:
+    rst: The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` instructions
+      issued per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  MFMA-BF16:
+    rst: The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>` instructions
+      issued per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  MFMA-F32:
+    rst: The total number of 32-bit floating-point :ref:`MFMA <desc-mfma>` instructions
+      issued per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  MFMA-F64:
+    rst: The total number of 64-bit floating-point :ref:`MFMA <desc-mfma>` instructions
+      issued per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  MFMA-F6F4:
+    rst: ''
+    unit: Unknown
+Compute Speed-of-Light:
+  VALU FLOPs:
+    rst: |-
+      The total floating-point operations executed per second on the :ref:`VALU
+      <desc-valu>`. This is also presented as a percent of the peak theoretical
+      FLOPs achievable on the specific accelerator. Note: this does not include
+      any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GFLOPs
+  VALU IOPs:
+    rst: |-
+      The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
+      This is also presented as a percent of the peak theoretical IOPs achievable
+      on the specific accelerator. Note: this does not include any integer operations
+      from :ref:`MFMA <desc-mfma>` instructions.
+    unit: GIOPs
+  MFMA FLOPs (F8):
+    rst: ''
+    unit: Unknown
+  MFMA FLOPs (BF16):
+    rst: |-
+      The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 16-bit brain floating
+      point operations from :ref:`VALU <desc-valu>` instructions. This is also
+      presented as a percent of the peak theoretical BF16 MFMA operations achievable
+      on the specific accelerator.
+    unit: GFLOPs
+  MFMA FLOPs (F16):
+    rst: |-
+      The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 16-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F16 MFMA operations achievable on the
+      specific accelerator.
+    unit: GFLOPs
+  MFMA FLOPs (F32):
+    rst: |-
+      The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 32-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F32 MFMA operations achievable on the
+      specific accelerator.
+    unit: GFLOPs
+  MFMA FLOPs (F64):
+    rst: |-
+      The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 64-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F64 MFMA operations achievable on the
+      specific accelerator.
+    unit: GFLOPs
+  MFMA FLOPs (F6F4):
+    rst: ''
+    unit: Unknown
+  MFMA IOPs (INT8):
+    rst: |-
+      The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
+      per second. Note: this does not include any 8-bit integer operations from
+      :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
+      of the peak theoretical INT8 MFMA operations achievable on the specific accelerator.
+    unit: GIOPs
+Pipeline Statistics:
+  IPC:
+    rst: The ratio of the total number of instructions executed on the :doc:`CU <compute-unit>`
+      over the :ref:`total active CU cycles <total-active-cu-cycles>`.
+    unit: Instructions per cycle
+  IPC (Issued):
+    rst: The ratio of the total number of (non-:ref:`internal <ipc-internal-instructions>`)
+      instructions issued over the number of cycles where the :ref:`scheduler <desc-scheduler>`
+      was actively working on issuing instructions. Refer to the :ref:`Issued IPC
+      <issued-ipc>` example for further detail.
+    unit: Instructions per cycle
+  SALU Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`SALU <desc-salu>`
+      was busy executing instructions. Computed as the ratio of the total number of
+      cycles spent by the :ref:`scheduler <desc-scheduler>` issuing SALU / :ref:`SMEM
+      <desc-smem>` instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VALU Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`VALU <desc-valu>`
+      was busy executing instructions. Does not include :ref:`VMEM <desc-vmem>` operations.
+      Computed as the ratio of the total number of cycles spent by the :ref:`scheduler
+      <desc-scheduler>` issuing VALU instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VALU Co-Issue Efficiency:
+    rst: ''
+    unit: Unknown
+  VMEM Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`VMEM <desc-vmem>`
+      unit was busy executing instructions, including both global/generic and spill/scratch
+      operations (see the :ref:`VMEM instruction count metrics <ta-instruction-counts>`
+      for more detail). Does not include :ref:`VALU <desc-valu>` operations. Computed
+      as the ratio of the total number of cycles spent by the :ref:`scheduler <desc-scheduler>`
+      issuing VMEM instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  Branch Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`branch <desc-branch>`
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles spent by the :ref:`scheduler <desc-scheduler>` issuing branch instructions
+      over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VALU Active Threads:
+    rst: Indicates the average level of :ref:`divergence <desc-divergence>` within
+      a wavefront over the lifetime of the kernel. The number of work-items that were
+      active in a wavefront during execution of each :ref:`VALU <desc-valu>` instruction,
+      time-averaged over all VALU instructions run on all wavefronts in the kernel.
+    unit: Work-items
+  MFMA Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`MFMA <desc-mfma>`
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles the :ref:`MFMA <desc-mfma>` unit was busy over the :ref:`total
+      CU cycles <total-cu-cycles>`.
+    unit: Percent
+  MFMA Instruction Cycles:
+    rst: The average duration of :ref:`MFMA <desc-mfma>` instructions in this kernel
+      in cycles. Computed as the ratio of the total number of cycles the MFMA unit
+      was busy over the total number of MFMA instructions. Compare to, for example,
+      the `AMD Matrix Instruction Calculator <https://github.com/RadeonOpenCompute/amd_matrix_instruction_calculator>`_.
+    unit: Cycles per instruction
+  VMEM Latency:
+    rst: The average number of round-trip cycles (that is, from issue to data return
+      / acknowledgment) required for a VMEM instruction to complete.
+    unit: Cycles
+  SMEM Latency:
+    rst: The average number of round-trip cycles (that is, from issue to data return
+      / acknowledgment) required for a SMEM instruction to complete.
+    unit: Cycles
+Arithmetic Operations:
+  FLOPs (Total):
+    rst: The total number of floating-point operations executed on either the :ref:`VALU
+      <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
+      <normalization-units>`.
+    unit: FLOP per normalization unit
+  IOPs (Total):
+    rst: The total number of integer operations executed on either the :ref:`VALU
+      <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
+      <normalization-units>`.
+    unit: IOP per normalization unit
+  F8 OPs:
+    rst: ''
+    unit: Unknown
+  F16 OPs:
+    rst: The total number of 16-bit floating-point operations executed on either the
+      :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: FLOP per normalization unit
+  BF16 OPs:
+    rst: |-
+      The total number of 16-bit brain floating-point operations executed on
+      either the :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
+      unit <normalization-units>`. Note: on current CDNA accelerators, the VALU
+      has no native BF16 instructions.
+    unit: FLOP per normalization unit
+  F32 OPs:
+    rst: The total number of 32-bit floating-point operations executed on either the
+      :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: FLOP per normalization unit
+  F64 OPs:
+    rst: The total number of 64-bit floating-point operations executed on either the
+      :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: FLOP per normalization unit
+  F6F4 OPs:
+    rst: ''
+    unit: Unknown
+  INT8 OPs:
+    rst: |-
+      The total number of 8-bit integer operations executed on either the :ref:`VALU
+      <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
+      <normalization-units>`. Note: on current CDNA accelerators, the VALU has
+      no native INT8 instructions.
+    unit: IOP per normalization unit
+LDS Speed-of-Light:
+  Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`LDS <desc-lds>`
+      was actively executing instructions (including, but not limited to, load, store,
+      atomic and HIP's ``__shfl`` operations). Calculated as the ratio of the total
+      number of cycles LDS was active over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  Access Rate:
+    rst: Indicates the percentage of SIMDs in the :ref:`VALU <desc-valu>` [#lds-workload]_
+      actively issuing LDS instructions, averaged over the lifetime of the kernel.
+      Calculated as the ratio of the total number of cycles spent by the :ref:`scheduler
+      <desc-scheduler>` issuing :ref:`LDS <desc-lds>` instructions over the :ref:`total
+      CU cycles <total-cu-cycles>`.
+    unit: Percent
+  Theoretical Bandwidth Utilization:
+    rst: Indicates the maximum amount of bytes that could have been loaded from, stored
+      to, or atomically updated in the LDS, expressed as a percentage of the theoretical peak.
+      Does *not* take into account the execution mask of the wavefront when the instruction
+      was executed. See the :ref:`LDS bandwidth example <lds-bandwidth>` for more
+      detail.
+    unit: Percent
+  Bank Conflict Rate:
+    rst: Indicates the percentage of active LDS cycles that were spent servicing bank
+      conflicts. Calculated as the ratio of LDS cycles spent servicing bank conflicts
+      over the number of LDS cycles that would have been required to move the same
+      amount of data in an uncontended access. [#lds-bank-conflict]_
+    unit: Percent
+LDS Statistics:
+  LDS Instructions:
+    rst: The total number of LDS instructions (including, but not limited to, read/write/atomics
+      and HIP's ``__shfl`` instructions) executed per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  LDS LOAD:
+    rst: ''
+    unit: Unknown
+  LDS STORE:
+    rst: ''
+    unit: Unknown
+  LDS ATOMIC:
+    rst: ''
+    unit: Unknown
+  LDS LOAD Bandwidth:
+    rst: ''
+    unit: Unknown
+  LDS STORE Bandwidth:
+    rst: ''
+    unit: Unknown
+  LDS ATOMIC Bandwidth:
+    rst: ''
+    unit: Unknown
+  Theoretical Bandwidth:
+    rst: Indicates the maximum amount of bytes that could have been loaded from, stored
+      to, or atomically updated in the LDS divided by total duration. Does *not* take
+      into account the execution mask of the wavefront when the instruction was executed.
+      See the :ref:`LDS bandwidth example <lds-bandwidth>` for more detail.
+    unit: Gbps
+  LDS Latency:
+    rst: The average number of round-trip cycles (i.e., from issue to data-return
+      acknowledgment) required for an LDS instruction to complete.
+    unit: Cycles
+  Bank Conflicts/Access:
+    rst: The ratio of the number of cycles spent in the :ref:`LDS scheduler <desc-lds>`
+      due to bank conflicts (as determined by the conflict resolution hardware) to
+      the base number of cycles that would be spent in the LDS scheduler in a completely
+      uncontended case. This is the unnormalized form of the Bank Conflict Rate.
+    unit: Conflicts per Access
+  Index Accesses:
+    rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` over
+      all operations per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Atomic Return Cycles:
+    rst: The total number of cycles spent on LDS atomics with return per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Bank Conflict:
+    rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due
+      to bank conflicts (as determined by the conflict resolution hardware) per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Addr Conflict:
+    rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due
+      to address conflicts (as determined by the conflict resolution hardware) per
+      :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Unaligned Stall:
+    rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due
+      to stalls from non-dword aligned addresses per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Mem Violations:
+    rst: |-
+      The total number of out-of-bounds accesses made to the LDS, per :ref:`normalization
+      unit <normalization-units>`. This is unused and expected to be zero in
+      most configurations for modern CDNA™ accelerators.
+    unit: Accesses per normalization unit
+  LDS Command FIFO Full Rate:
+    rst: ''
+    unit: Unknown
+  LDS Data FIFO Full Rate:
+    rst: ''
+    unit: Unknown
+L1I Speed-of-Light:
+  Bandwidth Utilization:
+    rst: The number of bytes looked up in the L1I cache, as a percent of the peak
+      theoretical bandwidth. Calculated as the ratio of L1I requests over the :ref:`total
+      L1I cycles <total-l1i-cycles>`.
+    unit: Percent
+  Cache Hit Rate:
+    rst: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded
+      line in the cache. Calculated as the ratio of the number of L1I requests that hit
+      over the number of all L1I requests.
+    unit: Percent
+  L1I-L2 Bandwidth Utilization:
+    rst: |-
+      The percent of the peak theoretical L1I → L2 cache request bandwidth
+      achieved. Calculated as the ratio of the total number of requests from
+      the L1I to the L2 cache over the :ref:`total L1I-L2 interface cycles <total-l1i-cycles>`.
+    unit: Percent
+L1I cache accesses:
+  Req:
+    rst: The total number of requests made to the L1I per :ref:`normalization-unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Hits:
+    rst: The total number of L1I requests that hit on a previously loaded cache line,
+      per :ref:`normalization-unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Misses - Non Duplicated:
+    rst: The total number of L1I requests that missed on a cache line that *were not*
+      already pending due to another request, per :ref:`normalization-unit <normalization-units>`.
+      See note in :ref:`desc-l1i-sol` for more detail.
+    unit: Requests per normalization unit
+  Misses - Duplicated:
+    rst: The total number of L1I requests that missed on a cache line that *were*
+      already pending due to another request, per :ref:`normalization-unit <normalization-units>`.
+      See note in :ref:`desc-l1i-sol` for more detail.
+    unit: Requests per normalization unit
+  Cache Hit Rate:
+    rst: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded
+      line in the cache. Calculated as the ratio of the number of L1I requests that hit
+      over the number of all L1I requests.
+    unit: Percent
+  Instruction Fetch Latency:
+    rst: The average number of cycles spent to fetch instructions to a :doc:`CU <compute-unit>`.
+    unit: Cycles
+L1I <-> L2 interface:
+  L1I-L2 Bandwidth:
+    rst: Total number of bytes transferred across L1I - L2 interface divided by total
+      duration.
+    unit: Gbps
+Scalar L1D Speed-of-Light:
+  Bandwidth Utilization:
+    rst: The number of bytes looked up in the sL1D cache, as a percent of the peak
+      theoretical bandwidth. Calculated as the ratio of sL1D requests over the :ref:`total
+      sL1D cycles <total-sl1d-cycles>`.
+    unit: Percent
+  Cache Hit Rate:
+    rst: Indicates the percent of sL1D requests that hit on a previously loaded line
+      in the cache. The ratio of the number of sL1D requests that hit [#sl1d-cache]_
+      over the number of all sL1D requests.
+    unit: Percent
+  sL1D-L2 BW Utilization:
+    rst: The percentage of the peak theoretical sL1D - L2 interface bandwidth achieved.
+      Calculated as total number of bytes read from, written to, or atomically updated
+      across the sL1D - L2 interface.
+    unit: Percent
+Scalar L1D cache accesses:
+  Req:
+    rst: The total number of requests, of any size or type, made to the sL1D per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Hits:
+    rst: The total number of sL1D requests that hit on a previously loaded cache line,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Misses - Non Duplicated:
+    rst: The total number of sL1D requests that missed on a cache line that *was not*
+      already pending due to another request, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`desc-sl1d-sol` for more detail.
+    unit: Requests per normalization unit
+  Misses- Duplicated:
+    rst: The total number of sL1D requests that missed on a cache line that *was*
+      already pending due to another request, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`desc-sl1d-sol` for more detail.
+    unit: Requests per normalization unit
+  Cache Hit Rate:
+    rst: Indicates the percent of sL1D requests that hit on a previously loaded line
+      in the cache. The ratio of the number of sL1D requests that hit [#sl1d-cache]_
+      over the number of all sL1D requests.
+    unit: Percent
+  Read Req (Total):
+    rst: The total number of sL1D read requests of any size, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Atomic Req:
+    rst: The total number of atomic requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`. Typically unused on current
+      CDNA accelerators.
+    unit: Requests per normalization unit
+  Read Req (1 DWord):
+    rst: The total number of sL1D read requests made for a single dword of data (4B),
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Read Req (2 DWord):
+    rst: The total number of sL1D read requests made for two dwords of data (8B),
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Read Req (4 DWord):
+    rst: The total number of sL1D read requests made for four dwords of data (16B),
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Read Req (8 DWord):
+    rst: The total number of sL1D read requests made for eight dwords of data (32B),
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Read Req (16 DWord):
+    rst: The total number of sL1D read requests made for sixteen dwords of data
+      (64B), per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+Scalar L1D Cache - L2 Interface:
+  sL1D-L2 BW:
+    rst: |-
+      The total number of bytes read from, written to, or atomically updated
+      across the sL1D ↔ :doc:`L2 <l2-cache>` interface, divided by total duration.
+      Note that sL1D writes and atomics are typically
+      unused on current CDNA accelerators, so in the majority of cases this can
+      be interpreted as an sL1D→L2 read bandwidth.
+    unit: Gbps
+  Read Req:
+    rst: The total number of read requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Write Req:
+    rst: The total number of write requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`. Typically unused on current
+      CDNA accelerators.
+    unit: Requests per normalization unit
+  Atomic Req:
+    rst: The total number of atomic requests from sL1D to the :doc:`L2 <l2-cache>`,
+      per :ref:`normalization unit <normalization-units>`. Typically unused on current
+      CDNA accelerators.
+    unit: Requests per normalization unit
+  Stall Cycles:
+    rst: |-
+      The total number of cycles the sL1D ↔ :doc:`L2 <l2-cache>` interface
+      was stalled, per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+Busy and stall metrics:
+  Address Processing Unit Busy:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor
+      was busy
+    unit: Percent
+  Address Stall:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor
+      was stalled from sending address requests further into the vL1D pipeline
+    unit: Percent
+  Data Stall:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor
+      was stalled from sending write/atomic data further into the vL1D pipeline
+    unit: Percent
+  Data-Processor → Address Stall:
+    rst: Percent of :ref:`total CU cycles <total-cu-cycles>` the address processor
+      was stalled waiting to send command data to the :ref:`data processor <desc-td>`
+    unit: Percent
+  Sequencer → TA Address Stall:
+    rst: ''
+    unit: Unknown
+  Sequencer → TA Command Stall:
+    rst: ''
+    unit: Unknown
+  Sequencer → TA Data Stall:
+    rst: ''
+    unit: Unknown
+Instruction counts:
+  Total Instructions:
+    rst: The total number of memory instructions executed by the address processor
+      over all compute units on the accelerator, per normalization unit.
+    unit: Instructions per normalization unit
+  Global/Generic Instructions:
+    rst: The total number of global & generic memory instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Read Instructions:
+    rst: The total number of global & generic memory read instructions executed on
+      all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Read Instructions for LDS:
+    rst: ''
+    unit: Unknown
+  Global/Generic Write Instructions:
+    rst: The total number of global & generic memory write instructions executed on
+      all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Global/Generic Atomic Instructions:
+    rst: The total number of global & generic memory atomic (with and without return)
+      instructions executed on all :doc:`compute units <compute-unit>` on the accelerator,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Instructions:
+    rst: The total number of spill/stack memory instructions executed on all :doc:`compute
+      units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Read Instructions:
+    rst: The total number of spill/stack memory read instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Read Instructions for LDS:
+    rst: ''
+    unit: Unknown
+  Spill/Stack Write Instructions:
+    rst: The total number of spill/stack memory write instructions executed on all
+      :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Atomic Instructions:
+    rst: The total number of spill/stack memory atomic (with and without return) instructions
+      executed on all :doc:`compute units <compute-unit>` on the accelerator, per
+      :ref:`normalization unit <normalization-units>`. Typically unused, as these memory
+      operations are generally used to implement thread-local storage.
+    unit: Instructions per normalization unit
+Spill and stack metrics:
+  Spill/Stack Total Cycles:
+    rst: The number of cycles the address processing unit spent working on spill/stack
+      instructions, per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Spill/Stack Coalesced Read:
+    rst: The number of cycles the address processing unit spent working on coalesced
+      spill/stack read instructions, per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Spill/Stack Coalesced Write:
+    rst: The number of cycles the address processing unit spent working on coalesced
+      spill/stack write instructions, per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+Vector L1 data-return path or Texture Data (TD):
+  Data-Return Busy:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return unit
+      was busy processing or waiting on data to return to the :doc:`CU <compute-unit>`.
+    unit: Percent
+  Cache RAM → Data-Return Stall:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return unit
+      was stalled on data to be returned from the :ref:`vL1D Cache RAM <desc-tc>`.
+    unit: Percent
+  Workgroup manager → Data-Return Stall:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return unit
+      was stalled by the :ref:`workgroup manager <desc-spi>` due to initialization
+      of registers as a part of launching new workgroups.
+    unit: Percent
+  Coalescable Instructions:
+    rst: The number of instructions submitted to the :ref:`data-return unit <desc-td>`
+      by the :ref:`address processor <desc-ta>` that were found to be coalescable,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Read Instructions:
+    rst: The number of read instructions submitted to the :ref:`data-return unit <desc-td>`
+      by the :ref:`address processor <desc-ta>` summed over all :doc:`compute units
+      <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+      This is expected to be the sum of global/generic and spill/stack reads in the
+      :ref:`address processor <desc-ta>`.
+    unit: Instructions per normalization unit
+  Write Instructions:
+    rst: The number of store instructions submitted to the :ref:`data-return unit
+      <desc-td>` by the :ref:`address processor <desc-ta>` summed over all :doc:`compute
+      units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+      This is expected to be the sum of global/generic and spill/stack stores counted
+      by the :ref:`vL1D cache-front-end <ta-instruction-counts>`.
+    unit: Instructions per normalization unit
+  Atomic Instructions:
+    rst: The number of atomic instructions submitted to the :ref:`data-return unit
+      <desc-td>` by the :ref:`address processor <desc-ta>` summed over all :doc:`compute
+      units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+      This is expected to be the sum of global/generic and spill/stack atomics in
+      the :ref:`address processor <desc-ta>`.
+    unit: Instructions per normalization unit
+  Write Ack Instructions:
+    rst: The total number of write acknowledgements submitted by :ref:`data-return
+      unit <desc-td>` to SQ, summed over all compute units on the accelerator, per
+      normalization unit.
+    unit: Instructions per normalization unit
+vL1D Speed-of-Light:
+  Hit rate:
+    rst: The ratio of the number of vL1D cache line requests that hit [#vl1d-hit]_
+      in vL1D cache over the total number of cache line requests to the :ref:`vL1D
+      Cache RAM <desc-tc>`.
+    unit: Percent
+  Bandwidth Utilization:
+    rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
+      <desc-vmem>` instructions, as a percent of the peak theoretical bandwidth achievable
+      on the specific accelerator. The number of bytes is calculated as the number
+      of cache lines requested multiplied by the cache line size. This value does
+      not consider partial requests, so for instance, if only a single value is requested
+      in a cache line, the data movement will still be counted as a full cache line.
+    unit: Percent
+  Utilization:
+    rst: Indicates how busy the :ref:`vL1D Cache RAM <desc-tc>` was during the kernel
+      execution. The number of cycles where the vL1D Cache RAM is actively processing
+      any request divided by the number of cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  Coalescing:
+    rst: Indicates how well memory instructions were coalesced by the :ref:`address
+      processing unit <desc-ta>`, ranging from uncoalesced (25%) to fully coalesced
+      (100%). Calculated as the average number of :ref:`thread-requests <thread-requests>`
+      generated per instruction divided by the ideal number of thread-requests per
+      instruction.
+    unit: Percent
+vL1D cache stall metrics:
+  Stalled on L2 Data:
+    rst: The ratio of the number of cycles where the vL1D is stalled waiting for requested
+      data to return from the :doc:`L2 cache <l2-cache>` divided by the number of
+      cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  Stalled on L2 Req:
+    rst: The ratio of the number of cycles where the vL1D is stalled waiting to issue
+      a request for data to the :doc:`L2 cache <l2-cache>` divided by the number of
+      cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  Stalled on Address:
+    rst: ''
+    unit: Unknown
+  Stalled on Data:
+    rst: ''
+    unit: Unknown
+  Stalled on Latency FIFO:
+    rst: ''
+    unit: Unknown
+  Stalled on Request FIFO:
+    rst: ''
+    unit: Unknown
+  Stalled on Read Return:
+    rst: ''
+    unit: Unknown
+  Tag RAM Stall (Read):
+    rst: The ratio of the number of cycles where the vL1D is stalled due to Read requests
+      with conflicting tags being looked up concurrently, divided by the number of
+      cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  Tag RAM Stall (Write):
+    rst: The ratio of the number of cycles where the vL1D is stalled due to Write
+      requests with conflicting tags being looked up concurrently, divided by the
+      number of cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+  Tag RAM Stall (Atomic):
+    rst: The ratio of the number of cycles where the vL1D is stalled due to Atomic
+      requests with conflicting tags being looked up concurrently, divided by the
+      number of cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
+vL1D cache access metrics:
+  Total Req:
+    rst: The total number of incoming requests from the :ref:`address processing unit
+      <desc-ta>` after coalescing.
+    unit: Requests
+  Read Req:
+    rst: The total number of incoming read requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`
+    unit: Requests per normalization unit
+  Write Req:
+    rst: The total number of incoming write requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`
+    unit: Requests per normalization unit
+  Atomic Req:
+    rst: The total number of incoming atomic requests from the :ref:`address processing
+      unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`
+    unit: Requests per normalization unit
+  Cache BW:
+    rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
+      <desc-vmem>` instructions divided by total duration. The number of bytes is
+      calculated as the number of cache lines requested multiplied by the cache line
+      size. This value does not consider partial requests, so for instance, if only
+      a single value is requested in a cache line, the data movement will still be
+      counted as a full cache line.
+    unit: Gbps
+  Cache Hit Rate:
+    rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache
+      over the total number of cache line requests to the :ref:`vL1D Cache RAM <desc-tc>`.
+    unit: Percent
+  Cache Accesses:
+    rst: The total number of cache line lookups in the vL1D.
+    unit: Cache lines
+  Cache Hits:
+    rst: The number of cache accesses minus the number of outgoing requests to the
+      :doc:`L2 cache <l2-cache>`, that is, the number of cache line requests serviced
+      by the :ref:`vL1D Cache RAM <desc-tc>` per :ref:`normalization unit <normalization-units>`.
+    unit: Cache lines per normalization unit
+  Invalidations:
+    rst: The number of times the vL1D was issued a write-back invalidate command during
+      the kernel's execution per :ref:`normalization unit <normalization-units>`.
+      This may be triggered by, for instance, the ``buffer_wbinvl1`` instruction.
+    unit: Invalidations per normalization unit
+  L1-L2 BW:
+    rst: The number of bytes transferred across the vL1D-L2 interface as a result
+      of :ref:`VMEM <desc-vmem>` instructions, divided by total duration. The number
+      of bytes is calculated as the number of cache lines requested multiplied by
+      the cache line size. This value does not consider partial requests, so for instance,
+      if only a single value is requested in a cache line, the data movement will
+      still be counted as a full cache line.
+    unit: Gbps
+  Tag RAM 0 Req:
+    rst: ''
+    unit: Unknown
+  Tag RAM 1 Req:
+    rst: ''
+    unit: Unknown
+  Tag RAM 2 Req:
+    rst: ''
+    unit: Unknown
+  Tag RAM 3 Req:
+    rst: ''
+    unit: Unknown
+  L1-L2 Read:
+    rst: The number of read requests for a vL1D cache line that were not satisfied
+      by the vL1D and must be retrieved from the :doc:`L2 Cache <l2-cache>`
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  L1-L2 Write:
+    rst: The number of write requests to a vL1D cache line that were sent through
+      the vL1D to the :doc:`L2 cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  L1-L2 Atomic:
+    rst: The number of atomic requests that are sent through the vL1D to the :doc:`L2
+      cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`. This
+      includes requests for atomics with, and without return.
+    unit: Requests per normalization unit
+  L1 Access Latency:
+    rst: Calculated as the average number of cycles that a vL1D cache line request
+      spent in the vL1D cache pipeline.
+    unit: Cycles
+  L1-L2 Read Latency:
+    rst: Calculated as the average number of cycles that the vL1D cache took to issue
+      and receive read requests from the :doc:`L2 Cache <l2-cache>`. This number also
+      includes requests for atomics with return values.
+    unit: Cycles
+  L1-L2 Write Latency:
+    rst: Calculated as the average number of cycles that the vL1D cache took to issue
+      and receive acknowledgement of a write request to the :doc:`L2 Cache <l2-cache>`.
+      This number also includes requests for atomics without return values.
+    unit: Cycles
+L1D - L2 Transactions:
+  NC - Read:
+    rst: Total read requests with NC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  UC - Read:
+    rst: Total read requests with UC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  CC - Read:
+    rst: Total read requests with CC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  RW - Read:
+    rst: Total read requests with RW mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  RW - Write:
+    rst: Total write requests with RW mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  NC - Write:
+    rst: Total write requests with NC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  UC - Write:
+    rst: Total write requests with UC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  CC - Write:
+    rst: Total write requests with CC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  NC - Atomic:
+    rst: Total atomic requests with NC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  UC - Atomic:
+    rst: Total atomic requests with UC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  CC - Atomic:
+    rst: Total atomic requests with CC mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+  RW - Atomic:
+    rst: Total atomic requests with RW mtype from this TCP to all TCCs, summed over
+      TCP instances, per normalization unit.
+    unit: Requests per normalization unit
+L1 Unified Translation Cache (UTCL1):
+  Req:
+    rst: The number of translation requests made to the UTCL1 per normalization unit.
+    unit: Requests per normalization unit
+  Inflight Req:
+    rst: ''
+    unit: Unknown
+  Hit Ratio:
+    rst: The ratio of the number of translation requests that hit in the UTCL1 divided
+      by the total number of translation requests made to the UTCL1.
+    unit: Percent
+  Hits:
+    rst: The number of translation requests that hit in the UTCL1, and could be reused,
+      per normalization unit.
+    unit: Requests per normalization unit
+  Translation Misses:
+    rst: The total number of translation requests that missed in the UTCL1 due to
+      translation not being present in the cache, per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Misses under Translation Miss:
+    rst: ''
+    unit: Unknown
+  Permission Misses:
+    rst: |-
+      The total number of translation requests that missed in the UTCL1 due
+      to a permission error, per :ref:`normalization unit <normalization-units>`.
+      This is unused and expected to be zero in most configurations for modern
+      CDNA™ accelerators.
+    unit: Requests per normalization unit
+L1D Addr Translation Stalls:
+  Cache Full Stall:
+    rst: ''
+    unit: Unknown
+  Cache Miss Stall:
+    rst: ''
+    unit: Unknown
+  Serialization Stall:
+    rst: ''
+    unit: Unknown
+  Thrashing Stall:
+    rst: ''
+    unit: Unknown
+  Latency FIFO Stall:
+    rst: ''
+    unit: Unknown
+  Resident Page Full Stall:
+    rst: ''
+    unit: Unknown
+  UTCL2 Stall:
+    rst: ''
+    unit: Unknown
+L2 Speed-of-Light:
+  Utilization:
+    rst: The ratio of the :ref:`number of cycles an L2 channel was active, summed
+      over all L2 channels on the accelerator <total-active-l2-cycles>` over the :ref:`total
+      L2 cycles <total-l2-cycles>`.
+    unit: Percent
+  Peak Bandwidth:
+    rst: The number of bytes looked up in the L2 cache, as a percent of the peak theoretical
+      bandwidth achievable on the specific accelerator. The number of bytes is calculated
+      as the number of cache lines requested multiplied by the cache line size. This
+      value does not consider partial requests, so e.g., if only a single value is
+      requested in a cache line, the data movement will still be counted as a full
+      cache line.
+    unit: Percent
+  Hit Rate:
+    rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
+      over the total number of incoming cache line requests to the L2 cache.
+    unit: Percent
+  L2-Fabric Read BW:
+    rst: The number of bytes read by the L2 over the :ref:`Infinity Fabric interface
+      <l2-fabric>` per unit time.
+    unit: GB/s
+  L2-Fabric Write and Atomic BW:
+    rst: The number of bytes sent by the L2 over the :ref:`Infinity Fabric interface
+      <l2-fabric>` by write and atomic operations per unit time.
+    unit: GB/s
+  HBM Bandwidth:
+    rst: Maximum theoretical bandwidth of the accelerator's local high-bandwidth memory
+      (HBM) per unit time. This value is calculated as the number of HBM channels
+      multiplied by the HBM channel width multiplied by the HBM clock frequency.
+    unit: GB/s
+L2-Fabric interface metrics:
+  Read BW:
+    rst: The total number of bytes read by the L2 cache from Infinity Fabric divided
+      by total duration.
+    unit: Gbps
+  HBM Read Traffic:
+    rst: The percent of read requests generated by the L2 cache that are routed to
+      the accelerator's local high-bandwidth memory (HBM). This breakdown does not
+      consider the *size* of the request (meaning that 32B and 64B requests are both
+      counted as a single request), so this metric only *approximates* the percent
+      of the L2-Fabric Read bandwidth directed to the local HBM.
+    unit: Percent
+  Remote Read Traffic:
+    rst: The percent of read requests generated by the L2 cache that are routed to
+      any memory location other than the accelerator's local high-bandwidth memory
+      (HBM) -- for example, the CPU's DRAM or a remote accelerator's HBM. This breakdown
+      does not consider the *size* of the request (meaning that 32B and 64B requests
+      are both counted as a single request), so this metric only *approximates* the
+      percent of the L2-Fabric Read bandwidth directed to a remote location.
+    unit: Percent
+  Uncached Read Traffic:
+    rst: The percent of read requests generated by the L2 cache that are reading from
+      an :ref:`uncached memory allocation <memory-type>`. Note, as described in the
+      :ref:`request flow <l2-request-flow>` section, a single 64B read request is
+      typically counted as two uncached read requests. So, it is possible for the
+      Uncached Read Traffic to reach up to 200% of the total number of read requests.
+      This breakdown does not consider the *size* of the request (i.e., 32B and 64B
+      requests are both counted as a single request), so this metric only *approximates*
+      the percent of the L2-Fabric read bandwidth directed to an uncached memory location.
+    unit: Percent
+  Write and Atomic BW:
+    rst: The total number of bytes written by the L2 over Infinity Fabric by write
+      and atomic operations divided by total duration. Note that on current CDNA accelerators,
+      such as the :ref:`MI2XX <mixxx-note>`, requests are only considered *atomic*
+      by Infinity Fabric if they are targeted at non-write-cacheable memory, for example,
+      :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached memory
+      <memory-type>` allocations on the MI2XX.
+    unit: Gbps
+  HBM Write and Atomic Traffic:
+    rst: The percent of write and atomic requests generated by the L2 cache that are
+      routed to the accelerator's local high-bandwidth memory (HBM). This breakdown
+      does not consider the *size* of the request (meaning that 32B and 64B requests
+      are both counted as a single request), so this metric only *approximates* the
+      percent of the L2-Fabric Write and Atomic bandwidth directed to the local HBM.
+      Note that on current CDNA accelerators, such as the :ref:`MI2XX <mixxx-note>`,
+      requests are only considered *atomic* by Infinity Fabric if they are targeted
+      at :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached memory
+      <memory-type>` allocations.
+    unit: Percent
+  Remote Write and Atomic Traffic:
+    rst: The percent of write and atomic requests generated by the L2 cache that are routed to
+      any memory location other than the accelerator's local high-bandwidth memory
+      (HBM) -- for example, the CPU's DRAM or a remote accelerator's HBM. This breakdown
+      does not consider the *size* of the request (meaning that 32B and 64B requests
+      are both counted as a single request), so this metric only *approximates* the
+      percent of the L2-Fabric Write and Atomic bandwidth directed to a remote location. Note
+      that on current CDNA accelerators, such as the :ref:`MI2XX <mixxx-note>`, requests
+      are only considered *atomic* by Infinity Fabric if they are targeted at :ref:`fine-grained
+      memory <memory-type>` allocations or :ref:`uncached memory <memory-type>` allocations.
+    unit: Percent
+  Atomic Traffic:
+    rst: The percent of write requests generated by the L2 cache that are atomic requests
+      to *any* memory location. This breakdown does not consider the *size* of the
+      request (meaning that 32B and 64B requests are both counted as a single request),
+      so this metric only *approximates* the percent of the L2-Fabric Write and Atomic
+      bandwidth used by atomic requests. Note that on current CDNA accelerators, such
+      as the :ref:`MI2XX <mixxx-note>`, requests are only considered *atomic* by Infinity
+      Fabric if they are targeted at :ref:`fine-grained memory <memory-type>` allocations
+      or :ref:`uncached memory <memory-type>` allocations.
+    unit: Percent
+  Uncached Write and Atomic Traffic:
+    rst: The percent of write and atomic requests generated by the L2 cache that are
+      targeting :ref:`uncached memory allocations <memory-type>`. This breakdown does
+      not consider the *size* of the request (meaning that 32B and 64B requests are
+      both counted as a single request), so this metric only *approximates* the percent
+      of the L2-Fabric Write and Atomic bandwidth directed to uncached memory allocations.
+    unit: Percent
+  Read Latency:
+    rst: The time-averaged number of cycles read requests spent in Infinity Fabric
+      before data was returned to the L2.
+    unit: Cycles
+  Write and Atomic Latency:
+    rst: The time-averaged number of cycles write requests spent in Infinity Fabric
+      before a completion acknowledgement was returned to the L2.
+    unit: Cycles
+  Atomic Latency:
+    rst: The time-averaged number of cycles atomic requests spent in Infinity Fabric
+      before a completion acknowledgement (atomic without return value) or data (atomic
+      with return value) was returned to the L2.
+    unit: Cycles
+  Read Stall:
+    rst: |-
+      The ratio of the total number of cycles the L2-Fabric interface was stalled
+      on a read request to any destination (local HBM, remote PCIe® connected
+      accelerator or CPU, or remote Infinity Fabric connected accelerator [#inf]_
+      or CPU) over the :ref:`total active L2 cycles <total-active-l2-cycles>`.
+    unit: Percent
+  Write Stall:
+    rst: The ratio of the total number of cycles the L2-Fabric interface was stalled
+      on a write or atomic request to any destination (local HBM, remote accelerator
+      or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected
+      accelerator [#inf]_ or CPU) over the :ref:`total active L2 cycles <total-active-l2-cycles>`.
+    unit: Percent
+L2 Cache Accesses:
+  Bandwidth:
+    rst: The number of bytes looked up in the L2 cache, divided by total duration.
+      The number of bytes is calculated as the number of cache lines requested multiplied
+      by the cache line size. This value does not consider partial requests, so for
+      example, if only a single value is requested in a cache line, the data movement
+      will still be counted as a full cache line.
+    unit: Gbps
+  Read Bandwidth:
+    rst: Total number of bytes looked up in the L2 cache for read requests, divided
+      by total duration.
+    unit: Gbps
+  Write Bandwidth:
+    rst: Total number of bytes looked up in the L2 cache for write requests, divided
+      by total duration.
+    unit: Gbps
+  Atomic Bandwidth:
+    rst: Total number of bytes looked up in the L2 cache for atomic requests, divided
+      by total duration.
+    unit: Gbps
+  Req:
+    rst: The total number of incoming requests to the L2 from all clients for all
+      request types, per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Read Req:
+    rst: The total number of read requests to the L2 from all clients.
+    unit: Requests per normalization unit
+  Write Req:
+    rst: The total number of write requests to the L2 from all clients.
+    unit: Requests per normalization unit
+  Atomic Req:
+    rst: The total number of atomic requests (with and without return) to the L2 from
+      all clients.
+    unit: Requests per normalization unit
+  Streaming Req:
+    rst: The total number of incoming requests to the L2 that are marked as *streaming*.
+      The exact meaning of this may differ depending on the targeted accelerator,
+      however on an :ref:`MI2XX <mixxx-note>` this corresponds to `non-temporal load
+      or stores <https://clang.llvm.org/docs/LanguageExtensions.html#non-temporal-load-store-builtins>`_.
+      The L2 cache attempts to evict *streaming* requests before normal requests when
+      the L2 is at capacity.
+    unit: Requests per normalization unit
+  Bypasss Req:
+    rst: ''
+    unit: Unknown
+  Probe Req:
+    rst: The number of coherence probe requests made to the L2 cache from outside
+      the accelerator. On an :ref:`MI2XX <mixxx-note>`, probe requests may be generated
+      by, for example, writes to :ref:`fine-grained device <memory-type>` memory or
+      by writes to :ref:`coarse-grained <memory-type>` device memory.
+    unit: Requests per normalization unit
+  Input Buffer Req:
+    rst: ''
+    unit: Unknown
+  Cache Hit:
+    rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
+      over the total number of incoming cache line requests to the L2 cache.
+    unit: Percent
+  Hits:
+    rst: The total number of requests to the L2 from all clients that hit in the cache.
+      As noted in the :ref:`Speed-of-Light <l2-sol>` section, this includes hit-on-miss
+      requests.
+    unit: Requests per normalization unit
+  Misses:
+    rst: The total number of requests to the L2 from all clients that miss in the
+      cache. As noted in the :ref:`Speed-of-Light <l2-sol>` section, these do not
+      include hit-on-miss requests.
+    unit: Requests per normalization unit
+  Writeback:
+    rst: The total number of L2 cache lines written back to memory for any reason.
+      Write-backs may occur due to user code (such as HIP kernel calls to ``__threadfence_system``
+      or atomic built-ins), by the :doc:`command processor <command-processor>`'s memory
+      acquire/release fences, or for other internal hardware reasons.
+    unit: Cache lines per normalization unit
+  Writeback (Internal):
+    rst: The total number of L2 cache lines written back to memory for internal hardware
+      reasons, per :ref:`normalization unit <normalization-units>`.
+    unit: Cache lines per normalization unit
+  Writeback (vL1D Req):
+    rst: The total number of L2 cache lines written back to memory due to requests
+      initiated by the :doc:`vL1D cache <vector-l1-cache>`, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Cache lines per normalization unit
+  Evict (Internal):
+    rst: The total number of L2 cache lines evicted from the cache due to capacity
+      limits, per :ref:`normalization unit <normalization-units>`.
+    unit: Cache lines per normalization unit
+  Evict (vL1D Req):
+    rst: The total number of L2 cache lines evicted from the cache due to invalidation
+      requests initiated by the :doc:`vL1D cache <vector-l1-cache>`, per :ref:`normalization
+      unit <normalization-units>`.
+    unit: Cache lines per normalization unit
+  NC Req:
+    rst: The total number of requests to the L2 to Not-hardware-Coherent (NC) memory
+      allocations, per :ref:`normalization unit <normalization-units>`. See the :ref:`memory-type`
+      for more information.
+    unit: Requests per normalization unit
+  UC Req:
+    rst: The total number of requests to the L2 that go to Uncached (UC) memory allocations.
+      See the :ref:`memory-type` for more information.
+    unit: Requests per normalization unit
+  CC Req:
+    rst: The total number of requests to the L2 that go to Coherently Cacheable (CC)
+      memory allocations. See the :ref:`memory-type` for more information.
+    unit: Requests per normalization unit
+  RW Req:
+    rst: The total number of requests to the L2 that go to Read-Write coherent memory
+      (RW) allocations. See the :ref:`memory-type` for more information.
+    unit: Requests per normalization unit
+L2 Cache Stalls:
+  Stalled on Latency FIFO:
+    rst: ''
+    unit: Unknown
+  Stalled on Write Data FIFO:
+    rst: ''
+    unit: Unknown
+  Input Buffer Stalled on L2:
+    rst: ''
+    unit: Unknown
+L2 - Fabric Interface stalls:
+  Read - PCIe Stall:
+    rst: The number of cycles the L2-Fabric interface was stalled on read requests
+      to remote PCIe connected accelerators [#inf]_ or CPUs as a percent of the :ref:`total
+      active L2 cycles <total-active-l2-cycles>`.
+    unit: Percent
+  Read - Infinity Fabric Stall:
+    rst: The number of cycles the L2-Fabric interface was stalled on read requests
+      to remote Infinity Fabric connected accelerators [#inf]_ or CPUs as a percent
+      of the :ref:`total active L2 cycles <total-active-l2-cycles>`.
+    unit: Percent
+  Read - HBM Stall:
+    rst: The number of cycles the L2-Fabric interface was stalled on read requests
+      to the accelerator's local HBM as a percent of the :ref:`total active L2 cycles
+      <total-active-l2-cycles>`.
+    unit: Percent
+  Write - PCIe Stall:
+    rst: The number of cycles the L2-Fabric interface was stalled on write or atomic
+      requests to remote PCIe connected accelerators [#inf]_ or CPUs as a percent
+      of the :ref:`total active L2 cycles <total-active-l2-cycles>`.
+    unit: Percent
+  Write - Infinity Fabric Stall:
+    rst: The number of cycles the L2-Fabric interface was stalled on write or atomic
+      requests to remote Infinity Fabric connected accelerators [#inf]_ or CPUs as
+      a percent of the :ref:`total active L2 cycles <total-active-l2-cycles>`.
+    unit: Percent
+  Write - HBM Stall:
+    rst: The number of cycles the L2-Fabric interface was stalled on write or atomic
+      requests to the accelerator's local HBM as a percent of the :ref:`total active L2 cycles <total-active-l2-cycles>`.
+    unit: Percent
+  Write - Credit Starvation:
+    rst: The number of cycles the L2-Fabric interface was stalled on write or atomic
+      requests to any memory location because too many write/atomic requests were
+      currently in flight, as a percent of the :ref:`total active L2 cycles <total-active-l2-cycles>`.
+    unit: Percent
+L2 - Fabric interface detailed metrics:
+  Read (32B):
+    rst: The total number of L2 requests to Infinity Fabric to read 32B of data from
+      any memory location, per :ref:`normalization unit <normalization-units>`. See
+      :ref:`l2-request-flow` for more detail. Typically unused on CDNA accelerators.
+    unit: Requests per normalization unit
+  Read (64B):
+    rst: The total number of L2 requests to Infinity Fabric to read 64B of data from
+      any memory location, per :ref:`normalization unit <normalization-units>`. See
+      :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Read (128B):
+    rst: ''
+    unit: Unknown
+  Read (Uncached):
+    rst: The total number of L2 requests to Infinity Fabric to read :ref:`uncached
+      data <memory-type>` from any memory location, per :ref:`normalization unit <normalization-units>`.
+      64B requests for uncached data are counted as two 32B uncached data requests.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  HBM Read:
+    rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of
+      data from the accelerator's local HBM, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Remote Read:
+    rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of
+      data from any source other than the accelerator's local HBM, per :ref:`normalization
+      unit <normalization-units>`. See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Read Bandwidth - PCIe:
+    rst: Total number of bytes due to L2 read requests due to PCIe traffic, divided
+      by total duration.
+    unit: Gbps
+  Read Bandwidth - Infinity Fabric™:
+    rst: Total number of bytes due to L2 read requests due to Infinity Fabric traffic,
+      divided by total duration.
+    unit: Gbps
+  Read Bandwidth - HBM:
+    rst: Total number of bytes due to L2 read requests due to HBM traffic, divided
+      by total duration.
+    unit: Gbps
+  Write and Atomic (32B):
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically
+      update 32B of data to any memory location, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Write and Atomic (Uncached):
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically
+      update 32B or 64B of :ref:`uncached data <memory-type>`, per :ref:`normalization
+      unit <normalization-units>`. See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Write and Atomic (64B):
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically
+      update 64B of data in any memory location, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  HBM Write and Atomic:
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically
+      update 32B or 64B of data in the accelerator's local HBM, per :ref:`normalization
+      unit <normalization-units>`. See :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Remote Write and Atomic:
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically
+      update 32B or 64B of data in any memory location other than the accelerator's
+      local HBM, per :ref:`normalization unit <normalization-units>`. See :ref:`l2-request-flow`
+      for more detail.
+    unit: Requests per normalization unit
+  Write Bandwidth - PCIe:
+    rst: Total number of bytes due to L2 write requests due to PCIe traffic, divided
+      by total duration.
+    unit: Gbps
+  Write Bandwidth - Infinity Fabric™:
+    rst: Total number of bytes due to L2 write requests due to Infinity Fabric traffic,
+      divided by total duration.
+    unit: Gbps
+  Write Bandwidth - HBM:
+    rst: Total number of bytes due to L2 write requests due to HBM traffic, divided
+      by total duration.
+    unit: Gbps
+  Atomic:
+    rst: The total number of L2 requests to Infinity Fabric to atomically update 32B
+      or 64B of data in any memory location, per :ref:`normalization unit <normalization-units>`.
+      See :ref:`l2-request-flow` for more detail. Note that on current CDNA accelerators,
+      such as the :ref:`MI2XX <mixxx-note>`, requests are only considered *atomic*
+      by Infinity Fabric if they are targeted at non-write-cacheable memory, such
+      as :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached memory
+      <memory-type>` allocations on the MI2XX.
+    unit: Requests per normalization unit
+  Atomic - HBM:
+    rst: ''
+    unit: Unknown
+  Atomic Bandwidth - PCIe:
+    rst: Total number of bytes due to L2 atomic requests due to PCIe traffic, divided
+      by total duration.
+    unit: Gbps
+  Atomic Bandwidth - Infinity Fabric™:
+    rst: Total number of bytes due to L2 atomic requests due to Infinity Fabric traffic,
+      divided by total duration.
+    unit: Gbps
+  Atomic Bandwidth - HBM:
+    rst: Total number of bytes due to L2 atomic requests due to HBM traffic, divided
+      by total duration.
+    unit: Gbps
+Aggregate Stats (All channels):
+  L2 Cache Hit Rate:
+    rst: The total number of requests to the L2 from all clients that hit in the cache.
+      As noted in the :ref:`Speed-of-Light <l2-sol>` section, this includes hit-on-miss
+      requests.
+    unit: Percent
+L2 Cache Hit Rate (pct):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2 Requests (per normUnit):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Requests (per normUnit):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Read Latency (Cycles):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Write and Atomic Latency (Cycles):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Atomic Latency (Cycles):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Read Stall (Cycles per normUnit):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric Write and Atomic Stall (Cycles per normUnit):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
+L2-Fabric (128B read requests per normUnit):
+  ::_1:
+    rst: ''
+    unit: Unknown
+  placeholder_range:
+    rst: ''
+    unit: Unknown
diff --git a/projects/rocprofiler-compute/utils/run-ci.py b/projects/rocprofiler-compute/tools/run-ci.py
similarity index 100%
rename from projects/rocprofiler-compute/utils/run-ci.py
rename to projects/rocprofiler-compute/tools/run-ci.py
diff --git a/projects/rocprofiler-compute/utils/split_config.py b/projects/rocprofiler-compute/tools/split_config.py
similarity index 87%
rename from projects/rocprofiler-compute/utils/split_config.py
rename to projects/rocprofiler-compute/tools/split_config.py
index c43e45c58d..d1f0a55ca3 100644
--- a/projects/rocprofiler-compute/utils/split_config.py
+++ b/projects/rocprofiler-compute/tools/split_config.py
@@ -25,11 +25,11 @@
 
 # NOTES
 #
-# Read utils/unified_config.yaml and split it into per gfx architecture per panel
+# Read tools/unified_config.yaml and split it into per gfx architecture per panel
 # config files. WARNING: This script will overwrite existing files under per gfx
 # architecture folders under src/rocprof_compute_soc/analysis_configs.
 #
-# Read utils/unified_config.yaml and split it into metric tables per documentation
+# Read tools/unified_config.yaml and split it into metric tables per documentation
 # section.
 # WARNING: This script will overwrite existing docs/data/metrics_description.yaml.
 
@@ -42,25 +42,34 @@ import yaml
 
 # Get root directory of the project
 ROOT_DIR = Path(__file__).parent.parent
-SOURCE_DIR = ROOT_DIR / "utils"
+SOURCE_DIR = ROOT_DIR / "tools"
 TARGET_DIR = ROOT_DIR / "src" / "rocprof_compute_soc" / "analysis_configs"
 SETS_TARGET_DIR = ROOT_DIR / "src" / "rocprof_compute_soc" / "profile_configs" / "sets"
 DOC_TARGET_DIR = ROOT_DIR / "docs" / "data"
 AUTOGEN_TEXT = (
     "# AUTOGENERATED FILE. Only edit for testing purposes, not for development. "
-    "Generated from utils/unified_config.yaml. Generated by utils/split_config.py\n"
+    "Generated from tools/unified_config.yaml. Generated by tools/split_config.py\n"
 )
-HASH_FILE = ROOT_DIR / "utils" / "autogen_hash.yaml"
+HASH_FILE = ROOT_DIR / "tools" / "autogen_hash.yaml"
 HASH_FILE_MAP = {}
 GFX_VERSIONS = ["gfx908", "gfx90a", "gfx940", "gfx941", "gfx942", "gfx950"]
 METRIC_ID_TO_NAME_MAP = {gfx_version: {} for gfx_version in GFX_VERSIONS}
 
 
-def get_autogen_text(config_file="utils/unified_config.yaml"):
+def str_representer(dumper, data):
+    if "\n" in data:
+        return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
+    return dumper.represent_scalar("tag:yaml.org,2002:str", data)
+
+
+yaml.add_representer(str, str_representer)
+
+
+def get_autogen_text(config_file="tools/unified_config.yaml"):
     return (
         f"# AUTOGENERATED FILE. Only edit for testing purposes, "
         f"not for development. Generated from {config_file}. "
-        f"Generated by utils/split_config.py\n"
+        f"Generated by tools/split_config.py\n"
     )
 
 
@@ -76,10 +85,7 @@ def update_analysis_config():
         new_panel_config = {"Panel Config": {}}
         new_panel_config["Panel Config"]["id"] = panel_config["id"]
         new_panel_config["Panel Config"]["title"] = panel_config["title"]
-        new_panel_config["Panel Config"]["metrics_description"] = {
-            key: value["plain"]
-            for key, value in panel_config.get("metrics_description", {}).items()
-        }
+
         panel_id_int = panel_config["id"]
         # Convert int into str with 4 digits
         panel_id = str(panel_config["id"]).zfill(4)
@@ -98,6 +104,9 @@ def update_analysis_config():
                 gfx_dir.mkdir()
                 print(f"Created directory: {gfx_dir}")
 
+            # Collect metrics for this gfx_version
+            gfx_metrics = []
+
             # Select metrics from current gfx arch
             new_panel_config["Panel Config"]["data source"] = []
             for data_source_index, data_source_config in enumerate(
@@ -108,6 +117,14 @@ def update_analysis_config():
                     data_source_config["metric_table"]["metric"] = data_source_config[
                         "metric_table"
                     ]["metric"][gfx_version]
+
+                    # Collect metric names for this gfx version (preserve order)
+                    for metric_name in data_source_config["metric_table"][
+                        "metric"
+                    ].keys():
+                        if metric_name not in gfx_metrics:
+                            gfx_metrics.append(metric_name)
+
                     build_metric_id_mapping(
                         panel_id_int,
                         data_source_index,
@@ -117,6 +134,14 @@ def update_analysis_config():
                 new_panel_config["Panel Config"]["data source"].append(
                     data_source_config
                 )
+
+            # Only include metric descriptions for metrics that exist in this gfx
+            new_panel_config["Panel Config"]["metrics_description"] = {
+                key: value["plain"].strip()
+                for key, value in panel_config.get("metrics_description", {}).items()
+                if key in gfx_metrics
+            }
+
             # Write panel config to file
             filename = TARGET_DIR / gfx_version / f"{panel_id}_{panel_title}.yaml"
             with open(filename, "w") as file:
@@ -170,7 +195,7 @@ def update_sets_config():
         # Write gfx version sets to file
         filename = SETS_TARGET_DIR / f"{gfx_version}_sets.yaml"
         with open(filename, "w") as file:
-            file.write(get_autogen_text("utils/unified_sets.yaml"))
+            file.write(get_autogen_text("tools/unified_sets.yaml"))
             yaml.dump(new_sets, file, sort_keys=False)
             print(f"File write: {filename}")
         # Calculate hash of filename
@@ -240,7 +265,9 @@ def update_documentation():
                 # Add metrics info
                 for metric_name in sorted(list(metric_names)):
                     metrics_info[metric_name] = {
-                        "rst": panel_config["metrics_description"][metric_name]["rst"],
+                        "rst": panel_config["metrics_description"][metric_name][
+                            "rst"
+                        ].strip(),
                         "unit": panel_config["metrics_description"][metric_name][
                             "unit"
                         ],
diff --git a/projects/rocprofiler-compute/utils/unified_config.yaml b/projects/rocprofiler-compute/tools/unified_config.yaml
similarity index 92%
rename from projects/rocprofiler-compute/utils/unified_config.yaml
rename to projects/rocprofiler-compute/tools/unified_config.yaml
index ffc505a18f..d157b14ac1 100644
--- a/projects/rocprofiler-compute/utils/unified_config.yaml
+++ b/projects/rocprofiler-compute/tools/unified_config.yaml
@@ -1,4 +1,4 @@
-# NOTE: Please run utils/split_config.py after making changes to this file to auto-generate configs
+# NOTE: Please run tools/split_config.py after making changes to this file to auto-generate configs
 panels:
 - id: 0
   title: Top Stats
@@ -1318,24 +1318,28 @@ panels:
             coll_level: SQ_IFETCH_LEVEL
   metrics_description:
     VALU FLOPs:
-      plain: 'The total floating-point operations executed per second on the VALU.
+      plain: |-
+        The total floating-point operations executed per second on the VALU.
         This is also presented as a percent of the peak theoretical FLOPs achievable
         on the specific accelerator. Note: this does not include any floating-point
-        operations from MFMA instructions.'
-      rst: 'The total floating-point operations executed per second on the :ref:`VALU
+        operations from MFMA instructions.
+      rst: |-
+        The total floating-point operations executed per second on the :ref:`VALU
         <desc-valu>`. This is also presented as a percent of the peak theoretical
         FLOPs achievable on the specific accelerator. Note: this does not include
-        any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.'
+        any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.
       unit: GFLOPs
     VALU IOPs:
-      plain: 'The total integer operations executed per second on the VALU. This is
+      plain: |-
+        The total integer operations executed per second on the VALU. This is
         also presented as a percent of the peak theoretical IOPs achievable on the
         specific accelerator. Note: this does not include any integer operations from
-        MFMA instructions.'
-      rst: 'The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
+        MFMA instructions.
+      rst: |-
+        The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
         This is also presented as a percent of the peak theoretical IOPs achievable
         on the specific accelerator. Note: this does not include any integer operations
-        from :ref:`MFMA <desc-mfma>` instructions.'
+        from :ref:`MFMA <desc-mfma>` instructions.
       unit: GOIPs
     MFMA FLOPs (F8):
       plain: The total number of 8-bit brain floating point MFMA operations executed
@@ -1343,66 +1347,77 @@ panels:
         from VALU instructions. This is also presented as a percent of the peak theoretical
         F8 MFMA operations achievable on the specific accelerator. It is supported
         on AMD Instinct MI300 series and later only.
-      rst: 'The total number of 8-bit brain floating point :ref:`MFMA <desc-mfma>`
+      rst: |-
+        The total number of 8-bit brain floating point :ref:`MFMA <desc-mfma>`
         operations executed per second. Note: this does not include any 16-bit brain
         floating point operations from :ref:`VALU <desc-valu>` instructions. This
         is also presented as a percent of the peak theoretical F8 MFMA operations
         achievable on the specific accelerator. It is supported on AMD Instinct MI300
-        series and later only.'
+        series and later only.
       unit: GFLOPs
     MFMA FLOPs (BF16):
-      plain: 'The total number of 16-bit brain floating point MFMA operations executed
+      plain: |-
+        The total number of 16-bit brain floating point MFMA operations executed
         per second. Note: this does not include any 16-bit brain floating point operations
         from VALU instructions. This is also presented as a percent of the peak theoretical
-        BF16 MFMA operations achievable on the specific accelerator.'
-      rst: 'The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
+        BF16 MFMA operations achievable on the specific accelerator.
+      rst: |-
+        The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
         operations executed per second. Note: this does not include any 16-bit brain
         floating point operations from :ref:`VALU <desc-valu>` instructions. This
         is also presented as a percent of the peak theoretical BF16 MFMA operations
-        achievable on the specific accelerator.'
+        achievable on the specific accelerator.
       unit: GFLOPs
     MFMA FLOPs (F16):
-      plain: 'The total number of 16-bit floating point MFMA operations executed per
+      plain: |-
+        The total number of 16-bit floating point MFMA operations executed per
         second. Note: this does not include any 16-bit floating point operations from
         VALU instructions. This is also presented as a percent of the peak theoretical
-        F16 MFMA operations achievable on the specific accelerator.'
-      rst: 'The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
+        F16 MFMA operations achievable on the specific accelerator.
+      rst: |-
+        The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
         executed per second. Note: this does not include any 16-bit floating point
         operations from :ref:`VALU <desc-valu>` instructions. This is also presented
         as a percent of the peak theoretical F16 MFMA operations achievable on the
-        specific accelerator.'
+        specific accelerator.
       unit: GFLOPs
     MFMA FLOPs (F32):
-      plain: 'The total number of 32-bit floating point MFMA operations executed per
+      plain: |-
+        The total number of 32-bit floating point MFMA operations executed per
         second. Note: this does not include any 32-bit floating point operations from
         VALU instructions. This is also presented as a percent of the peak theoretical
-        F32 MFMA operations achievable on the specific accelerator.'
-      rst: 'The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
+        F32 MFMA operations achievable on the specific accelerator.
+      rst: |-
+        The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
         executed per second. Note: this does not include any 32-bit floating point
         operations from :ref:`VALU <desc-valu>` instructions. This is also presented
         as a percent of the peak theoretical F32 MFMA operations achievable on the
-        specific accelerator.'
+        specific accelerator.
       unit: GFLOPs
     MFMA FLOPs (F64):
-      plain: 'The total number of 64-bit floating point MFMA operations executed per
+      plain: |-
+        The total number of 64-bit floating point MFMA operations executed per
         second. Note: this does not include any 64-bit floating point operations from
         VALU instructions. This is also presented as a percent of the peak theoretical
-        F64 MFMA operations achievable on the specific accelerator.'
-      rst: 'The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
+        F64 MFMA operations achievable on the specific accelerator.
+      rst: |-
+        The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
         executed per second. Note: this does not include any 64-bit floating point
         operations from :ref:`VALU <desc-valu>` instructions. This is also presented
         as a percent of the peak theoretical F64 MFMA operations achievable on the
-        specific accelerator.'
+        specific accelerator.
       unit: GFLOPs
     MFMA IOPs (Int8):
-      plain: 'The total number of 8-bit integer MFMA operations executed per second.
+      plain: |-
+        The total number of 8-bit integer MFMA operations executed per second.
         Note: this does not include any 8-bit integer operations from VALU instructions.
         This is also presented as a percent of the peak theoretical INT8 MFMA operations
-        achievable on the specific accelerator.'
-      rst: 'The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
+        achievable on the specific accelerator.
+      rst: |-
+        The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
         per second. Note: this does not include any 8-bit integer operations from
         :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
-        of the peak theoretical INT8 MFMA operations achievable on the specific accelerator.'
+        of the peak theoretical INT8 MFMA operations achievable on the specific accelerator.
       unit: GIOPs
     Active CUs:
       plain: Total number of active compute units (CUs) on the accelerator during
@@ -1448,7 +1463,7 @@ panels:
       rst: Indicates what percent of the kernel's duration the :ref:`VMEM <desc-vmem>`
         unit was busy executing instructions, including both global/generic and spill/scratch
         operations (see the :ref:`VMEM instruction count metrics <ta-instruction-counts>`
-        for more detail).  Does not include :ref:`VALU <desc-valu>` operations. Computed
+        for more detail). Does not include :ref:`VALU <desc-valu>` operations. Computed
         as the ratio of the total number of cycles spent by the :ref:`scheduler <desc-scheduler>`
         issuing VMEM instructions over the :ref:`total CU cycles <total-cu-cycles>`.
       unit: Percent
@@ -1480,14 +1495,16 @@ panels:
         <compute-unit>` over the :ref:`total active CU cycles <total-active-cu-cycles>`.
       unit: Instructions per-cycle
     Wavefront Occupancy:
-      plain: 'The time-averaged number of wavefronts resident on the accelerator over
+      plain: |-
+        The time-averaged number of wavefronts resident on the accelerator over
         the lifetime of the kernel. Note: this metric may be inaccurate for short-running
         kernels (less than 1ms). This is also presented as a percent of the peak theoretical
-        occupancy achievable on the specific accelerator.'
-      rst: 'The time-averaged number of wavefronts resident on the accelerator over
+        occupancy achievable on the specific accelerator.
+      rst: |-
+        The time-averaged number of wavefronts resident on the accelerator over
         the lifetime of the kernel. Note: this metric may be inaccurate for short-running
         kernels (less than 1ms). This is also presented as a percent of the peak theoretical
-        occupancy achievable on the specific accelerator.'
+        occupancy achievable on the specific accelerator.
       unit: Wavefronts
     Theoretical LDS Bandwidth:
       plain: Indicates the maximum amount of bytes that could have been loaded from,
@@ -1505,17 +1522,17 @@ panels:
         number of cycles that would be spent in the LDS scheduler in a completely
         uncontended case. This is also presented in normalized form (i.e., the Bank
         Conflict Rate).
-      rst: The ratio of the number of cycles spent in the  :doc:`LDS scheduler <local-data-share>`
-        due to bank conflicts (as  determined by the conflict resolution hardware)
-        to the base number of  cycles that would be spent in the LDS scheduler in
-        a completely  uncontended case. This is also presented in normalized form
-        (i.e., the  Bank Conflict Rate).
+      rst: The ratio of the number of cycles spent in the :doc:`LDS scheduler <local-data-share>`
+        due to bank conflicts (as determined by the conflict resolution hardware)
+        to the base number of cycles that would be spent in the LDS scheduler in
+        a completely uncontended case. This is also presented in normalized form
+        (i.e., the Bank Conflict Rate).
       unit: Conflicts/Access
     vL1D Cache Hit Rate:
       plain: The ratio of the number of vL1D cache line requests that hit in vL1D
         cache over the total number of cache line requests to the vL1D cache RAM.
-      rst: The ratio of the number of vL1D cache line requests that hit in vL1D  cache
-        over the total number of cache line requests to the  :ref:`vL1D cache RAM
+      rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache
+        over the total number of cache line requests to the :ref:`vL1D cache RAM
         <desc-tc>`.
       unit: Percent
     vL1D Cache BW:
@@ -1526,19 +1543,19 @@ panels:
         line, the data movement will still be counted as a full cache line. This is
         also presented as a percent of the peak theoretical bandwidth achievable on
         the specific accelerator.
-      rst: The number of bytes looked up in the vL1D cache as a result of  :ref:`VMEM
-        <desc-vmem>` instructions per unit time. The number of bytes  is calculated
-        as the number of cache lines requested multiplied by the  cache line size.
-        This value does not consider partial requests, so e.g.,  if only a single
-        value is requested in a cache line, the data movement  will still be counted
-        as a full cache line. This is also presented as a  percent of the peak theoretical
-        bandwidth achievable on the specific  accelerator.
+      rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
+        <desc-vmem>` instructions per unit time. The number of bytes is calculated
+        as the number of cache lines requested multiplied by the cache line size.
+        This value does not consider partial requests, so e.g., if only a single
+        value is requested in a cache line, the data movement will still be counted
+        as a full cache line. This is also presented as a percent of the peak theoretical
+        bandwidth achievable on the specific accelerator.
       unit: GB/s
     L2 Cache Hit Rate:
       plain: The ratio of the number of L2 cache line requests that hit in the L2
         cache over the total number of incoming cache line requests to the L2 cache.
-      rst: The ratio of the number of L2 cache line requests that hit in the L2  cache
-        over the total number of incoming cache line requests to the L2  cache.
+      rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
+        over the total number of incoming cache line requests to the L2 cache.
       unit: Percent
     L2 Cache BW:
       plain: The number of bytes looked up in the L2 cache per unit time. The number
@@ -1547,77 +1564,79 @@ panels:
         if only a single value is requested in a cache line, the data movement will
         still be counted as a full cache line. This is also presented as a percent
         of the peak theoretical bandwidth achievable on the specific accelerator.
-      rst: The number of bytes looked up in the L2 cache per unit time.  The number  of
-        bytes is calculated as the number of cache lines requested multiplied  by
-        the cache line size. This value does not consider partial requests, so  e.g.,
-        if only a single value is requested in a cache line, the data  movement will
-        still be counted as a full cache line. This is also  presented as a percent
-        of the peak theoretical bandwidth achievable on  the specific accelerator.
+      rst: The number of bytes looked up in the L2 cache per unit time. The number of
+        bytes is calculated as the number of cache lines requested multiplied by
+        the cache line size. This value does not consider partial requests, so e.g.,
+        if only a single value is requested in a cache line, the data movement will
+        still be counted as a full cache line. This is also presented as a percent
+        of the peak theoretical bandwidth achievable on the specific accelerator.
       unit: GB/s
     L2-Fabric Read BW:
-      plain: "The number of bytes read by the L2 over the Infinity Fabric\u2122 interface\
-        \ per unit time. This is also presented as a percent of the peak theoretical\
-        \ bandwidth achievable on the specific accelerator."
-      rst: "The number of bytes read by the L2 over the  :ref:`Infinity Fabric\u2122\
-        \ interface <l2-fabric>` per unit time. This is also  presented as a percent\
-        \ of the peak theoretical bandwidth achievable on  the specific accelerator."
+      plain: |-
+        The number of bytes read by the L2 over the Infinity Fabric™ interface
+        per unit time. This is also presented as a percent of the peak theoretical
+        bandwidth achievable on the specific accelerator.
+      rst: |-
+        The number of bytes read by the L2 over the :ref:`Infinity Fabric™
+        interface <l2-fabric>` per unit time. This is also presented as a percent
+        of the peak theoretical bandwidth achievable on the specific accelerator.
       unit: GB/s
     L2-Fabric Write BW:
       plain: The number of bytes sent by the L2 over the Infinity Fabric interface
         by write and atomic operations per unit time. This is also presented as a
         percent of the peak theoretical bandwidth achievable on the specific accelerator.
-      rst: The number of bytes sent by the L2 over the  :ref:`Infinity Fabric interface
-        <l2-fabric>` by write and atomic  operations per unit time. This is also presented
-        as a percent of the peak  theoretical bandwidth achievable on the specific
+      rst: The number of bytes sent by the L2 over the :ref:`Infinity Fabric interface
+        <l2-fabric>` by write and atomic operations per unit time. This is also presented
+        as a percent of the peak theoretical bandwidth achievable on the specific
         accelerator.
       unit: GB/s
     L2-Fabric Read Latency:
       plain: The time-averaged number of cycles read requests spent in Infinity Fabric
         before data was returned to the L2.
-      rst: The time-averaged number of cycles read requests spent in Infinity Fabric  before
+      rst: The time-averaged number of cycles read requests spent in Infinity Fabric before
         data was returned to the L2.
       unit: Cycles
     L2-Fabric Write Latency:
       plain: The time-averaged number of cycles write requests spent in Infinity Fabric
         before a completion acknowledgement was returned to the L2.
-      rst: The time-averaged number of cycles write requests spent in Infinity  Fabric
+      rst: The time-averaged number of cycles write requests spent in Infinity Fabric
         before a completion acknowledgement was returned to the L2.
       unit: Cycles
     sL1D Cache Hit Rate:
       plain: The percent of sL1D requests that hit on a previously loaded line the
         cache. Calculated as the ratio of the number of sL1D requests that hit over
         the number of all sL1D requests.
-      rst: The percent of sL1D requests that hit on a previously loaded line the  cache.
-        Calculated as the ratio of the number of sL1D requests that hit  over the
+      rst: The percent of sL1D requests that hit on a previously loaded line in the cache.
+        Calculated as the ratio of the number of sL1D requests that hit over the
         number of all sL1D requests.
       unit: Percent
     sL1D Cache BW:
       plain: The number of bytes looked up in the sL1D cache per unit time. This is
         also presented as a percent of the peak theoretical bandwidth achievable on
         the specific accelerator.
-      rst: The number of bytes looked up in the sL1D cache per unit time. This is  also
-        presented as a percent of the peak theoretical bandwidth achievable  on the
+      rst: The number of bytes looked up in the sL1D cache per unit time. This is also
+        presented as a percent of the peak theoretical bandwidth achievable on the
         specific accelerator.
       unit: GB/s
     L1I Hit Rate:
       plain: The number of bytes looked up in the L1I cache per unit time. This is
         also presented as a percent of the peak theoretical bandwidth achievable on
         the specific accelerator.
-      rst: The percent of L1I requests that hit on a previously loaded line the  cache.
-        Calculated as the ratio of the number of L1I requests that hit  over the number
+      rst: The percent of L1I requests that hit on a previously loaded line in the cache.
+        Calculated as the ratio of the number of L1I requests that hit over the number
         of all L1I requests.
       unit: GB/s
     L1I BW:
       plain: The percent of L1I requests that hit on a previously loaded line the
         cache. Calculated as the ratio of the number of L1I requests that hit over
         the number of all L1I requests.
-      rst: The number of bytes looked up in the L1I cache per unit time. This is  also
-        presented as a percent of the peak theoretical bandwidth achievable  on the
+      rst: The number of bytes looked up in the L1I cache per unit time. This is also
+        presented as a percent of the peak theoretical bandwidth achievable on the
         specific accelerator.
       unit: Percent
     L1I Fetch Latency:
       plain: The average number of cycles spent to fetch instructions to a CU.
-      rst: The average number of cycles spent to fetch instructions to a  :doc:`CU
+      rst: The average number of cycles spent to fetch instructions to a :doc:`CU
         <compute-unit>`.
       unit: Cycles
 - id: 300
@@ -1757,13 +1776,13 @@ panels:
             value: ROUND(AVG((TCC_EA_ATOMIC_sum / $denom)), 0)
           Fabric Rd Lat:
             value: ROUND(AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
-              != 0) else  0)), 0)
+              != 0) else 0)), 0)
           Fabric Wr Lat:
             value: ROUND(AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
-              != 0) else  0)), 0)
+              != 0) else 0)), 0)
           Fabric Atomic Lat:
             value: ROUND(AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum
-              != 0) else  0)), 0)
+              != 0) else 0)), 0)
           HBM Rd:
             value: ROUND(AVG((TCC_EA_RDREQ_DRAM_sum / $denom)), 0)
           HBM Wr:
@@ -1887,13 +1906,13 @@ panels:
             value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0)
           Fabric Rd Lat:
             value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else  0)), 0)
+              != 0) else 0)), 0)
           Fabric Wr Lat:
             value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else  0)), 0)
+              != 0) else 0)), 0)
           Fabric Atomic Lat:
             value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
-              != 0) else  0)), 0)
+              != 0) else 0)), 0)
           HBM Rd:
             value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0)
           HBM Wr:
@@ -2017,13 +2036,13 @@ panels:
             value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0)
           Fabric Rd Lat:
             value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else  0)), 0)
+              != 0) else 0)), 0)
           Fabric Wr Lat:
             value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else  0)), 0)
+              != 0) else 0)), 0)
           Fabric Atomic Lat:
             value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
-              != 0) else  0)), 0)
+              != 0) else 0)), 0)
           HBM Rd:
             value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0)
           HBM Wr:
@@ -2147,13 +2166,13 @@ panels:
             value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0)
           Fabric Rd Lat:
             value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else  0)), 0)
+              != 0) else 0)), 0)
           Fabric Wr Lat:
             value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else  0)), 0)
+              != 0) else 0)), 0)
           Fabric Atomic Lat:
             value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
-              != 0) else  0)), 0)
+              != 0) else 0)), 0)
           HBM Rd:
             value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0)
           HBM Wr:
@@ -2287,13 +2306,13 @@ panels:
             value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0)
           Fabric Rd Lat:
             value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else  0)), 0)
+              != 0) else 0)), 0)
           Fabric Wr Lat:
             value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else  0)), 0)
+              != 0) else 0)), 0)
           Fabric Atomic Lat:
             value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
-              != 0) else  0)), 0)
+              != 0) else 0)), 0)
           HBM Rd:
             value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0)
           HBM Wr:
@@ -2425,13 +2444,13 @@ panels:
             value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0)
           Fabric Rd Lat:
             value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else  0)), 0)
+              != 0) else 0)), 0)
           Fabric Wr Lat:
             value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else  0)), 0)
+              != 0) else 0)), 0)
           Fabric Atomic Lat:
             value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
-              != 0) else  0)), 0)
+              != 0) else 0)), 0)
           HBM Rd:
             value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0)
           HBM Wr:
@@ -2505,36 +2524,42 @@ panels:
       rst: Total number of compute units (CUs) on the accelerator.
       unit: CUs
     VGPR:
-      plain: 'The number of architected vector general-purpose registers allocated
+      plain: |-
+        The number of architected vector general-purpose registers allocated
         for the kernel, see VALU. Note: this may not exactly match the number of VGPRs
-        requested by the compiler due to allocation granularity.'
-      rst: 'The number of architected vector general-purpose registers allocated for  the
-        kernel, see :ref:`VALU <desc-valu>`.  Note: this may not exactly  match the
-        number of VGPRs requested by the compiler due to allocation  granularity.'
+        requested by the compiler due to allocation granularity.
+      rst: |-
+        The number of architected vector general-purpose registers allocated for the
+        kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the
+        number of VGPRs requested by the compiler due to allocation granularity.
       unit: VGPRs
     SGPR:
-      plain: 'The number of scalar general-purpose registers allocated for the kernel,
+      plain: |-
+        The number of scalar general-purpose registers allocated for the kernel,
         see SALU. Note: this may not exactly match the number of SGPRs requested by
-        the compiler due to allocation granularity.'
-      rst: 'The number of scalar general-purpose registers allocated for the kernel,  see
-        :ref:`SALU <desc-salu>`.  Note: this may not exactly match the number  of
-        SGPRs requested by the compiler due to allocation granularity.'
+        the compiler due to allocation granularity.
+      rst: |-
+        The number of scalar general-purpose registers allocated for the kernel, see
+        :ref:`SALU <desc-salu>`. Note: this may not exactly match the number of
+        SGPRs requested by the compiler due to allocation granularity.
       unit: SGPRs
     LDS Allocation:
-      plain: 'The number of bytes of LDS memory (or, shared memory) allocated for
+      plain: |-
+        The number of bytes of LDS memory (or, shared memory) allocated for
         this kernel. Note: This may also be larger than what was requested at compile
-        time due to both allocation granularity and dynamic per-dispatch LDS allocations.'
-      rst: 'The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared  memory)
-        allocated for this kernel.  Note: This may also be larger than  what was requested
-        at compile time due to both allocation granularity and  dynamic per-dispatch
-        LDS allocations.'
+        time due to both allocation granularity and dynamic per-dispatch LDS allocations.
+      rst: |-
+        The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory)
+        allocated for this kernel. Note: This may also be larger than what was requested
+        at compile time due to both allocation granularity and dynamic per-dispatch
+        LDS allocations.
       unit: Bytes per workgroup
     Scratch Allocation:
       plain: The number of bytes of scratch memory requested per work-item for this
         kernel. Scratch memory is used for stack memory on the accelerator, as well
         as for register spills and restores.
-      rst: The number of bytes of :ref:`scratch memory <memory-spaces>` requested  per
-        work-item for this kernel. Scratch memory is used for stack memory  on the
+      rst: The number of bytes of :ref:`scratch memory <memory-spaces>` requested per
+        work-item for this kernel. Scratch memory is used for stack memory on the
         accelerator, as well as for register spills and restores.
       unit: Bytes per workgroup
     Wavefronts:
@@ -2551,49 +2576,49 @@ panels:
       plain: The total number of LDS instructions (including, but not limited to,
         read/write/atomics and HIP's __shfl instructions) executed per normalization
         unit.
-      rst: The total number of LDS instructions (including, but not limited to,  read/write/atomics
-        and HIP's ``__shfl`` instructions) executed per  :ref:`normalization unit
-        <normalization-units>`.
+      rst: The total number of LDS instructions (including, but not limited to,
+        read/write/atomics and HIP's ``__shfl`` instructions) executed
+        per :ref:`normalization unit <normalization-units>`.
       unit: Instructions per normalization unit
     LDS Util:
       plain: Indicates what percent of the kernel's duration the LDS was actively
         executing instructions (including, but not limited to, load, store, atomic
         and HIP's __shfl operations). Calculated as the ratio of the total number
         of cycles LDS was active over the total CU cycles.
-      rst: Indicates what percent of the kernel's duration the :ref:`LDS <desc-lds>`  was
-        actively executing instructions (including, but not limited to, load,  store,
-        atomic and HIP's ``__shfl`` operations).  Calculated as the ratio  of the
-        total number of cycles LDS was active over the  :ref:`total CU cycles <total-cu-cycles>`.
+      rst: Indicates what percent of the kernel's duration the :ref:`LDS <desc-lds>` was
+        actively executing instructions (including, but not limited to, load, store,
+        atomic and HIP's ``__shfl`` operations). Calculated as the ratio of the
+        total number of cycles LDS was active over the :ref:`total CU cycles <total-cu-cycles>`.
       unit: Percent
     LDS Latency:
       plain: The average number of round-trip cycles (i.e., from issue to data-return
         / acknowledgment) required for an LDS instruction to complete.
-      rst: The average number of round-trip cycles (i.e., from issue to data-return  /
+      rst: The average number of round-trip cycles (i.e., from issue to data-return /
         acknowledgment) required for an LDS instruction to complete.
       unit: Cycles
     VL1 Rd:
       plain: The total number of incoming read requests from the address processing
         unit after coalescing per normalization unit
-      rst: The total number of incoming read requests from the  :ref:`address processing
-        unit <desc-ta>` after coalescing per  :ref:`normalization unit <normalization-units>`
+      rst: The total number of incoming read requests from the :ref:`address processing
+        unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`
       unit: Requests per normalization unit
     VL1 Wr:
       plain: The total number of incoming write requests from the address processing
         unit after coalescing per normalization unit
-      rst: The total number of incoming write requests from the  :ref:`address processing
-        unit <desc-ta>` after coalescing per  :ref:`normalization unit <normalization-units>`
+      rst: The total number of incoming write requests from the :ref:`address processing
+        unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`
       unit: Requests per normalization unit
     VL1 Atomic:
       plain: The total number of incoming atomic requests from the address processing
         unit after coalescing per normalization unit
-      rst: The total number of incoming atomic requests from the  :ref:`address processing
-        unit <desc-ta>` after coalescing per  :ref:`normalization unit <normalization-units>`
+      rst: The total number of incoming atomic requests from the :ref:`address processing
+        unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`
       unit: Requests per normalization unit
     VL1 Hit:
       plain: The ratio of the number of vL1D cache line requests that hit in vL1D
         cache over the total number of cache line requests to the vL1D Cache RAM.
-      rst: The ratio of the number of vL1D cache line requests that hit in vL1D  cache
-        over the total number of cache line requests to the  :ref:`vL1D Cache RAM
+      rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache
+        over the total number of cache line requests to the :ref:`vL1D Cache RAM
         <desc-tc>`.
       unit: Percent
     VL1 Lat:
@@ -2607,9 +2632,9 @@ panels:
         processing unit, ranging from uncoalesced (25%) to fully coalesced (100%).
         Calculated as the average number of thread-requests generated per instruction
         divided by the ideal number of thread-requests per instruction.
-      rst: Indicates how well memory instructions were coalesced by the  :ref:`address
-        processing unit <desc-ta>`, ranging from uncoalesced (25%)  to fully coalesced
-        (100%). Calculated as the average number of  :ref:`thread-requests <thread-requests>`
+      rst: Indicates how well memory instructions were coalesced by the :ref:`address
+        processing unit <desc-ta>`, ranging from uncoalesced (25%) to fully coalesced
+        (100%). Calculated as the average number of :ref:`thread-requests <thread-requests>`
         generated per instruction divided by the ideal number of thread-requests per
         instruction.
       unit: Percent
@@ -2617,79 +2642,79 @@ panels:
       plain: The ratio of the number of cycles where the vL1D is stalled waiting to
         issue a request for data to the L2 cache divided by the number of cycles where
         the vL1D is active.
-      rst: The ratio of the number of cycles where the vL1D is stalled waiting to  issue
-        a request for data to the :doc:`L2 cache <l2-cache>` divided by the  number
+      rst: The ratio of the number of cycles where the vL1D is stalled waiting to issue
+        a request for data to the :doc:`L2 cache <l2-cache>` divided by the number
         of cycles where the vL1D is active [#vl1d-activity]_.
       unit: Percent
     VL1_L2 Rd:
       plain: The number of read requests for a vL1D cache line that were not satisfied
         by the vL1D and must be retrieved from the to the L2 Cache per normalization
         unit.
-      rst: The number of read requests for a vL1D cache line that were not satisfied  by
-        the vL1D and must be retrieved from the to the  :doc:`L2 Cache <l2-cache>`
-        per  :ref:`normalization unit <normalization-units>`.
+      rst: The number of read requests for a vL1D cache line that were not satisfied by
+        the vL1D and must be retrieved from the :doc:`L2 Cache <l2-cache>`
+        per :ref:`normalization unit <normalization-units>`.
       unit: Requests per normalization unit
     VL1_L2 Wr:
       plain: The number of write requests to a vL1D cache line that were sent through
         the vL1D to the L2 cache, per normalization unit.
-      rst: The number of write requests to a vL1D cache line that were sent through  the
-        vL1D to the :doc:`L2 cache <l2-cache>`, per  :ref:`normalization unit <normalization-units>`.
+      rst: The number of write requests to a vL1D cache line that were sent through the
+        vL1D to the :doc:`L2 cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`.
       unit: Requests per normalization unit
     VL1_L2 Atomic:
       plain: The number of atomic requests that are sent through the vL1D to the L2
         cache, per normalization unit. This includes requests for atomics with, and
         without return.
-      rst: The number of atomic requests that are sent through the vL1D to the  :doc:`L2
-        cache <l2-cache>`, per  :ref:`normalization unit <normalization-units>`. This
-        includes requests  for atomics with, and without return.
+      rst: The number of atomic requests that are sent through the vL1D to the :doc:`L2
+        cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`. This
+        includes requests for atomics with, and without return.
       unit: Requests per normalization unit
     sL1D Rd:
       plain: The total number of requests, of any size or type, made to the sL1D per
         normalization unit.
-      rst: The total number of requests, of any size or type, made to the sL1D per  :ref:`normalization
+      rst: The total number of requests, of any size or type, made to the sL1D per :ref:`normalization
         unit <normalization-units>`.
       unit: Requests per normalization unit
     sL1D Hit:
       plain: The total number of sL1D requests that hit on a previously loaded cache
         line, per normalization unit.
-      rst: The total number of sL1D requests that hit on a previously loaded cache  line,
+      rst: The total number of sL1D requests that hit on a previously loaded cache line,
         per :ref:`normalization unit <normalization-units>`.
       unit: Requests per normalization unit
     sL1D_L2 Rd:
       plain: The total number of read requests from sL1D to the L2, per normalization
         unit.
-      rst: The total number of read requests from sL1D to the :doc:`L2 <l2-cache>`,  per
+      rst: The total number of read requests from sL1D to the :doc:`L2 <l2-cache>`, per
         :ref:`normalization unit <normalization-units>`.
       unit: Requests per normalization unit
     sL1D_L2 Wr:
       plain: The total number of write requests from sL1D to the L2, per normalization
         unit. Typically unused on current CDNA accelerators.
-      rst: The total number of write requests from sL1D to the :doc:`L2 <l2-cache>`,  per
+      rst: The total number of write requests from sL1D to the :doc:`L2 <l2-cache>`, per
         :ref:`normalization unit <normalization-units>`. Typically unused on current
         CDNA accelerators.
       unit: Requests per normalization unit
     sL1D_L2 Atomic:
       plain: The total number of atomic requests from sL1D to the L2, per normalization
         unit. Typically unused on current CDNA accelerators.
-      rst: The total number of atomic requests from sL1D to the  :doc:`L2 <l2-cache>`,
-        per  :ref:`normalization unit <normalization-units>`. Typically unused on  current
+      rst: The total number of atomic requests from sL1D to the :doc:`L2 <l2-cache>`,
+        per :ref:`normalization unit <normalization-units>`. Typically unused on current
         CDNA accelerators.
       unit: Requests per normalization unit
     IL1 Fetch:
       plain: The total number of requests made to the L1I per normalization-unit.
-      rst: The total number of requests made to the L1I per  :ref:`normalization-unit
+      rst: The total number of requests made to the L1I per :ref:`normalization-unit
         <normalization-units>`.
       unit: Requests per normalization unit
     IL1 Hit:
       plain: The percent of L1I requests that hit on a previously loaded line the
         cache. Calculated as the ratio of the number of L1I requests that hit over
         the number of all L1I requests.
-      rst: The total number of L1I requests that hit on a previously loaded cache  line,
+      rst: The total number of L1I requests that hit on a previously loaded cache line,
         per :ref:`normalization-unit <normalization-units>`.
       unit: Percent
     IL1 Lat:
       plain: The average number of cycles spent to fetch instructions to a CU.
-      rst: The average number of cycles spent to fetch instructions to a  :doc:`CU
+      rst: The average number of cycles spent to fetch instructions to a :doc:`CU
         <compute-unit>`.
       unit: Cycles
     IL1_L2 Rd:
@@ -2707,7 +2732,7 @@ panels:
     L2 Atomic:
       plain: The total number of atomic requests (with and without return) to the
         L2 from all clients.
-      rst: The total number of atomic requests (with and without return) to the L2  from
+      rst: The total number of atomic requests (with and without return) to the L2 from
         all clients.
       unit: Requests per normalization unit
     L2 Hit:
@@ -2720,17 +2745,17 @@ panels:
       plain: Calculated as the average number of cycles that the vL1D cache took to
         issue and receive read requests from the L2 Cache. This number also includes
         requests for atomics with return values.
-      rst: Calculated as the average number of cycles that the vL1D cache took to  issue
-        and receive read requests from the :doc:`L2 Cache <l2-cache>`. This  number
+      rst: Calculated as the average number of cycles that the vL1D cache took to issue
+        and receive read requests from the :doc:`L2 Cache <l2-cache>`. This number
         also includes requests for atomics with return values.
       unit: Cycles
     L2 Wr Lat:
       plain: Calculated as the average number of cycles that the vL1D cache took to
         issue and receive acknowledgement of a write request to the L2 Cache. This
         number also includes requests for atomics without return values.
-      rst: Calculated as the average number of cycles that the vL1D cache took to  issue
-        and receive acknowledgement of a write request to the  :doc:`L2 Cache <l2-cache>`.
-        This number also includes requests for  atomics without return values.
+      rst: Calculated as the average number of cycles that the vL1D cache took to issue
+        and receive acknowledgement of a write request to the :doc:`L2 Cache <l2-cache>`.
+        This number also includes requests for atomics without return values.
       unit: Cycles
     Fabric_L2 Rd:
       plain: Number of L2 cache - Infinity Fabric read requests (either 32-byte or
@@ -2775,17 +2800,18 @@ panels:
     HBM Rd:
       plain: The total number of L2 requests to Infinity Fabric to read 32B or 64B
         of data from the accelerator's local HBM, per normalization unit.
-      rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of  data
-        from the accelerator's local HBM, per  :ref:`normalization unit <normalization-units>`.
-        See  :ref:`l2-request-flow` for more detail.
+      rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of data
+        from the accelerator's local HBM, per :ref:`normalization unit <normalization-units>`.
+        See :ref:`l2-request-flow` for more detail.
       unit: Requests per normalization unit
     HBM Wr:
-      plain: 'The total number of L2 requests to Infinity Fabric to write or atomically
-        update 32B or 64B of data in the accelerator''s local HBM, per normalization
-        unit. '
+      plain: |-
+        The total number of L2 requests to Infinity Fabric to write or atomically
+        update 32B or 64B of data in the accelerator's local HBM, per normalization
+        unit.
       rst: The total number of L2 requests to Infinity Fabric to write 32B or 64B
-        of  data from the accelerator's local HBM, per  :ref:`normalization unit <normalization-units>`.
-        See  :ref:`l2-request-flow` for more detail.
+        of data to the accelerator's local HBM, per :ref:`normalization unit <normalization-units>`.
+        See :ref:`l2-request-flow` for more detail.
       unit: Requests per normalization unit
 - id: 400
   title: Roofline
@@ -3245,7 +3271,7 @@ panels:
                 (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)
               ) /
               SUM(
-                (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + 
+                (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
                 TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64
               )
               )
@@ -3319,7 +3345,7 @@ panels:
                 (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)
               ) /
               SUM(
-                (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + 
+                (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
                 TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64
               )
               )
@@ -3394,7 +3420,7 @@ panels:
                 (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
               ) /
               SUM(
-                (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + 
+                (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
                 TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64
               )
               )
@@ -3471,7 +3497,7 @@ panels:
                 (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)
               ) /
               SUM(
-                (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + 
+                (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
                 TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64
               )
               )
@@ -3509,7 +3535,7 @@ panels:
               ) /
               (SUM(End_Timestamp - Start_Timestamp) / 1e9)
               ) / 1e9
-            unit: GFLOP/s        
+            unit: GFLOP/s
         gfx942:
           AI HBM:
             value: ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16
@@ -3532,7 +3558,7 @@ panels:
               + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
               (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 *
               512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8
-              * 512) ) / SUM( (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + 
+              * 512) ) / SUM( (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
               TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64 ) )
             unit: FLOPs/Byte
           AI L1:
@@ -3596,7 +3622,7 @@ panels:
                 (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512)
               ) /
               SUM(
-                (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + 
+                (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
                 TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64
               )
               )
@@ -3639,34 +3665,40 @@ panels:
             unit: GFLOP/s
   metrics_description:
       VALU FLOPs (F16):
-        plain: 'The total 16-bit floating-point operations executed per second on the VALU.
+        plain: |-
+          The total 16-bit floating-point operations executed per second on the VALU.
           This is presented with the value of the peak empirical F16 FLOPs achievable
           on the specific accelerator. Note: this does not include any F16 operations
-          from MFMA instructions.'
-        rst: 'The total 16-bit floating-point operations executed per second on the :ref:`VALU
+          from MFMA instructions.
+        rst: |-
+          The total 16-bit floating-point operations executed per second on the :ref:`VALU
           <desc-valu>`. This is presented with the value of the peak empirical F16 FLOPs achievable
           on the specific accelerator. Note: this does not include any F16 operations
-          from :ref:`MFMA <desc-mfma>` instructions.'
+          from :ref:`MFMA <desc-mfma>` instructions.
         unit: GFLOPs
       VALU FLOPs (F32):
-        plain: 'The total 32-bit floating-point operations executed per second on the VALU.
+        plain: |-
+          The total 32-bit floating-point operations executed per second on the VALU.
           This is presented with the value of the peak empirical F32 FLOPs achievable
           on the specific accelerator. Note: this does not include any F32 operations
-          from MFMA instructions.'
-        rst: 'The total 32-bit floating-point operations executed per second on the :ref:`VALU
+          from MFMA instructions.
+        rst: |-
+          The total 32-bit floating-point operations executed per second on the :ref:`VALU
           <desc-valu>`. This is presented with the value of the peak empirical F32 FLOPs achievable
           on the specific accelerator. Note: this does not include any F32 operations
-          from :ref:`MFMA <desc-mfma>` instructions.'
+          from :ref:`MFMA <desc-mfma>` instructions.
         unit: GFLOPs
       VALU FLOPs (F64):
-        plain: 'The total 64-bit floating-point operations executed per second on the VALU.
+        plain: |-
+          The total 64-bit floating-point operations executed per second on the VALU.
           This is presented with the value of the peak empirical F64 FLOPs achievable
           on the specific accelerator. Note: this does not include any F64 operations
-          from MFMA instructions.'
-        rst: 'The total 64-bit floating-point operations executed per second on the :ref:`VALU
+          from MFMA instructions.
+        rst: |-
+          The total 64-bit floating-point operations executed per second on the :ref:`VALU
           <desc-valu>`. This is presented with the value of the peak empirical F64 FLOPs achievable
           on the specific accelerator. Note: this does not include any F64 operations
-          from :ref:`MFMA <desc-mfma>` instructions.'
+          from :ref:`MFMA <desc-mfma>` instructions.
         unit: GFLOPs
       MFMA FLOPs (F8):
         plain: The total number of 8-bit brain floating point MFMA operations executed
@@ -3674,89 +3706,104 @@ panels:
           from VALU instructions. The peak empirically measured F8 MFMA operations
           achievable on the specific accelerator is displayed alongside for comparison.
           It is supported on AMD Instinct MI300 series and later only.
-        rst: 'The total number of 8-bit brain floating point :ref:`MFMA <desc-mfma>`
+        rst: |-
+          The total number of 8-bit brain floating point :ref:`MFMA <desc-mfma>`
           operations executed per second. Note: this does not include any 16-bit brain
           floating point operations from :ref:`VALU <desc-valu>` instructions. The
           peak empirically measured F8 MFMA operations achievable on the specific
           accelerator is displayed alongside for comparison. It is supported on AMD
-          Instinct MI300 series and later only.'
+          Instinct MI300 series and later only.
         unit: GFLOPs
       MFMA FLOPs (BF16):
-        plain: 'The total number of 16-bit brain floating point MFMA operations executed
+        plain: |-
+          The total number of 16-bit brain floating point MFMA operations executed
           per second. Note: this does not include any 16-bit brain floating point
           operations from VALU instructions. The peak empirically measured BF16 MFMA
           operations achievable on the specific accelerator is displayed alongside
-          for comparison.'
-        rst: 'The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
+          for comparison.
+        rst: |-
+          The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
           operations executed per second. Note: this does not include any 16-bit brain
           floating point operations from :ref:`VALU <desc-valu>` instructions. The
           peak empirically measured BF16 MFMA operations achievable on the specific
-          accelerator is displayed alongside for comparison.'
+          accelerator is displayed alongside for comparison.
         unit: GFLOPs
       MFMA FLOPs (F16):
-        plain: 'The total number of 16-bit floating point MFMA operations executed per
+        plain: |-
+          The total number of 16-bit floating point MFMA operations executed per
           second. Note: this does not include any 16-bit floating point operations from
           VALU instructions. The peak empirically measured F16 MFMA operations
-          achievable on the specific accelerator is displayed alongside for comparison.'
-        rst: 'The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
+          achievable on the specific accelerator is displayed alongside for comparison.
+        rst: |-
+          The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
           executed per second. Note: this does not include any 16-bit floating point
           operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
           measured F16 MFMA operations achievable on the specific accelerator is
-          displayed alongside for comparison.'
+          displayed alongside for comparison.
         unit: GFLOPs
       MFMA FLOPs (F32):
-        plain: 'The total number of 32-bit floating point MFMA operations executed per
+        plain: |-
+          The total number of 32-bit floating point MFMA operations executed per
           second. Note: this does not include any 32-bit floating point operations from
           VALU instructions. The peak empirically measured F32 MFMA operations
-          achievable on the specific accelerator is displayed alongside for comparison.'
-        rst: 'The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
+          achievable on the specific accelerator is displayed alongside for comparison.
+        rst: |-
+          The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
           executed per second. Note: this does not include any 32-bit floating point
           operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
           measured F32 MFMA operations achievable on the specific accelerator is
-          displayed alongside for comparison.'
+          displayed alongside for comparison.
         unit: GFLOPs
       MFMA FLOPs (F64):
-        plain: 'The total number of 64-bit floating point MFMA operations executed per
+        plain: |-
+          The total number of 64-bit floating point MFMA operations executed per
           second. Note: this does not include any 64-bit floating point operations from
           VALU instructions. The peak empirically measured F64 MFMA operations
-          achievable on the specific accelerator is displayed alongside for comparison.'
-        rst: 'The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
+          achievable on the specific accelerator is displayed alongside for comparison.
+        rst: |-
+          The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
           executed per second. Note: this does not include any 64-bit floating point
           operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
           measured F64 MFMA operations achievable on the specific accelerator is
-          displayed alongside for comparison.'
+          displayed alongside for comparison.
         unit: GFLOPs
       MFMA FLOPs (F6F4):
-        plain: 'The total number of 4-bit and 6-bit floating point MFMA operations executed
+        plain: |-
+          The total number of 4-bit and 6-bit floating point MFMA operations executed
           per second. Note: this does not include any floating point operations from
           VALU instructions. The peak empirically measured F6F4 MFMA operations
           achievable on the specific accelerator is displayed alongside for comparison.
-          It is supported on AMD Instinct MI350 series (gfx950) and later only.'
-        rst: 'The total number of 4-bit and 6-bit floating point :ref:`MFMA <desc-mfma>`
+          It is supported on AMD Instinct MI350 series (gfx950) and later only.
+        rst: |-
+          The total number of 4-bit and 6-bit floating point :ref:`MFMA <desc-mfma>`
           operations executed per second. Note: this does not include any floating point
           operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
           measured F6F4 MFMA operations achievable on the specific accelerator is
           displayed alongside for comparison. It is supported on AMD Instinct MI350
-          series (gfx950) and later only.'
+          series (gfx950) and later only.
         unit: GFLOPs
       MFMA IOPs (Int8):
-        plain: 'The total number of 8-bit integer MFMA operations executed per second.
+        plain: |-
+          The total number of 8-bit integer MFMA operations executed per second.
           Note: this does not include any 8-bit integer operations from VALU instructions.
           The peak empirically measured INT8 MFMA operations achievable on the specific
-          accelerator is displayed alongside for comparison.'
-        rst: 'The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
+          accelerator is displayed alongside for comparison.
+        rst: |-
+          The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
           per second. Note: this does not include any 8-bit integer operations from
           :ref:`VALU <desc-valu>` instructions. The peak empirically measured INT8 MFMA
           operations achievable on the specific accelerator is displayed alongside
-          for comparison.'
+          for comparison.
         unit: GIOPs
       HBM Bandwidth:
-        plain: 'The total number of bytes read from and written to High-Bandwidth
-            Memory (HBM) per second. The peak empirically measured bandwidth achievable
-            on the specific accelerator is displayed alongside for comparison.'
-        rst: 'The total number of bytes read from and written to High-Bandwidth
-            Memory (HBM) per second. The peak empirically measured bandwidth achievable
-            on the specific accelerator is displayed alongside for comparison.'
+        plain: |-
+          The total number of bytes read from and written to High-Bandwidth
+          Memory (HBM) per second. The peak empirically measured bandwidth achievable
+          on the specific accelerator is displayed alongside for comparison.
+        rst: |-
+          The total number of bytes read from and written to High-Bandwidth
+          Memory (HBM) per second. The peak empirically measured bandwidth achievable
+          on the specific accelerator is displayed alongside for comparison.
         unit: GB/s
       L2 Cache Bandwidth:
         plain: The number of bytes looked up in the L2 cache per unit time. The number
@@ -3801,44 +3848,52 @@ panels:
           alongside for comparison.
         unit: GB/s
       AI L1:
-        plain: 'The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
+        plain: |-
+          The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
           of total floating-point operations (FLOPs) to total bytes transferred between
           the L1 cache and the processing units. This value is used as the x-coordinate
-          for the L1 roofline.'
-        rst: 'The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
+          for the L1 roofline.
+        rst: |-
+          The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
           of total floating-point operations (FLOPs) to total bytes transferred between
           the L1 cache and the processing units. This value is used as the x-coordinate
-          for the L1 roofline.'
+          for the L1 roofline.
         unit: FLOPs/Byte
       AI L2:
-        plain: 'The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
+        plain: |-
+          The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
           of total floating-point operations (FLOPs) to total bytes transferred between
           the L2 cache and the L1 cache. This value is used as the x-coordinate for
-          the L2 roofline.'
-        rst: 'The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
+          the L2 roofline.
+        rst: |-
+          The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
           of total floating-point operations (FLOPs) to total bytes transferred between
           the L2 cache and the L1 cache. This value is used as the x-coordinate for
-          the L2 roofline.'
+          the L2 roofline.
         unit: FLOPs/Byte
       AI HBM:
-        plain: 'The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
+        plain: |-
+          The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
           It is the ratio of total floating-point operations (FLOPs) to total bytes
           transferred between HBM and the L2 cache. This value is used as the x-coordinate
-          for the HBM roofline.'
-        rst: 'The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
+          for the HBM roofline.
+        rst: |-
+          The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
           It is the ratio of total floating-point operations (FLOPs) to total bytes
           transferred between HBM and the L2 cache. This value is used as the x-coordinate
-          for the HBM roofline.'
+          for the HBM roofline.
         unit: FLOPs/Byte
       Performance (GFLOPs):
-        plain: 'The overall achieved performance, measured in GigaFLOPs
+        plain: |-
+          The overall achieved performance, measured in GigaFLOPs
           per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
           operations divided by the total execution time. This value is used as the y-coordinate
-          for the kernel''s point on the Roofline plot.'
-        rst: 'The overall achieved performance, measured in GigaFLOPs
+          for the kernel's point on the Roofline plot.
+        rst: |-
+          The overall achieved performance, measured in GigaFLOPs
           per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
           operations divided by the total execution time. This value is used as the y-coordinate
-          for the kernel''s point on the Roofline plot.'
+          for the kernel's point on the Roofline plot.
         unit: GFLOP/s
 - id: 500
   title: Command Processor (CPC/CPF)
@@ -4490,8 +4545,8 @@ panels:
       plain: Percent of total cycles counted by the CPF-L2 interface where the CPF-L2
         interface was active doing any work. The ratio of CPF-L2 busy cycles over
         total cycles counted by the CPF-L2.
-      rst: Percent of total cycles counted by the CPF-:doc:`L2 <l2-cache>` interface  where
-        the CPF-L2 interface was active doing any work. The ratio of CPF-L2  busy
+      rst: Percent of total cycles counted by the CPF-:doc:`L2 <l2-cache>` interface where
+        the CPF-L2 interface was active doing any work. The ratio of CPF-L2 busy
         cycles over total cycles counted by the CPF-L2.
       unit: Percent
     CPF-L2 Stall:
@@ -4521,13 +4576,13 @@ panels:
     CPC-Workgroup Manager Utilization:
       plain: Percent of CPC busy cycles spent dispatching workgroups to the workgroup
         manager.
-      rst: Percent of CPC busy cycles spent dispatching workgroups to the  :ref:`workgroup
+      rst: Percent of CPC busy cycles spent dispatching workgroups to the :ref:`workgroup
         manager <desc-spi>`.
       unit: Percent
     CPC-L2 Utilization:
       plain: Percent of total cycles counted by the CPC-L2 interface where the CPC-L2
         interface was active doing any work.
-      rst: Percent of total cycles counted by the CPC-:doc:`L2 <l2-cache>` interface  where
+      rst: Percent of total cycles counted by the CPC-:doc:`L2 <l2-cache>` interface where
         the CPC-L2 interface was active doing any work.
       unit: Percent
     CPC-UTCL1 Stall:
@@ -4535,10 +4590,11 @@ panels:
       rst: Percent of CPC busy cycles where the CPC was stalled by address translation
       unit: Percent
     CPC-UTCL2 Utilization:
-      plain: 'Percent of total cycles counted by the CPC''s L2 address translation
-        interface where the CPC was busy doing address translation work.  '
-      rst: Percent of total cycles counted by the CPC's :doc:`L2 <l2-cache>` address  translation
-        interface where the CPC was busy doing address translation  work.
+      plain: |-
+        Percent of total cycles counted by the CPC's L2 address translation
+        interface where the CPC was busy doing address translation work.
+      rst: Percent of total cycles counted by the CPC's :doc:`L2 <l2-cache>` address translation
+        interface where the CPC was busy doing address translation work.
       unit: Percent
 - id: 600
   title: Workgroup Manager (SPI)
@@ -5419,9 +5475,10 @@ panels:
     Scheduler-Pipe Utilization:
       plain: The percent of total scheduler-pipe cycles in the kernel where the scheduler-pipes
         were actively doing any work.
-      rst: 'The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
-        in  the kernel where the scheduler-pipes were actively doing any work. Note:  this
-        value is expected to range between 0% and 25%. See :ref:`desc-spi`.'
+      rst: |-
+        The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
+        in the kernel where the scheduler-pipes were actively doing any work. Note: this
+        value is expected to range between 0% and 25%. See :ref:`desc-spi`.
       unit: Percent
     Workgroup Manager Utilization:
       plain: The percent of cycles in the kernel where the workgroup manager was actively
@@ -5434,20 +5491,20 @@ panels:
         in a shader-engine was actively doing any work, normalized over all shader-engines.
         Low values (e.g., << 100%) indicate that the accelerator was not fully saturated
         by the kernel, or a potential load-imbalance issue.
-      rst: The percent of :ref:`total shader engine cycles <total-se-cycles>` in the  kernel
-        where any CU in a shader-engine was actively doing any work,  normalized over
-        all shader-engines. Low values (e.g., << 100%) indicate  that the accelerator
-        was not fully saturated by the kernel, or a  potential load-imbalance issue.
+      rst: The percent of :ref:`total shader engine cycles <total-se-cycles>` in the kernel
+        where any CU in a shader-engine was actively doing any work, normalized over
+        all shader-engines. Low values (e.g., << 100%) indicate that the accelerator
+        was not fully saturated by the kernel, or a potential load-imbalance issue.
       unit: Percent
     SIMD Utilization:
       plain: The percent of total SIMD cycles in the kernel where any SIMD on a CU
         was actively doing any work, summed over all CUs. Low values (less than 100%)
         indicate that the accelerator was not fully saturated by the kernel, or a
         potential load-imbalance issue.
-      rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel  where
-        any :ref:`SIMD <desc-valu>` on a CU was actively doing any work,  summed over
-        all CUs. Low values (less than 100%) indicate that the  accelerator was not
-        fully saturated by the kernel, or a potential  load-imbalance issue.
+      rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel where
+        any :ref:`SIMD <desc-valu>` on a CU was actively doing any work, summed over
+        all CUs. Low values (less than 100%) indicate that the accelerator was not
+        fully saturated by the kernel, or a potential load-imbalance issue.
       unit: Percent
     Dispatched Workgroups:
       plain: The total number of workgroups forming this kernel launch.
@@ -5461,45 +5518,49 @@ panels:
       unit: Wavefronts
     VGPR Writes:
       plain: The average number of cycles spent initializing VGPRs at wave creation.
-      rst: The average number of cycles spent initializing :ref:`VGPRs <desc-valu>`  at
+      rst: The average number of cycles spent initializing :ref:`VGPRs <desc-valu>` at
         wave creation.
       unit: Cycles/wave
     SGPR Writes:
       plain: The average number of cycles spent initializing SGPRs at wave creation.
-      rst: The average number of cycles spent initializing :ref:`SGPRs <desc-salu>`  at
+      rst: The average number of cycles spent initializing :ref:`SGPRs <desc-salu>` at
         wave creation.
       unit: Cycles/wave
     Not-scheduled Rate (Workgroup Manager):
       plain: The percent of total scheduler-pipe cycles in the kernel where a workgroup
         could not be scheduled to a CU due to a bottleneck within the workgroup manager
         rather than a lack of a CU or SIMD with sufficient resources.
-      rst: 'The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
-        in  the kernel where a workgroup could not be scheduled to a  :doc:`CU <compute-unit>`
-        due to a bottleneck within the workgroup manager  rather than a lack of a
-        CU or :ref:`SIMD <desc-valu>` with sufficient  resources. Note: this value
-        is expected to range between 0-25%. See note  in :ref:`workgroup manager <desc-spi>`
-        description.'
+      rst: |-
+        The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
+        in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
+        due to a bottleneck within the workgroup manager rather than a lack of a
+        CU or :ref:`SIMD <desc-valu>` with sufficient resources. Note: this value
+        is expected to range between 0-25%. See note in :ref:`workgroup manager <desc-spi>`
+        description.
       unit: Percent
     Not-scheduled Rate (Scheduler-Pipe):
-      plain: 'The percent of total scheduler-pipe cycles in the kernel where a workgroup
+      plain: |-
+        The percent of total scheduler-pipe cycles in the kernel where a workgroup
         could not be scheduled to a CU due to a bottleneck within the scheduler-pipes
-        rather than a lack of a CU or SIMD with sufficient resources. '
-      rst: 'The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
-        in  the kernel where a workgroup could not be scheduled to a  :doc:`CU <compute-unit>`
-        due to a bottleneck within the scheduler-pipes  rather than a lack of a CU
-        or :ref:`SIMD <desc-valu>` with sufficient  resources. Note: this value is
-        expected to range between 0-25%, see note  in :ref:`workgroup manager <desc-spi>`
-        description.'
+        rather than a lack of a CU or SIMD with sufficient resources.
+      rst: |-
+        The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
+        in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
+        due to a bottleneck within the scheduler-pipes rather than a lack of a CU
+        or :ref:`SIMD <desc-valu>` with sufficient resources. Note: this value is
+        expected to range between 0-25%, see note in :ref:`workgroup manager <desc-spi>`
+        description.
       unit: Percent
     Scheduler-Pipe Stall Rate:
       plain: The percent of total scheduler-pipe cycles in the kernel where a workgroup
         could not be scheduled to a CU due to occupancy limitations (like a lack of
         a CU or SIMD with sufficient resources).
-      rst: 'The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
-        in  the kernel where a workgroup could not be scheduled to a  :doc:`CU <compute-unit>`
-        due to occupancy limitations (like a lack of a  CU or :ref:`SIMD <desc-valu>`
-        with sufficient resources). Note: this  value is expected to range between
-        0-25%, see note in  :ref:`workgroup manager <desc-spi>` description.'
+      rst: |-
+        The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
+        in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
+        due to occupancy limitations (like a lack of a CU or :ref:`SIMD <desc-valu>`
+        with sufficient resources). Note: this value is expected to range between
+        0-25%, see note in :ref:`workgroup manager <desc-spi>` description.
       unit: Percent
     Scratch Stall Rate:
       plain: The percent of total shader-engine cycles in the kernel where a workgroup
@@ -5507,46 +5568,46 @@ panels:
         slots. While this can reach up to 100%, note that the actual occupancy limitations
         on a kernel using private memory are typically quite small (for example, less
         than 1% of the total number of waves that can be scheduled to an accelerator).
-      rst: The percent of :ref:`total shader-engine cycles <total-se-cycles>` in the  kernel
-        where a workgroup could not be scheduled to a  :doc:`CU <compute-unit>` due
-        to lack of  :ref:`private (a.k.a., scratch) memory <memory-type>` slots. While
-        this  can reach up to 100%, note that the actual occupancy limitations on
-        a  kernel using private memory are typically quite small (for example, less  than
-        1% of the total number of waves that can be scheduled to an  accelerator).
+      rst: The percent of :ref:`total shader-engine cycles <total-se-cycles>` in the kernel
+        where a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due
+        to lack of :ref:`private (a.k.a., scratch) memory <memory-type>` slots. While
+        this can reach up to 100%, note that the actual occupancy limitations on
+        a kernel using private memory are typically quite small (for example, less than
+        1% of the total number of waves that can be scheduled to an accelerator).
       unit: Percent
     Insufficient SIMD Waveslots:
       plain: The percent of total SIMD cycles in the kernel where a workgroup could
         not be scheduled to a SIMD due to lack of available waveslots.
-      rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel  where
-        a workgroup could not be scheduled to a  :ref:`SIMD <desc-valu>`  due to lack
+      rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel where
+        a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to lack
         of available :ref:`waveslots <desc-valu>`.
       unit: Percent
     Insufficient SIMD VGPRs:
       plain: The percent of total SIMD cycles in the kernel where a workgroup could
         not be scheduled to a SIMD due to lack of available VGPRs.
-      rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel  where
-        a workgroup could not be scheduled to a  :ref:`SIMD <desc-valu>`  due to lack
+      rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel where
+        a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to lack
         of available :ref:`VGPRs <desc-valu>`.
       unit: Percent
     Insufficient SIMD SGPRs:
       plain: The percent of total SIMD cycles in the kernel where a workgroup could
         not be scheduled to a SIMD due to lack of available SGPRs.
-      rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel  where
-        a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>`  due to lack
+      rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel where
+        a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to lack
         of available :ref:`SGPRs <desc-salu>`.
       unit: Percent
     Insufficient CU LDS:
       plain: The percent of total CU cycles in the kernel where a workgroup could
         not be scheduled to a CU due to lack of available LDS.
-      rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel  where
-        a workgroup could not be scheduled to a :doc:`CU <compute-unit>`  due to lack
+      rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
+        a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to lack
         of available :doc:`LDS <local-data-share>`.
       unit: Percent
     Insufficient CU Barriers:
       plain: The percent of total CU cycles in the kernel where a workgroup could
         not be scheduled to a CU due to lack of available barriers.
-      rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel  where
-        a workgroup could not be scheduled to a :doc:`CU <compute-unit>`  due to lack
+      rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
+        a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to lack
         of available :ref:`barriers <desc-barrier>`.
       unit: Percent
     Reached CU Workgroup Limit:
@@ -5554,20 +5615,20 @@ panels:
         not be scheduled to a CU due to limits within the workgroup manager. This
         is expected to be always be zero on CDNA2 or newer accelerators (and small
         for previous accelerators).
-      rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel  where
-        a workgroup could not be scheduled to a :doc:`CU <compute-unit>`  due to limits
-        within the workgroup manager.  This is expected to be  always be zero on CDNA2
-        or newer accelerators (and small for previous  accelerators).
+      rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
+        a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to limits
+        within the workgroup manager. This is expected to always be zero on CDNA2
+        or newer accelerators (and small for previous accelerators).
       unit: Percent
     Reached CU Wavefront Limit:
       plain: The percent of total CU cycles in the kernel where a wavefront could
         not be scheduled to a CU due to limits within the workgroup manager. This
         is expected to be always be zero on CDNA2 or newer accelerators (and small
         for previous accelerators).
-      rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel  where
-        a wavefront could not be scheduled to a :doc:`CU <compute-unit>`  due to limits
-        within the workgroup manager.  This is expected to be  always be zero on CDNA2
-        or newer accelerators (and small for previous  accelerators).
+      rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
+        a wavefront could not be scheduled to a :doc:`CU <compute-unit>` due to limits
+        within the workgroup manager. This is expected to always be zero on CDNA2
+        or newer accelerators (and small for previous accelerators).
       unit: Percent
 - id: 700
   title: Wavefront
@@ -6168,64 +6229,74 @@ panels:
         total block size.
       unit: Work-Items
     Total Wavefronts:
-      plain: "The total number of wavefronts launched as part of the kernel dispatch.\
-        \ On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront\
-        \ size is always 64 work-items. Thus, the total number of wavefronts should\
-        \ be equivalent to the ceiling of grid size divided by 64."
-      rst: "The total number of wavefronts launched as part of the kernel dispatch.\
-        \ On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront\
-        \ size is always 64 work-items. Thus, the total number of wavefronts should\
-        \ be equivalent to the ceiling of grid size divided by 64."
+      plain: |-
+        The total number of wavefronts launched as part of the kernel dispatch.
+        On AMD Instinct™ CDNA™ accelerators and GCN™ GPUs, the wavefront
+        size is always 64 work-items. Thus, the total number of wavefronts should
+        be equivalent to the ceiling of grid size divided by 64.
+      rst: |-
+        The total number of wavefronts launched as part of the kernel dispatch.
+        On AMD Instinct™ CDNA™ accelerators and GCN™ GPUs, the wavefront
+        size is always 64 work-items. Thus, the total number of wavefronts should
+        be equivalent to the ceiling of grid size divided by 64.
       unit: Wavefronts
     Saved Wavefronts:
       plain: The total number of wavefronts saved at a context-save.
-      rst: The total number of wavefronts saved at a context-save. See  `cwsr_enable
+      rst: The total number of wavefronts saved at a context-save. See `cwsr_enable
         <https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr>`_.
       unit: Wavefronts
     Restored Wavefronts:
       plain: The total number of wavefronts restored from a context-save.
-      rst: The total number of wavefronts restored from a context-save. See  `cwsr_enable
+      rst: The total number of wavefronts restored from a context-save. See `cwsr_enable
         <https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr>`_.
       unit: Wavefronts
     VGPRs:
-      plain: 'The number of architected vector general-purpose registers allocated
+      plain: |-
+        The number of architected vector general-purpose registers allocated
         for the kernel, see VALU. Note: this may not exactly match the number of VGPRs
-        requested by the compiler due to allocation granularity.'
-      rst: 'The number of architected vector general-purpose registers allocated for  the
-        kernel, see :ref:`VALU <desc-valu>`.  Note: this may not exactly  match the
-        number of VGPRs requested by the compiler due to allocation  granularity.'
+        requested by the compiler due to allocation granularity.
+      rst: |-
+        The number of architected vector general-purpose registers allocated for the
+        kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the
+        number of VGPRs requested by the compiler due to allocation granularity.
       unit: VGPRs
     AGPRs:
-      plain: 'The number of accumulation vector general-purpose registers allocated
+      plain: |-
+        The number of accumulation vector general-purpose registers allocated
         for the kernel, see AGPRs. Note: this may not exactly match the number of
-        AGPRs requested by the compiler due to allocation granularity.'
-      rst: 'The number of accumulation vector general-purpose registers allocated
-        for  the kernel, see :ref:`AGPRs <desc-agprs>`.  Note: this may not exactly  match
-        the number of AGPRs requested by the compiler due to allocation  granularity.'
+        AGPRs requested by the compiler due to allocation granularity.
+      rst: |-
+        The number of accumulation vector general-purpose registers allocated
+        for the kernel, see :ref:`AGPRs <desc-agprs>`. Note: this may not exactly match
+        the number of AGPRs requested by the compiler due to allocation granularity.
       unit: AGPRs
     SGPRs:
-      plain: 'The number of scalar general-purpose registers allocated for the kernel,
+      plain: |-
+        The number of scalar general-purpose registers allocated for the kernel,
         see SALU. Note: this may not exactly match the number of SGPRs requested by
-        the compiler due to allocation granularity.'
-      rst: 'The number of scalar general-purpose registers allocated for the kernel,  see
-        :ref:`SALU <desc-salu>`.  Note: this may not exactly match the number  of
-        SGPRs requested by the compiler due to allocation granularity. plain'
+        the compiler due to allocation granularity.
+      rst: |-
+        The number of scalar general-purpose registers allocated for the kernel, see
+        :ref:`SALU <desc-salu>`. Note: this may not exactly match the number of
+        SGPRs requested by the compiler due to allocation granularity.
       unit: SGPRs
     LDS Allocation:
-      plain: 'The number of bytes of LDS memory (or, shared memory) allocated for
+      plain: |-
+        The number of bytes of LDS memory (or, shared memory) allocated for
         this kernel. Note: This may also be larger than what was requested at compile
-        time due to both allocation granularity and dynamic per-dispatch LDS allocations.'
-      rst: 'The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared  memory)
-        allocated for this kernel.  Note: This may also be larger than  what was requested
-        at compile time due to both allocation granularity and  dynamic per-dispatch
-        LDS allocations.'
+        time due to both allocation granularity and dynamic per-dispatch LDS allocations.
+      rst: |-
+        The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory)
+        allocated for this kernel. Note: This may also be larger than what was requested
+        at compile time due to both allocation granularity and dynamic per-dispatch
+        LDS allocations.
       unit: Bytes per workgroup
     Scratch Allocation:
       plain: The number of bytes of scratch memory requested per work-item for this
         kernel. Scratch memory is used for stack memory on the accelerator, as well
         as for register spills and restores.
-      rst: The number of bytes of :ref:`scratch memory <memory-spaces>` requested  per
-        work-item for this kernel. Scratch memory is used for stack memory  on the
+      rst: The number of bytes of :ref:`scratch memory <memory-spaces>` requested per
+        work-item for this kernel. Scratch memory is used for stack memory on the
         accelerator, as well as for register spills and restores.
       unit: Bytes per work-item
     Kernel Time:
@@ -6246,23 +6317,24 @@ panels:
       plain: The number of cycles a wavefront in the kernel dispatch spent resident
         on a compute unit per normalization unit. This is averaged over all wavefronts
         in a kernel dispatch.
-      rst: 'The number of cycles a wavefront in the kernel dispatch spent resident
-        on  a compute unit per :ref:`normalization unit <normalization-units>`. This  is
-        averaged over all wavefronts in a kernel dispatch.  Note: this should  not
-        be directly compared to the kernel cycles above.'
+      rst: |-
+        The number of cycles a wavefront in the kernel dispatch spent resident
+        on a compute unit per :ref:`normalization unit <normalization-units>`. This is
+        averaged over all wavefronts in a kernel dispatch. Note: this should not
+        be directly compared to the kernel cycles above.
       unit: Cycles per normalization unit
     Dependency Wait Cycles:
       plain: The number of cycles a wavefront in the kernel dispatch spent resident
         on a compute unit per normalization unit. This is averaged over all wavefronts
         in a kernel dispatch.
-      rst: The number of cycles a wavefront in the kernel dispatch stalled waiting  on
-        memory of any kind (e.g., instruction fetch, vector or scalar memory,  etc.)
-        per :ref:`normalization unit <normalization-units>`. This counter  is incremented
-        at every cycle by *all* wavefronts on a CU stalled at a  memory operation.  As
-        such, it is most useful to get a sense of how waves  were spending their time,
-        rather than identification of a precise limiter  because another wave could
-        be actively executing while a wave is stalled.  The sum of this metric, Issue
-        Wait Cycles and Active Cycles should be  equal to the total Wave Cycles metric.
+      rst: The number of cycles a wavefront in the kernel dispatch stalled waiting on
+        memory of any kind (e.g., instruction fetch, vector or scalar memory, etc.)
+        per :ref:`normalization unit <normalization-units>`. This counter is incremented
+        at every cycle by *all* wavefronts on a CU stalled at a memory operation. As
+        such, it is most useful to get a sense of how waves were spending their time,
+        rather than identification of a precise limiter because another wave could
+        be actively executing while a wave is stalled. The sum of this metric, Issue
+        Wait Cycles and Active Cycles should be equal to the total Wave Cycles metric.
       unit: Cycles per normalization unit
     Issue Wait Cycles:
       plain: The number of cycles a wavefront in the kernel dispatch was unable to
@@ -6273,14 +6345,14 @@ panels:
         of a precise limiter because another wave could be actively executing while
         a wave is issue stalled. The sum of this metric, Dependency Wait Cycles and
         Active Cycles should be equal to the total Wave Cycles metric.
-      rst: The number of cycles a wavefront in the kernel dispatch was unable to  issue
-        an instruction for any reason (e.g., execution pipe back-pressure,  arbitration
-        loss, etc.) per  :ref:`normalization unit <normalization-units>`.  This counter
-        is  incremented at every cycle by *all* wavefronts on a CU unable to issue
-        an  instruction.  As such, it is most useful to get a sense of how waves were  spending
-        their time, rather than identification of a precise limiter  because another
-        wave could be actively executing while a wave is issue  stalled.  The sum
-        of this metric, Dependency Wait Cycles and Active  Cycles should be equal
+      rst: The number of cycles a wavefront in the kernel dispatch was unable to issue
+        an instruction for any reason (e.g., execution pipe back-pressure, arbitration
+        loss, etc.) per :ref:`normalization unit <normalization-units>`. This counter
+        is incremented at every cycle by *all* wavefronts on a CU unable to issue
+        an instruction. As such, it is most useful to get a sense of how waves were spending
+        their time, rather than identification of a precise limiter because another
+        wave could be actively executing while a wave is issue stalled. The sum
+        of this metric, Dependency Wait Cycles and Active Cycles should be equal
         to the total Wave Cycles metric.
       unit: Cycles per normalization unit
     Active Cycles:
@@ -6292,22 +6364,24 @@ panels:
         time, rather than identification of a precise limiter. The sum of this metric,
         Issue Wait Cycles and Active Wait Cycles should be equal to the total Wave
         Cycles metric.
-      rst: The average number of cycles a wavefront in the kernel dispatch was  actively
-        executing instructions per  :ref:`normalization unit <normalization-units>`.
-        This measurement is made  on a per-wavefront basis, and may include cycles
-        that another wavefront  spent actively executing (on another execution unit,
-        for example) or was  stalled.  As such, it is most useful to get a sense of
-        how waves were  spending their time, rather than identification of a precise
-        limiter. The  sum of this metric, Issue Wait Cycles and Active Wait Cycles
-        should be  equal to the total Wave Cycles metric.
+      rst: The average number of cycles a wavefront in the kernel dispatch was actively
+        executing instructions per :ref:`normalization unit <normalization-units>`.
+        This measurement is made on a per-wavefront basis, and may include cycles
+        that another wavefront spent actively executing (on another execution unit,
+        for example) or was stalled. As such, it is most useful to get a sense of
+        how waves were spending their time, rather than identification of a precise
+        limiter. The sum of this metric, Issue Wait Cycles and Dependency Wait Cycles
+        should be equal to the total Wave Cycles metric.
       unit: Cycles per normalization unit
     Wavefront Occupancy:
-      plain: 'The time-averaged number of wavefronts resident on the accelerator over
+      plain: |-
+        The time-averaged number of wavefronts resident on the accelerator over
         the lifetime of the kernel. Note: this metric may be inaccurate for short-running
-        kernels (less than 1ms).'
-      rst: 'The time-averaged number of wavefronts resident on the accelerator over  the
-        lifetime of the kernel. Note: this metric may be inaccurate for  short-running
-        kernels (less than 1ms).'
+        kernels (less than 1ms).
+      rst: |-
+        The time-averaged number of wavefronts resident on the accelerator over the
+        lifetime of the kernel. Note: this metric may be inaccurate for short-running
+        kernels (less than 1ms).
       unit: Wavefronts
 - id: 1000
   title: Compute Units - Instruction Mix
@@ -7348,24 +7422,24 @@ panels:
         range of instruction types including floating point operations, non-uniform
         address calculations, transcendental operations, integer operations, shifts,
         conditional evaluation, etc.
-      rst: The total number of vector arithmetic logic unit (VALU) operations  issued.
-        These are the workhorses of the  :doc:`compute unit <compute-unit>`, and are
-        used to execute a wide range of  instruction types including floating point
-        operations, non-uniform  address calculations, transcendental operations,
-        integer operations,  shifts, conditional evaluation, etc.
+      rst: The total number of vector arithmetic logic unit (VALU) operations issued.
+        These are the workhorses of the :doc:`compute unit <compute-unit>`, and are
+        used to execute a wide range of instruction types including floating point
+        operations, non-uniform address calculations, transcendental operations,
+        integer operations, shifts, conditional evaluation, etc.
       unit: Instructions
     VMEM:
       plain: The total number of vector memory operations issued. These include most
         loads, stores and atomic operations and all accesses to generic, global, private
         and texture memory.
-      rst: The total number of vector memory operations issued. These include most  loads,
-        stores and atomic operations and all accesses to  :ref:`generic, global, private
+      rst: The total number of vector memory operations issued. These include most loads,
+        stores and atomic operations and all accesses to :ref:`generic, global, private
         and texture <memory-spaces>` memory.
       unit: Instructions
     LDS:
       plain: The total number of LDS (also known as shared memory) operations issued.
         These include loads, stores, atomics, and HIP's __shfl operations.
-      rst: The total number of LDS (also known as shared memory) operations issued.  These
+      rst: The total number of LDS (also known as shared memory) operations issued. These
         include loads, stores, atomics, and HIP's ``__shfl`` operations.
       unit: Instructions
     MFMA:
@@ -7388,202 +7462,204 @@ panels:
       plain: The total number of scalar memory (SMEM) operations issued. These are
         typically used for loading kernel arguments, base-pointers and loads from
         HIP's __constant__ memory.
-      rst: The total number of scalar memory (SMEM) operations issued. These are  typically
-        used for loading kernel arguments, base-pointers and loads  from HIP's ``__constant__``
+      rst: The total number of scalar memory (SMEM) operations issued. These are typically
+        used for loading kernel arguments, base-pointers and loads from HIP's ``__constant__``
         memory.
       unit: Instructions
     Branch:
       plain: The total number of branch operations issued. These typically consist
         of jump or branch operations and are used to implement control flow.
-      rst: The total number of branch operations issued. These typically consist of  jump
+      rst: The total number of branch operations issued. These typically consist of jump
         or branch operations and are used to implement control flow.
       unit: Instructions
     INT32:
       plain: The total number of instructions operating on 32-bit integer operands
         issued to the VALU per normalization unit.
-      rst: The total number of instructions operating on 32-bit integer operands  issued
+      rst: The total number of instructions operating on 32-bit integer operands issued
         to the VALU per :ref:`normalization unit <normalization-units>`.
       unit: Instructions per normalization unit
     INT64:
       plain: The total number of instructions operating on 64-bit integer operands
         issued to the VALU per normalization unit.
-      rst: The total number of instructions operating on 64-bit integer operands  issued
+      rst: The total number of instructions operating on 64-bit integer operands issued
         to the VALU per :ref:`normalization unit <normalization-units>`.
       unit: Instructions per normalization unit
     F16-ADD:
       plain: The total number of addition instructions operating on 16-bit floating-point
         operands issued to the VALU per normalization unit.
-      rst: The total number of addition instructions operating on 16-bit  floating-point
-        operands issued to the VALU per  :ref:`normalization unit <normalization-units>`.
+      rst: The total number of addition instructions operating on 16-bit floating-point
+        operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
       unit: Instructions per normalization unit
     F16-MUL:
       plain: The total number of multiplication instructions operating on 16-bit floating-point
         operands issued to the VALU per normalization unit.
-      rst: The total number of multiplication instructions operating on 16-bit  floating-point
-        operands issued to the VALU per  :ref:`normalization unit <normalization-units>`.
+      rst: The total number of multiplication instructions operating on 16-bit floating-point
+        operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
       unit: Instructions per normalization unit
     F16-FMA:
       plain: The total number of fused multiply-add instructions operating on 16-bit
         floating-point operands issued to the VALU per normalization unit.
-      rst: The total number of fused multiply-add instructions operating on 16-bit  floating-point
-        operands issued to the VALU per  :ref:`normalization unit <normalization-units>`.
+      rst: The total number of fused multiply-add instructions operating on 16-bit floating-point
+        operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
       unit: Instructions per normalization unit
     F16-Trans:
       plain: The total number of transcendental instructions (e.g., sqrt) operating
         on 16-bit floating-point operands issued to the VALU per normalization unit.
-      rst: The total number of transcendental instructions (e.g., `sqrt`) operating  on
-        16-bit floating-point operands issued to the VALU per  :ref:`normalization
+      rst: The total number of transcendental instructions (e.g., ``sqrt``) operating on
+        16-bit floating-point operands issued to the VALU per :ref:`normalization
         unit <normalization-units>`.
       unit: Instructions per normalization unit
     F32-ADD:
       plain: The total number of addition instructions operating on 32-bit floating-point
         operands issued to the VALU per normalization unit.
-      rst: The total number of addition instructions operating on 32-bit  floating-point
-        operands issued to the VALU per  :ref:`normalization unit <normalization-units>`.
+      rst: The total number of addition instructions operating on 32-bit floating-point
+        operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
       unit: Instructions per normalization unit
     F32-MUL:
       plain: The total number of multiplication instructions operating on 32-bit floating-point
         operands issued to the VALU per normalization unit.
-      rst: The total number of multiplication instructions operating on 32-bit  floating-point
-        operands issued to the VALU per  :ref:`normalization unit <normalization-units>`.
+      rst: The total number of multiplication instructions operating on 32-bit floating-point
+        operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
       unit: Instructions per normalization unit
     F32-FMA:
       plain: The total number of fused multiply-add instructions operating on 32-bit
         floating-point operands issued to the VALU per normalization unit.
-      rst: The total number of fused multiply-add instructions operating on 32-bit  floating-point
-        operands issued to the VALU per  :ref:`normalization unit <normalization-units>`.
+      rst: The total number of fused multiply-add instructions operating on 32-bit floating-point
+        operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
       unit: Instructions per normalization unit
     F32-Trans:
       plain: The total number of transcendental instructions (such as sqrt) operating
         on 32-bit floating-point operands issued to the VALU per normalization unit.
-      rst: The total number of transcendental instructions (such as ``sqrt``)  operating
-        on 32-bit floating-point operands issued to the VALU per  :ref:`normalization
+      rst: The total number of transcendental instructions (such as ``sqrt``) operating
+        on 32-bit floating-point operands issued to the VALU per :ref:`normalization
         unit <normalization-units>`.
       unit: Instructions per normalization unit
     F64-ADD:
       plain: The total number of addition instructions operating on 64-bit floating-point
         operands issued to the VALU per normalization unit.
-      rst: The total number of addition instructions operating on 64-bit  floating-point
-        operands issued to the VALU per  :ref:`normalization unit <normalization-units>`.
+      rst: The total number of addition instructions operating on 64-bit floating-point
+        operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
       unit: Instructions per normalization unit
     F64-MUL:
       plain: The total number of multiplication instructions operating on 64-bit floating-point
         operands issued to the VALU per normalization unit.
-      rst: The total number of multiplication instructions operating on 64-bit  floating-point
-        operands issued to the VALU per  :ref:`normalization unit <normalization-units>`.
+      rst: The total number of multiplication instructions operating on 64-bit floating-point
+        operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
       unit: Instructions per normalization unit
     F64-FMA:
       plain: The total number of fused multiply-add instructions operating on 64-bit
         floating-point operands issued to the VALU per normalization unit.
-      rst: The total number of fused multiply-add instructions operating on 64-bit  floating-point
-        operands issued to the VALU per  :ref:`normalization unit <normalization-units>`.
+      rst: The total number of fused multiply-add instructions operating on 64-bit floating-point
+        operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
       unit: Instructions per normalization unit
     F64-Trans:
       plain: The total number of transcendental instructions (such as sqrt) operating
         on 64-bit floating-point operands issued to the VALU per normalization unit.
-      rst: The total number of transcendental instructions (such as `sqrt`)  operating
-        on 64-bit floating-point operands issued to the VALU per  :ref:`normalization
+      rst: The total number of transcendental instructions (such as ``sqrt``) operating
+        on 64-bit floating-point operands issued to the VALU per :ref:`normalization
         unit <normalization-units>`.
       unit: Instructions per normalization unit
     Conversion:
-      plain: "The total number of type conversion instructions (such as converting\
-        \ data to or from F32\u2194F64) issued to the VALU per normalization unit."
-      rst: "The total number of type conversion instructions (such as converting data\
-        \  to or from F32\u2194F64) issued to the VALU per  :ref:`normalization unit\
-        \ <normalization-units>`."
+      plain: |-
+        The total number of type conversion instructions (such as converting
+        data to or from F32↔F64) issued to the VALU per normalization unit.
+      rst: |-
+        The total number of type conversion instructions (such as converting data
+        to or from F32↔F64) issued to the VALU per :ref:`normalization unit
+        <normalization-units>`.
       unit: Instructions per normalization unit
     Global/Generic Instr:
       plain: The total number of global & generic memory instructions executed on
         all compute units on the accelerator, per normalization unit.
-      rst: The total number of global & generic memory instructions executed on all  :doc:`compute
-        units <compute-unit>` on the accelerator, per  :ref:`normalization unit <normalization-units>`.
+      rst: The total number of global & generic memory instructions executed on all :doc:`compute
+        units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
       unit: Instructions per normalization unit
     Global/Generic Read:
       plain: The total number of global & generic memory read instructions executed
         on all compute units on the accelerator, per normalization unit.
       rst: The total number of global & generic memory read instructions executed
-        on  all :doc:`compute units <compute-unit>` on the accelerator, per  :ref:`normalization
+        on all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
         unit <normalization-units>`.
       unit: Instructions per normalization unit
     Global/Generic Write:
       plain: The total number of global & generic memory write instructions executed
         on all compute units on the accelerator, per normalization unit.
-      rst: The total number of global & generic memory write instructions executed  on
-        all :doc:`compute units <compute-unit>` on the accelerator, per  :ref:`normalization
+      rst: The total number of global & generic memory write instructions executed on
+        all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
         unit <normalization-units>`.
       unit: Instructions per normalization unit
     Global/Generic Atomic:
       plain: The total number of global & generic memory atomic (with and without
         return) instructions executed on all compute units on the accelerator, per
         normalization unit.
-      rst: The total number of global & generic memory atomic (with and without  return)
-        instructions executed on all :doc:`compute units <compute-unit>`  on the accelerator,
+      rst: The total number of global & generic memory atomic (with and without return)
+        instructions executed on all :doc:`compute units <compute-unit>` on the accelerator,
         per :ref:`normalization unit <normalization-units>`.
       unit: Instructions per normalization unit
     Spill/Stack Instr:
       plain: The total number of spill/stack memory instructions executed on all compute
         units on the accelerator, per normalization unit.
-      rst: The total number of spill/stack memory instructions executed on all  :doc:`compute
-        units <compute-unit>` on the accelerator, per  :ref:`normalization unit <normalization-units>`.
+      rst: The total number of spill/stack memory instructions executed on all :doc:`compute
+        units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
       unit: Instructions per normalization unit
     Spill/Stack Read:
       plain: The total number of spill/stack memory read instructions executed on
         all compute units on the accelerator, per normalization unit.
-      rst: The total number of spill/stack memory read instructions executed on all  :doc:`compute
-        units <compute-unit>` on the accelerator, per  :ref:`normalization unit <normalization-units>`.
+      rst: The total number of spill/stack memory read instructions executed on all :doc:`compute
+        units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
       unit: Instructions per normalization unit
     Spill/Stack Write:
       plain: The total number of spill/stack memory write instructions executed on
         all compute units on the accelerator, per normalization unit.
-      rst: The total number of spill/stack memory write instructions executed on all  :doc:`compute
-        units <compute-unit>` on the accelerator, per  :ref:`normalization unit <normalization-units>`.
+      rst: The total number of spill/stack memory write instructions executed on all :doc:`compute
+        units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
       unit: Instructions per normalization unit
     Spill/Stack Atomic:
       plain: The total number of spill/stack memory atomic (with and without return)
         instructions executed on all compute units on the accelerator, per normalization
         unit. Typically unused as these memory operations are typically used to implement
         thread-local storage.
-      rst: The total number of spill/stack memory atomic (with and without return)  instructions
-        executed on all :doc:`compute units <compute-unit>` on the  accelerator, per
-        :ref:`normalization unit <normalization-units>`.  Typically unused as these
-        memory operations are typically used to  implement thread-local storage.
+      rst: The total number of spill/stack memory atomic (with and without return) instructions
+        executed on all :doc:`compute units <compute-unit>` on the accelerator, per
+        :ref:`normalization unit <normalization-units>`. Typically unused as these
+        memory operations are typically used to implement thread-local storage.
       unit: Instructions per normalization unit
     MFMA-I8:
       plain: The total number of 8-bit integer MFMA instructions issued per normalization
         unit.
-      rst: The total number of 8-bit integer :ref:`MFMA <desc-mfma>` instructions  issued
+      rst: The total number of 8-bit integer :ref:`MFMA <desc-mfma>` instructions issued
         per :ref:`normalization unit <normalization-units>`.
       unit: Instructions per normalization unit
     MFMA-F8:
       plain: The total number of 8-bit floating point MFMA instructions issued per
         normalization unit. This is supported in AMD Instinct MI300 series and later
         only.
-      rst: The total number of 8-bit floating point :ref:`MFMA <desc-mfma>` instructions  issued
+      rst: The total number of 8-bit floating point :ref:`MFMA <desc-mfma>` instructions issued
         per :ref:`normalization unit <normalization-units>`. This is supported in
         AMD Instinct MI300 series and later only.
       unit: Instructions per normalization unit
     MFMA-F16:
       plain: The total number of 16-bit floating point MFMA instructions issued per
         normalization unit.
-      rst: The total number of 16-bit floating point :ref:`MFMA <desc-mfma>`  instructions
+      rst: The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` instructions
         issued per :ref:`normalization unit <normalization-units>`.
       unit: Instructions per normalization unit
     MFMA-BF16:
       plain: The total number of 16-bit brain floating point MFMA instructions issued
         per normalization unit.
-      rst: The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`  instructions
+      rst: The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>` instructions
         issued per :ref:`normalization unit <normalization-units>`.
       unit: Instructions per normalization unit
     MFMA-F32:
       plain: The total number of 32-bit floating-point MFMA instructions issued per
         normalization unit.
-      rst: The total number of 32-bit floating-point :ref:`MFMA <desc-mfma>`  instructions
+      rst: The total number of 32-bit floating-point :ref:`MFMA <desc-mfma>` instructions
         issued per :ref:`normalization unit <normalization-units>`.
       unit: Instructions per normalization unit
     MFMA-F64:
       plain: The total number of 64-bit floating-point MFMA instructions issued per
         normalization unit.
-      rst: The total number of 64-bit floating-point :ref:`MFMA <desc-mfma>`  instructions
+      rst: The total number of 64-bit floating-point :ref:`MFMA <desc-mfma>` instructions
         issued per :ref:`normalization unit <normalization-units>`.
       unit: Instructions per normalization unit
 - id: 1100
@@ -7913,13 +7989,13 @@ panels:
             unit: Instr/cycle
           IPC (Issued):
             avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED  + SQ_INSTS_LDS)
+              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
               / SQ_ACTIVE_INST_ANY))
             min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
               + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
               / SQ_ACTIVE_INST_ANY))
             max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED  + SQ_INSTS_LDS)
+              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
               / SQ_ACTIVE_INST_ANY))
             unit: Instr/cycle
           SALU Utilization:
@@ -7992,13 +8068,13 @@ panels:
             unit: Instr/cycle
           IPC (Issued):
             avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED  + SQ_INSTS_LDS)
+              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
               / SQ_ACTIVE_INST_ANY))
             min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
               + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
               / SQ_ACTIVE_INST_ANY))
             max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED  + SQ_INSTS_LDS)
+              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
               / SQ_ACTIVE_INST_ANY))
             unit: Instr/cycle
           SALU Utilization:
@@ -8071,13 +8147,13 @@ panels:
             unit: Instr/cycle
           IPC (Issued):
             avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED  + SQ_INSTS_LDS)
+              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
               / SQ_ACTIVE_INST_ANY))
             min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
               + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
               / SQ_ACTIVE_INST_ANY))
             max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED  + SQ_INSTS_LDS)
+              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
               / SQ_ACTIVE_INST_ANY))
             unit: Instr/cycle
           SALU Utilization:
@@ -8150,13 +8226,13 @@ panels:
             unit: Instr/cycle
           IPC (Issued):
             avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED  + SQ_INSTS_LDS)
+              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
               / SQ_ACTIVE_INST_ANY))
             min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
               + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
               / SQ_ACTIVE_INST_ANY))
             max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED  + SQ_INSTS_LDS)
+              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
               / SQ_ACTIVE_INST_ANY))
             unit: Instr/cycle
           SALU Utilization:
@@ -8229,13 +8305,13 @@ panels:
             unit: Instr/cycle
           IPC (Issued):
             avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED  + SQ_INSTS_LDS)
+              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
               / SQ_ACTIVE_INST_ANY))
             min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
               + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
               / SQ_ACTIVE_INST_ANY))
             max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED  + SQ_INSTS_LDS)
+              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
               / SQ_ACTIVE_INST_ANY))
             unit: Instr/cycle
           SALU Utilization:
@@ -8313,13 +8389,13 @@ panels:
             unit: Instr/cycle
           IPC (Issued):
             avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED  + SQ_INSTS_LDS)
+              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
               / SQ_ACTIVE_INST_ANY))
             min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
               + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
               / SQ_ACTIVE_INST_ANY))
             max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED  + SQ_INSTS_LDS)
+              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
               / SQ_ACTIVE_INST_ANY))
             unit: Instr/cycle
           SALU Utilization:
@@ -8376,7 +8452,7 @@ panels:
               + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
               + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
               / $denom))
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
           IOPs (Total):
             avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
               * 512)) / $denom)
@@ -8384,7 +8460,7 @@ panels:
               * 512)) / $denom)
             max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
               * 512)) / $denom)
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
           F16 OPs:
             avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
               + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
@@ -8395,12 +8471,12 @@ panels:
             max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
               + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
               (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
           BF16 OPs:
             avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
             min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
             max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
           F32 OPs:
             avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
               + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
@@ -8411,7 +8487,7 @@ panels:
             max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
               + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
               / $denom))
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
           F64 OPs:
             avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
               + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
@@ -8422,12 +8498,12 @@ panels:
             max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
               + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
               / $denom))
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
           INT8 OPs:
             avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
             min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
             max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
         gfx941:
           FLOPs (Total):
             avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
@@ -8454,7 +8530,7 @@ panels:
               * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
               + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
               * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom))
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
           IOPs (Total):
             avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
               * 512)) / $denom)
@@ -8462,12 +8538,12 @@ panels:
               * 512)) / $denom)
             max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
               * 512)) / $denom)
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
           F8 OPs:
             avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
             min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
             max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
           F16 OPs:
             avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
               + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
@@ -8478,12 +8554,12 @@ panels:
             max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
               + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
               (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
           BF16 OPs:
             avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
             min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
             max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
           F32 OPs:
             avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
               + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
@@ -8494,7 +8570,7 @@ panels:
             max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
               + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
               / $denom))
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
           F64 OPs:
             avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
               + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
@@ -8505,12 +8581,12 @@ panels:
             max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
               + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
               / $denom))
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
           INT8 OPs:
             avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
             min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
             max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
         gfx940:
           FLOPs (Total):
             avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
@@ -8537,7 +8613,7 @@ panels:
               * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
               + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
               * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom))
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
           IOPs (Total):
             avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
               * 512)) / $denom)
@@ -8545,12 +8621,12 @@ panels:
               * 512)) / $denom)
             max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
               * 512)) / $denom)
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
           F8 OPs:
             avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
             min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
             max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
           F16 OPs:
             avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
               + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
@@ -8561,12 +8637,12 @@ panels:
             max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
               + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
               (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
           BF16 OPs:
             avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
             min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
             max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
           F32 OPs:
             avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
               + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
@@ -8577,7 +8653,7 @@ panels:
             max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
               + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
               / $denom))
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
           F64 OPs:
             avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
               + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
@@ -8588,12 +8664,12 @@ panels:
             max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
               + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
               / $denom))
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
           INT8 OPs:
             avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
             min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
             max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
         gfx942:
           FLOPs (Total):
             avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
@@ -8620,7 +8696,7 @@ panels:
               * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
               + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
               * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom))
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
           IOPs (Total):
             avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
               * 512)) / $denom)
@@ -8628,12 +8704,12 @@ panels:
               * 512)) / $denom)
             max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
               * 512)) / $denom)
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
           F8 OPs:
             avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
             min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
             max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
           F16 OPs:
             avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
               + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
@@ -8644,12 +8720,12 @@ panels:
             max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
               + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
               (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
           BF16 OPs:
             avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
             min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
             max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
           F32 OPs:
             avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
               + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
@@ -8660,7 +8736,7 @@ panels:
             max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
               + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
               / $denom))
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
           F64 OPs:
             avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
               + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
@@ -8671,12 +8747,12 @@ panels:
             max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
               + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
               / $denom))
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
           INT8 OPs:
             avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
             min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
             max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
         gfx950:
           FLOPs (Total):
             avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
@@ -8706,7 +8782,7 @@ panels:
               + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
               * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4))
               / $denom))
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
           IOPs (Total):
             avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
               * 512)) / $denom)
@@ -8714,12 +8790,12 @@ panels:
               * 512)) / $denom)
             max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
               * 512)) / $denom)
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
           F8 OPs:
             avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
             min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
             max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
           F16 OPs:
             avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
               + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
@@ -8730,12 +8806,12 @@ panels:
             max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
               + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
               (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
           BF16 OPs:
             avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
             min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
             max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
           F32 OPs:
             avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
               + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
@@ -8746,7 +8822,7 @@ panels:
             max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
               + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
               / $denom))
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
           F64 OPs:
             avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
               + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
@@ -8757,129 +8833,143 @@ panels:
             max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
               + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
               / $denom))
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
           F6F4 OPs:
             avg: AVG((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom)
             min: MIN((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom)
             max: MAX((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom)
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
           INT8 OPs:
             avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
             min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
             max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
-            unit: (OPs  + $normUnit)
+            unit: (OPs + $normUnit)
         gfx908: {}
   metrics_description:
     VALU FLOPs:
-      plain: 'The total floating-point operations executed per second on the VALU.
+      plain: |-
+        The total floating-point operations executed per second on the VALU.
         This is also presented as a percent of the peak theoretical FLOPs achievable
         on the specific accelerator. Note: this does not include any floating-point
-        operations from MFMA instructions.'
-      rst: 'The total floating-point operations executed per second on the  :ref:`VALU
-        <desc-valu>`. This is also presented as a percent of the peak  theoretical
-        FLOPs achievable on the specific accelerator. Note: this does  not include
-        any floating-point operations from :ref:`MFMA <desc-mfma>`  instructions.'
+        operations from MFMA instructions.
+      rst: |-
+        The total floating-point operations executed per second on the :ref:`VALU
+        <desc-valu>`. This is also presented as a percent of the peak theoretical
+        FLOPs achievable on the specific accelerator. Note: this does not include
+        any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.
       unit: GFLOPs
     VALU IOPs:
-      plain: 'The total integer operations executed per second on the VALU. This is
+      plain: |-
+        The total integer operations executed per second on the VALU. This is
         also presented as a percent of the peak theoretical IOPs achievable on the
         specific accelerator. Note: this does not include any integer operations from
-        MFMA instructions.'
-      rst: 'The total integer operations executed per second on the  :ref:`VALU <desc-valu>`.
-        This is also presented as a percent of the peak  theoretical IOPs achievable
-        on the specific accelerator. Note: this does  not include any integer operations
-        from :ref:`MFMA <desc-mfma>`  instructions.'
+        MFMA instructions.
+      rst: |-
+        The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
+        This is also presented as a percent of the peak theoretical IOPs achievable
+        on the specific accelerator. Note: this does not include any integer operations
+        from :ref:`MFMA <desc-mfma>` instructions.
       unit: GIOPs
     MFMA FLOPs (BF16):
-      plain: 'The total number of 16-bit brain floating point MFMA operations executed
+      plain: |-
+        The total number of 16-bit brain floating point MFMA operations executed
         per second. Note: this does not include any 16-bit brain floating point operations
         from VALU instructions. This is also presented as a percent of the peak theoretical
-        BF16 MFMA operations achievable on the specific accelerator.'
-      rst: 'The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`  operations
-        executed per second. Note: this does not include any 16-bit  brain floating
-        point operations from :ref:`VALU <desc-valu>`  instructions. This is also
-        presented as a percent of the peak theoretical  BF16 MFMA operations achievable
-        on the specific accelerator.'
+        BF16 MFMA operations achievable on the specific accelerator.
+      rst: |-
+        The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>` operations
+        executed per second. Note: this does not include any 16-bit brain floating
+        point operations from :ref:`VALU <desc-valu>` instructions. This is also
+        presented as a percent of the peak theoretical BF16 MFMA operations achievable
+        on the specific accelerator.
       unit: GFLOPs
     MFMA FLOPs (F16):
-      plain: 'The total number of 16-bit floating point MFMA operations executed per
+      plain: |-
+        The total number of 16-bit floating point MFMA operations executed per
         second. Note: this does not include any 16-bit floating point operations from
         VALU instructions. This is also presented as a percent of the peak theoretical
-        F16 MFMA operations achievable on the specific accelerator.'
-      rst: 'The total number of 16-bit floating point :ref:`MFMA <desc-mfma>`  operations
-        executed per second. Note: this does not include any 16-bit  floating point
-        operations from :ref:`VALU <desc-valu>` instructions. This  is also presented
-        as a percent of the peak theoretical F16 MFMA  operations achievable on the
-        specific accelerator.'
+        F16 MFMA operations achievable on the specific accelerator.
+      rst: |-
+        The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
+        executed per second. Note: this does not include any 16-bit floating point
+        operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+        as a percent of the peak theoretical F16 MFMA operations achievable on the
+        specific accelerator.
       unit: GFLOPs
     MFMA FLOPs (F32):
-      plain: 'The total number of 32-bit floating point MFMA operations executed per
+      plain: |-
+        The total number of 32-bit floating point MFMA operations executed per
         second. Note: this does not include any 32-bit floating point operations from
         VALU instructions. This is also presented as a percent of the peak theoretical
-        F32 MFMA operations achievable on the specific accelerator.'
-      rst: 'The total number of 32-bit floating point :ref:`MFMA <desc-mfma>`  operations
-        executed per second. Note: this does not include any 32-bit  floating point
-        operations from :ref:`VALU <desc-valu>` instructions. This  is also presented
-        as a percent of the peak theoretical F32 MFMA  operations achievable on the
-        specific accelerator.'
+        F32 MFMA operations achievable on the specific accelerator.
+      rst: |-
+        The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
+        executed per second. Note: this does not include any 32-bit floating point
+        operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+        as a percent of the peak theoretical F32 MFMA operations achievable on the
+        specific accelerator.
       unit: GFLOPs
     MFMA FLOPs (F64):
-      plain: 'The total number of 64-bit floating point MFMA operations executed per
+      plain: |-
+        The total number of 64-bit floating point MFMA operations executed per
         second. Note: this does not include any 64-bit floating point operations from
         VALU instructions. This is also presented as a percent of the peak theoretical
-        F64 MFMA operations achievable on the specific accelerator.'
-      rst: 'The total number of 64-bit floating point :ref:`MFMA <desc-mfma>`  operations
-        executed per second. Note: this does not include any 64-bit  floating point
-        operations from :ref:`VALU <desc-valu>` instructions. This  is also presented
-        as a percent of the peak theoretical F64 MFMA  operations achievable on the
-        specific accelerator.  The total number of 64-bit floating point :ref:`MFMA
-        <desc-mfma>`  operations executed per second. Note: this does not include
-        any 64-bit  floating point operations from :ref:`VALU <desc-valu>` instructions.
-        This  is also presented as a percent of the peak theoretical F64 MFMA  operations
-        achievable on the specific accelerator.'
+        F64 MFMA operations achievable on the specific accelerator.
+      rst: |-
+        The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
+        executed per second. Note: this does not include any 64-bit floating point
+        operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+        as a percent of the peak theoretical F64 MFMA operations achievable on the
+        specific accelerator. The total number of 64-bit floating point :ref:`MFMA
+        <desc-mfma>` operations executed per second. Note: this does not include
+        any 64-bit floating point operations from :ref:`VALU <desc-valu>` instructions.
+        This is also presented as a percent of the peak theoretical F64 MFMA operations
+        achievable on the specific accelerator.
       unit: GFLOPs
     MFMA IOPs (INT8):
-      plain: 'The total number of 8-bit integer MFMA operations executed per second.
+      plain: |-
+        The total number of 8-bit integer MFMA operations executed per second.
         Note: this does not include any 8-bit integer operations from VALU instructions.
         This is also presented as a percent of the peak theoretical INT8 MFMA operations
-        achievable on the specific accelerator.'
-      rst: 'The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations  executed
-        per second. Note: this does not include any 8-bit integer  operations from
-        :ref:`VALU <desc-valu>` instructions. This is also  presented as a percent
-        of the peak theoretical INT8 MFMA operations  achievable on the specific accelerator.'
+        achievable on the specific accelerator.
+      rst: |-
+        The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
+        per second. Note: this does not include any 8-bit integer operations from
+        :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
+        of the peak theoretical INT8 MFMA operations achievable on the specific accelerator.
       unit: GFLOPs
     IPC:
       plain: The ratio of the total number of instructions executed on the CU over
         the total active CU cycles.
-      rst: The ratio of the total number of instructions executed on the  :doc:`CU
-        <compute-unit>` over the  :ref:`total active CU cycles <total-active-cu-cycles>`.
+      rst: The ratio of the total number of instructions executed on the :doc:`CU
+        <compute-unit>` over the :ref:`total active CU cycles <total-active-cu-cycles>`.
       unit: Instructions per cycle
     IPC (Issued):
       plain: The ratio of the total number of (non-internal) instructions issued over
         the number of cycles where the scheduler was actively working on issuing instructions.
-      rst: The ratio of the total number of  (non-:ref:`internal <ipc-internal-instructions>`)
-        instructions issued over  the number of cycles where the :ref:`scheduler <desc-scheduler>`
-        was  actively working on issuing instructions. Refer to the  :ref:`Issued
+      rst: The ratio of the total number of (non-:ref:`internal <ipc-internal-instructions>`)
+        instructions issued over the number of cycles where the :ref:`scheduler <desc-scheduler>`
+        was actively working on issuing instructions. Refer to the :ref:`Issued
         IPC <issued-ipc>` example for further detail.
       unit: Instructions per cycle
     SALU Utilization:
       plain: Indicates what percent of the kernel's duration the SALU was busy executing
         instructions. Computed as the ratio of the total number of cycles spent by
         the scheduler issuing SALU / SMEM instructions over the total CU cycles.
-      rst: Indicates what percent of the kernel's duration the  :ref:`SALU <desc-salu>`
-        was busy executing instructions. Computed as the  ratio of the total number
-        of cycles spent by the  :ref:`scheduler <desc-scheduler>` issuing SALU / :ref:`SMEM
-        <desc-smem>`  instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+      rst: Indicates what percent of the kernel's duration the :ref:`SALU <desc-salu>`
+        was busy executing instructions. Computed as the ratio of the total number
+        of cycles spent by the :ref:`scheduler <desc-scheduler>` issuing SALU / :ref:`SMEM
+        <desc-smem>` instructions over the :ref:`total CU cycles <total-cu-cycles>`.
       unit: Percent
     VALU Utilization:
       plain: Indicates what percent of the kernel's duration the VALU was busy executing
         instructions. Does not include VMEM operations. Computed as the ratio of the
         total number of cycles spent by the scheduler issuing VALU instructions over
         the total CU cycles.
-      rst: Indicates what percent of the kernel's duration the  :ref:`VALU <desc-valu>`
-        was busy executing instructions. Does not include  :ref:`VMEM <desc-vmem>`
-        operations. Computed as the ratio of the total  number of cycles spent by
-        the :ref:`scheduler <desc-scheduler>` issuing  VALU instructions over the
+      rst: Indicates what percent of the kernel's duration the :ref:`VALU <desc-valu>`
+        was busy executing instructions. Does not include :ref:`VMEM <desc-vmem>`
+        operations. Computed as the ratio of the total number of cycles spent by
+        the :ref:`scheduler <desc-scheduler>` issuing VALU instructions over the
         :ref:`total CU cycles <total-cu-cycles>`.
       unit: Percent
     VMEM Utilization:
@@ -8888,113 +8978,115 @@ panels:
         (see the VMEM instruction count metrics for more detail). Does not include
         VALU operations. Computed as the ratio of the total number of cycles spent
         by the scheduler issuing VMEM instructions over the total CU cycles.
-      rst: Indicates what percent of the kernel's duration the  :ref:`VMEM <desc-vmem>`
-        unit was busy executing instructions, including  both global/generic and spill/scratch
-        operations (see the  :ref:`VMEM instruction count metrics <ta-instruction-counts>`
-        for more  detail).  Does not include :ref:`VALU <desc-valu>` operations. Computed  as
-        the ratio of the total number of cycles spent by the  :ref:`scheduler <desc-scheduler>`
-        issuing VMEM instructions over the  :ref:`total CU cycles <total-cu-cycles>`.
+      rst: Indicates what percent of the kernel's duration the :ref:`VMEM <desc-vmem>`
+        unit was busy executing instructions, including both global/generic and spill/scratch
+        operations (see the :ref:`VMEM instruction count metrics <ta-instruction-counts>`
+        for more detail). Does not include :ref:`VALU <desc-valu>` operations. Computed as
+        the ratio of the total number of cycles spent by the :ref:`scheduler <desc-scheduler>`
+        issuing VMEM instructions over the :ref:`total CU cycles <total-cu-cycles>`.
       unit: Percent
     Branch Utilization:
       plain: Indicates what percent of the kernel's duration the branch unit was busy
         executing instructions. Computed as the ratio of the total number of cycles
         spent by the scheduler issuing branch instructions over the total CU cycles.
-      rst: Indicates what percent of the kernel's duration the  :ref:`branch <desc-branch>`
-        unit was busy executing instructions.  Computed as the ratio of the total
-        number of cycles spent by the  :ref:`scheduler <desc-scheduler>` issuing branch
-        instructions over the  :ref:`total CU cycles <total-cu-cycles>`.
+      rst: Indicates what percent of the kernel's duration the :ref:`branch <desc-branch>`
+        unit was busy executing instructions. Computed as the ratio of the total
+        number of cycles spent by the :ref:`scheduler <desc-scheduler>` issuing branch
+        instructions over the :ref:`total CU cycles <total-cu-cycles>`.
       unit: Percent
     VALU Active Threads:
       plain: Indicates the average level of divergence within a wavefront over the
         lifetime of the kernel. The number of work-items that were active in a wavefront
         during execution of each VALU instruction, time-averaged over all VALU instructions
         run on all wavefronts in the kernel
-      rst: Indicates the average level of :ref:`divergence <desc-divergence>` within  a
-        wavefront over the lifetime of the kernel. The number of work-items  that
-        were active in a wavefront during execution of each  :ref:`VALU <desc-valu>`
-        instruction, time-averaged over all VALU  instructions run on all wavefronts
+      rst: Indicates the average level of :ref:`divergence <desc-divergence>` within a
+        wavefront over the lifetime of the kernel. The number of work-items that
+        were active in a wavefront during execution of each :ref:`VALU <desc-valu>`
+        instruction, time-averaged over all VALU instructions run on all wavefronts
         in the kernel.
       unit: Work-items
     MFMA Utilization:
       plain: Indicates what percent of the kernel's duration the MFMA unit was busy
         executing instructions. Computed as the ratio of the total number of cycles
         spent by the MFMA was busy over the total CU cycles.
-      rst: Indicates what percent of the kernel's duration the  :ref:`MFMA <desc-mfma>`
-        unit was busy executing instructions. Computed as  the ratio of the total
-        number of cycles spent by the  :ref:`MFMA <desc-salu>` was busy over the  :ref:`total
+      rst: Indicates what percent of the kernel's duration the :ref:`MFMA <desc-mfma>`
+        unit was busy executing instructions. Computed as the ratio of the total
+        number of cycles the :ref:`MFMA <desc-mfma>` unit was busy over the :ref:`total
         CU cycles <total-cu-cycles>`.
       unit: Percent
     MFMA Instruction Cycles:
       plain: The average duration of MFMA instructions in this kernel in cycles. Computed
         as the ratio of the total number of cycles the MFMA unit was busy over the
         total number of MFMA instructions.
-      rst: The average duration of :ref:`MFMA <desc-mfma>` instructions in this  kernel
-        in cycles. Computed as the ratio of the total number of cycles the  MFMA unit
-        was busy over the total number of MFMA instructions. Compare  to, for example,
-        the  `AMD Matrix Instruction Calculator <https://github.com/RadeonOpenCompute/amd_matrix_instruction_calculator>`_.
+      rst: The average duration of :ref:`MFMA <desc-mfma>` instructions in this kernel
+        in cycles. Computed as the ratio of the total number of cycles the MFMA unit
+        was busy over the total number of MFMA instructions. Compare to, for example,
+        the `AMD Matrix Instruction Calculator <https://github.com/RadeonOpenCompute/amd_matrix_instruction_calculator>`_.
       unit: Cycles per instruction
     VMEM Latency:
       plain: The average number of round-trip cycles (that is, from issue to data
         return / acknowledgment) required for a VMEM instruction to complete.
-      rst: The average number of round-trip cycles (that is, from issue to data  return
+      rst: The average number of round-trip cycles (that is, from issue to data return
         / acknowledgment) required for a VMEM instruction to complete.
       unit: Cycles
     SMEM Latency:
       plain: The average number of round-trip cycles (that is, from issue to data
         return / acknowledgment) required for a SMEM instruction to complete.
-      rst: The average number of round-trip cycles (that is, from issue to data  return
+      rst: The average number of round-trip cycles (that is, from issue to data return
         / acknowledgment) required for a SMEM instruction to complete.
       unit: Cycles
     FLOPs (Total):
       plain: The total number of floating-point operations executed on either the
         VALU or MFMA units, per normalization unit.
-      rst: The total number of floating-point operations executed on either the  :ref:`VALU
-        <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per  :ref:`normalization unit
+      rst: The total number of floating-point operations executed on either the :ref:`VALU
+        <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
         <normalization-units>`.
       unit: FLOP per normalization unit
     IOPs (Total):
       plain: The total number of integer operations executed on either the VALU or
         MFMA units, per normalization unit.
-      rst: The total number of integer operations executed on either the  :ref:`VALU
-        <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per  :ref:`normalization unit
+      rst: The total number of integer operations executed on either the :ref:`VALU
+        <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
         <normalization-units>`.
       unit: IOP per normalization unit
     F16 OPs:
       plain: The total number of 16-bit floating-point operations executed on either
         the VALU or MFMA units, per normalization unit.
       rst: The total number of 16-bit floating-point operations executed on either
-        the  :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per  :ref:`normalization
+        the :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
         unit <normalization-units>`.
       unit: FLOP per normalization unit
     BF16 OPs:
       plain: The total number of 16-bit brain floating-point operations executed on
         either the VALU or MFMA units, per normalization unit.
-      rst: 'The total number of 16-bit brain floating-point operations executed on
-        either the  :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per  :ref:`normalization
-        unit <normalization-units>`. Note: on current CDNA  accelerators, the VALU
-        has no native BF16 instructions.'
+      rst: |-
+        The total number of 16-bit brain floating-point operations executed on
+        either the :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
+        unit <normalization-units>`. Note: on current CDNA accelerators, the VALU
+        has no native BF16 instructions.
       unit: FLOP per normalization unit
     F32 OPs:
       plain: The total number of 32-bit floating-point operations executed on either
         the VALU or MFMA units, per normalization unit.
-      rst: The total number of 32-bit floating-point operations executed on either  the
-        :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per  :ref:`normalization
+      rst: The total number of 32-bit floating-point operations executed on either the
+        :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
         unit <normalization-units>`.
       unit: FLOP per normalization unit
     F64 OPs:
       plain: The total number of 64-bit floating-point operations executed on either
         the VALU or MFMA units, per normalization unit.
-      rst: The total number of 64-bit floating-point operations executed on either  the
-        :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per  :ref:`normalization
+      rst: The total number of 64-bit floating-point operations executed on either the
+        :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
         unit <normalization-units>`.
       unit: FLOP per normalization unit
     INT8 OPs:
       plain: The total number of 8-bit integer operations executed on either the VALU
         or MFMA units, per normalization unit.
-      rst: 'The total number of 8-bit integer operations executed on either the  :ref:`VALU
-        <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per  :ref:`normalization unit
-        <normalization-units>`. Note: on current CDNA  accelerators, the VALU has
-        no native INT8 instructions.'
+      rst: |-
+        The total number of 8-bit integer operations executed on either the :ref:`VALU
+        <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
+        <normalization-units>`. Note: on current CDNA accelerators, the VALU has
+        no native INT8 instructions.
       unit: IOP per normalization unit
 - id: 1200
   title: Local Data Share (LDS)
@@ -9121,7 +9213,7 @@ panels:
             avg: AVG((SQ_INSTS_LDS / $denom))
             min: MIN((SQ_INSTS_LDS / $denom))
             max: MAX((SQ_INSTS_LDS / $denom))
-            unit: (Instr  + $normUnit)
+            unit: (Instr + $normUnit)
           Theoretical Bandwidth:
             avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
               / (End_Timestamp - Start_Timestamp)))
@@ -9151,27 +9243,27 @@ panels:
             avg: AVG((SQ_LDS_IDX_ACTIVE / $denom))
             min: MIN((SQ_LDS_IDX_ACTIVE / $denom))
             max: MAX((SQ_LDS_IDX_ACTIVE / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Atomic Return Cycles:
             avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom))
             min: MIN((SQ_LDS_ATOMIC_RETURN / $denom))
             max: MAX((SQ_LDS_ATOMIC_RETURN / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Bank Conflict:
             avg: AVG((SQ_LDS_BANK_CONFLICT / $denom))
             min: MIN((SQ_LDS_BANK_CONFLICT / $denom))
             max: MAX((SQ_LDS_BANK_CONFLICT / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Addr Conflict:
             avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom))
             min: MIN((SQ_LDS_ADDR_CONFLICT / $denom))
             max: MAX((SQ_LDS_ADDR_CONFLICT / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Unaligned Stall:
             avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom))
             min: MIN((SQ_LDS_UNALIGNED_STALL / $denom))
             max: MAX((SQ_LDS_UNALIGNED_STALL / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Mem Violations:
             avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom))
             min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom))
@@ -9182,7 +9274,7 @@ panels:
             avg: AVG((SQ_INSTS_LDS / $denom))
             min: MIN((SQ_INSTS_LDS / $denom))
             max: MAX((SQ_INSTS_LDS / $denom))
-            unit: (Instr  + $normUnit)
+            unit: (Instr + $normUnit)
           Theoretical Bandwidth:
             avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
               / (End_Timestamp - Start_Timestamp)))
@@ -9212,27 +9304,27 @@ panels:
             avg: AVG((SQ_LDS_IDX_ACTIVE / $denom))
             min: MIN((SQ_LDS_IDX_ACTIVE / $denom))
             max: MAX((SQ_LDS_IDX_ACTIVE / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Atomic Return Cycles:
             avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom))
             min: MIN((SQ_LDS_ATOMIC_RETURN / $denom))
             max: MAX((SQ_LDS_ATOMIC_RETURN / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Bank Conflict:
             avg: AVG((SQ_LDS_BANK_CONFLICT / $denom))
             min: MIN((SQ_LDS_BANK_CONFLICT / $denom))
             max: MAX((SQ_LDS_BANK_CONFLICT / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Addr Conflict:
             avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom))
             min: MIN((SQ_LDS_ADDR_CONFLICT / $denom))
             max: MAX((SQ_LDS_ADDR_CONFLICT / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Unaligned Stall:
             avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom))
             min: MIN((SQ_LDS_UNALIGNED_STALL / $denom))
             max: MAX((SQ_LDS_UNALIGNED_STALL / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Mem Violations:
             avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom))
             min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom))
@@ -9243,7 +9335,7 @@ panels:
             avg: AVG((SQ_INSTS_LDS / $denom))
             min: MIN((SQ_INSTS_LDS / $denom))
             max: MAX((SQ_INSTS_LDS / $denom))
-            unit: (Instr  + $normUnit)
+            unit: (Instr + $normUnit)
           Theoretical Bandwidth:
             avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
               / (End_Timestamp - Start_Timestamp)))
@@ -9273,27 +9365,27 @@ panels:
             avg: AVG((SQ_LDS_IDX_ACTIVE / $denom))
             min: MIN((SQ_LDS_IDX_ACTIVE / $denom))
             max: MAX((SQ_LDS_IDX_ACTIVE / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Atomic Return Cycles:
             avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom))
             min: MIN((SQ_LDS_ATOMIC_RETURN / $denom))
             max: MAX((SQ_LDS_ATOMIC_RETURN / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Bank Conflict:
             avg: AVG((SQ_LDS_BANK_CONFLICT / $denom))
             min: MIN((SQ_LDS_BANK_CONFLICT / $denom))
             max: MAX((SQ_LDS_BANK_CONFLICT / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Addr Conflict:
             avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom))
             min: MIN((SQ_LDS_ADDR_CONFLICT / $denom))
             max: MAX((SQ_LDS_ADDR_CONFLICT / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Unaligned Stall:
             avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom))
             min: MIN((SQ_LDS_UNALIGNED_STALL / $denom))
             max: MAX((SQ_LDS_UNALIGNED_STALL / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Mem Violations:
             avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom))
             min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom))
@@ -9304,7 +9396,7 @@ panels:
             avg: AVG((SQ_INSTS_LDS / $denom))
             min: MIN((SQ_INSTS_LDS / $denom))
             max: MAX((SQ_INSTS_LDS / $denom))
-            unit: (Instr  + $normUnit)
+            unit: (Instr + $normUnit)
           Theoretical Bandwidth:
             avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
               / (End_Timestamp - Start_Timestamp)))
@@ -9334,27 +9426,27 @@ panels:
             avg: AVG((SQ_LDS_IDX_ACTIVE / $denom))
             min: MIN((SQ_LDS_IDX_ACTIVE / $denom))
             max: MAX((SQ_LDS_IDX_ACTIVE / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Atomic Return Cycles:
             avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom))
             min: MIN((SQ_LDS_ATOMIC_RETURN / $denom))
             max: MAX((SQ_LDS_ATOMIC_RETURN / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Bank Conflict:
             avg: AVG((SQ_LDS_BANK_CONFLICT / $denom))
             min: MIN((SQ_LDS_BANK_CONFLICT / $denom))
             max: MAX((SQ_LDS_BANK_CONFLICT / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Addr Conflict:
             avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom))
             min: MIN((SQ_LDS_ADDR_CONFLICT / $denom))
             max: MAX((SQ_LDS_ADDR_CONFLICT / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Unaligned Stall:
             avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom))
             min: MIN((SQ_LDS_UNALIGNED_STALL / $denom))
             max: MAX((SQ_LDS_UNALIGNED_STALL / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Mem Violations:
             avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom))
             min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom))
@@ -9365,11 +9457,11 @@ panels:
             avg: AVG((SQ_INSTS_LDS / $denom))
             min: MIN((SQ_INSTS_LDS / $denom))
             max: MAX((SQ_INSTS_LDS / $denom))
-            unit: (Instr  + $normUnit)
+            unit: (Instr + $normUnit)
           LDS LOAD:
             avg: AVG((SQ_INSTS_LDS_LOAD / $denom))
-            min: MIN((SQ_INSTS_LDS_LOAD  / $denom))
-            max: MAX((SQ_INSTS_LDS_LOAD  / $denom))
+            min: MIN((SQ_INSTS_LDS_LOAD / $denom))
+            max: MAX((SQ_INSTS_LDS_LOAD / $denom))
             unit: (instr + $normUnit)
           LDS STORE:
             avg: AVG((SQ_INSTS_LDS_STORE / $denom))
@@ -9425,27 +9517,27 @@ panels:
             avg: AVG((SQ_LDS_IDX_ACTIVE / $denom))
             min: MIN((SQ_LDS_IDX_ACTIVE / $denom))
             max: MAX((SQ_LDS_IDX_ACTIVE / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Atomic Return Cycles:
             avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom))
             min: MIN((SQ_LDS_ATOMIC_RETURN / $denom))
             max: MAX((SQ_LDS_ATOMIC_RETURN / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Bank Conflict:
             avg: AVG((SQ_LDS_BANK_CONFLICT / $denom))
             min: MIN((SQ_LDS_BANK_CONFLICT / $denom))
             max: MAX((SQ_LDS_BANK_CONFLICT / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Addr Conflict:
             avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom))
             min: MIN((SQ_LDS_ADDR_CONFLICT / $denom))
             max: MAX((SQ_LDS_ADDR_CONFLICT / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Unaligned Stall:
             avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom))
             min: MIN((SQ_LDS_UNALIGNED_STALL / $denom))
             max: MAX((SQ_LDS_UNALIGNED_STALL / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Mem Violations:
             avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom))
             min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom))
@@ -9455,18 +9547,18 @@ panels:
             avg: AVG((SQ_LDS_CMD_FIFO_FULL / $denom))
             min: MIN((SQ_LDS_CMD_FIFO_FULL / $denom))
             max: MAX((SQ_LDS_CMD_FIFO_FULL / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           LDS Data FIFO Full Rate:
             avg: AVG((SQ_LDS_DATA_FIFO_FULL / $denom))
             min: MIN((SQ_LDS_DATA_FIFO_FULL / $denom))
             max: MAX((SQ_LDS_DATA_FIFO_FULL / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
         gfx908:
           LDS Instructions:
             avg: AVG((SQ_INSTS_LDS / $denom))
             min: MIN((SQ_INSTS_LDS / $denom))
             max: MAX((SQ_INSTS_LDS / $denom))
-            unit: (Instr  + $normUnit)
+            unit: (Instr + $normUnit)
           Theoretical Bandwidth:
             avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
               / (End_Timestamp - Start_Timestamp)))
@@ -9496,27 +9588,27 @@ panels:
             avg: AVG((SQ_LDS_IDX_ACTIVE / $denom))
             min: MIN((SQ_LDS_IDX_ACTIVE / $denom))
             max: MAX((SQ_LDS_IDX_ACTIVE / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Atomic Return Cycles:
             avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom))
             min: MIN((SQ_LDS_ATOMIC_RETURN / $denom))
             max: MAX((SQ_LDS_ATOMIC_RETURN / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Bank Conflict:
             avg: AVG((SQ_LDS_BANK_CONFLICT / $denom))
             min: MIN((SQ_LDS_BANK_CONFLICT / $denom))
             max: MAX((SQ_LDS_BANK_CONFLICT / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Addr Conflict:
             avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom))
             min: MIN((SQ_LDS_ADDR_CONFLICT / $denom))
             max: MAX((SQ_LDS_ADDR_CONFLICT / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Unaligned Stall:
             avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom))
             min: MIN((SQ_LDS_UNALIGNED_STALL / $denom))
             max: MAX((SQ_LDS_UNALIGNED_STALL / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Mem Violations:
             avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom))
             min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom))
@@ -9528,10 +9620,10 @@ panels:
         executing instructions (including, but not limited to, load, store, atomic
         and HIP's __shfl operations). Calculated as the ratio of the total number
         of cycles LDS was active over the total CU cycles.
-      rst: Indicates what percent of the kernel's duration the :ref:`LDS <desc-lds>`  was
-        actively executing instructions (including, but not limited to, load,  store,
-        atomic and HIP's ``__shfl`` operations).  Calculated as the ratio  of the
-        total number of cycles LDS was active over the  :ref:`total CU cycles <total-cu-cycles>`.
+      rst: Indicates what percent of the kernel's duration the :ref:`LDS <desc-lds>` was
+        actively executing instructions (including, but not limited to, load, store,
+        atomic and HIP's ``__shfl`` operations). Calculated as the ratio of the
+        total number of cycles LDS was active over the :ref:`total CU cycles <total-cu-cycles>`.
       unit: Percent
     Access Rate:
       plain: Indicates the percentage of SIMDs in the VALU actively issuing LDS instructions,
@@ -9549,10 +9641,10 @@ panels:
         stored to, or atomically updated in the LDS divided as percentage of theoretical peak.
         Does not take into account the execution mask of the wavefront when the instruction
         was executed.
-      rst: Indicates the maximum amount of bytes that could have been loaded from,  stored
+      rst: Indicates the maximum amount of bytes that could have been loaded from, stored
         to, or atomically updated in the LDS divided as percentage of theoretical peak.
-        Does *not* take into  account the execution mask of the wavefront when the
-        instruction was  executed. See the  :ref:`LDS bandwidth example <lds-bandwidth>`
+        Does *not* take into account the execution mask of the wavefront when the
+        instruction was executed. See the :ref:`LDS bandwidth example <lds-bandwidth>`
         for more detail.
       unit: Percent
     Theoretical Bandwidth:
@@ -9560,10 +9652,10 @@ panels:
         stored to, or atomically updated in the LDS divided by total duration. Does not
         take into account the execution mask of the wavefront when the instruction
         was executed.
-      rst: Indicates the maximum amount of bytes that could have been loaded from,  stored
+      rst: Indicates the maximum amount of bytes that could have been loaded from, stored
         to, or atomically updated in the LDS divided by total duration.
-        Does *not* take into  account the execution mask of the wavefront when the
-        instruction was  executed. See the  :ref:`LDS bandwidth example <lds-bandwidth>`
+        Does *not* take into account the execution mask of the wavefront when the
+        instruction was executed. See the :ref:`LDS bandwidth example <lds-bandwidth>`
         for more detail.
       unit: Gbps
     Bank Conflict Rate:
@@ -9571,23 +9663,23 @@ panels:
         bank conflicts. Calculated as the ratio of LDS cycles spent servicing bank
         conflicts over the number of LDS cycles that would have been required to move
         the same amount of data in an uncontended access.
-      rst: Indicates the percentage of active LDS cycles that were spent servicing  bank
-        conflicts. Calculated as the ratio of LDS cycles spent servicing  bank conflicts
-        over the number of LDS cycles that would have been  required to move the same
+      rst: Indicates the percentage of active LDS cycles that were spent servicing bank
+        conflicts. Calculated as the ratio of LDS cycles spent servicing bank conflicts
+        over the number of LDS cycles that would have been required to move the same
         amount of data in an uncontended access. [#lds-bank-conflict]_
       unit: Percent
     LDS Instructions:
       plain: The total number of LDS instructions (including, but not limited to,
         read/write/atomics and HIP's __shfl instructions) executed per normalization
         unit.
-      rst: The total number of LDS instructions (including, but not limited to,  read/write/atomics
-        and HIP's ``__shfl`` instructions) executed per  :ref:`normalization unit
+      rst: The total number of LDS instructions (including, but not limited to, read/write/atomics
+        and HIP's ``__shfl`` instructions) executed per :ref:`normalization unit
         <normalization-units>`.
       unit: Instructions per normalization unit
     LDS Latency:
       plain: The average number of round-trip cycles (i.e., from issue to data-return
-        / acknowledgment) required for an LDS instruction to complete.
-      rst: The average number of round-trip cycles (i.e., from issue to data-return  /
+        or acknowledgment) required for an LDS instruction to complete.
+      rst: The average number of round-trip cycles (i.e., from issue to data-return or
         acknowledgment) required for an LDS instruction to complete.
       unit: Cycles
     Bank Conflicts/Access:
@@ -9595,52 +9687,54 @@ panels:
         conflicts (as determined by the conflict resolution hardware) to the base
         number of cycles that would be spent in the LDS scheduler in a completely
         uncontended case. This is the unnormalized form of the Bank Conflict Rate.
-      rst: The ratio of the number of cycles spent in the  :ref:`LDS scheduler <desc-lds>`
-        due to bank conflicts (as determined by  the conflict resolution hardware)
-        to the base number of cycles that would  be spent in the LDS scheduler in
-        a completely uncontended case. This is  the unnormalized form of the Bank
+      rst: The ratio of the number of cycles spent in the :ref:`LDS scheduler <desc-lds>`
+        due to bank conflicts (as determined by the conflict resolution hardware)
+        to the base number of cycles that would be spent in the LDS scheduler in
+        a completely uncontended case. This is the unnormalized form of the Bank
         Conflict Rate.
       unit: Conflicts per Access
     Index Accesses:
       plain: The total number of cycles spent in the LDS scheduler over all operations
         per normalization unit.
-      rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>`  over
+      rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` over
         all operations per :ref:`normalization unit <normalization-units>`.
       unit: Cycles per normalization unit
     Atomic Return Cycles:
       plain: The total number of cycles spent on LDS atomics with return per normalization
         unit.
-      rst: The total number of cycles spent on LDS atomics with return per  :ref:`normalization
+      rst: The total number of cycles spent on LDS atomics with return per :ref:`normalization
         unit <normalization-units>`.
       unit: Cycles per normalization unit
     Bank Conflict:
       plain: The total number of cycles spent in the LDS scheduler due to bank conflicts
         (as determined by the conflict resolution hardware) per normalization unit.
-      rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>`  due
-        to bank conflicts (as determined by the conflict resolution hardware)  per
+      rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due
+        to bank conflicts (as determined by the conflict resolution hardware) per
         :ref:`normalization unit <normalization-units>`.
       unit: Cycles per normalization unit
     Addr Conflict:
       plain: The total number of cycles spent in the LDS scheduler due to address
         conflicts (as determined by the conflict resolution hardware) per normalization
         unit.
-      rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>`  due
-        to address conflicts (as determined by the conflict resolution  hardware)
+      rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due
+        to address conflicts (as determined by the conflict resolution hardware)
         per :ref:`normalization unit <normalization-units>`.
       unit: Cycles per normalization unit
     Unaligned Stall:
       plain: The total number of cycles spent in the LDS scheduler due to stalls from
         non-dword aligned addresses per normalization unit.
-      rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>`  due
-        to stalls from non-dword aligned addresses per  :ref:`normalization unit <normalization-units>`.
+      rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due
+        to stalls from non-dword aligned addresses per :ref:`normalization unit <normalization-units>`.
       unit: Cycles per normalization unit
     Mem Violations:
-      plain: "The total number of out-of-bounds accesses made to the LDS, per normalization\
-        \ unit. This is unused and expected to be zero in most configurations for\
-        \ modern CDNA\u2122 accelerators."
-      rst: "The total number of out-of-bounds accesses made to the LDS, per  :ref:`normalization\
-        \ unit <normalization-units>`. This is unused and  expected to be zero in\
-        \ most configurations for modern CDNA\u2122 accelerators."
+      plain: >-
+        The total number of out-of-bounds accesses made to the LDS, per normalization
+        unit. This is unused and expected to be zero in most configurations for
+        modern CDNA™ accelerators.
+      rst: >-
+        The total number of out-of-bounds accesses made to the LDS, per :ref:`normalization
+        unit <normalization-units>`. This is unused and expected to be zero in
+        most configurations for modern CDNA™ accelerators.
       unit: Accesses per normalization unit
 - id: 1300
   title: Instruction Cache
@@ -9749,22 +9843,22 @@ panels:
             avg: AVG((SQC_ICACHE_REQ / $denom))
             min: MIN((SQC_ICACHE_REQ / $denom))
             max: MAX((SQC_ICACHE_REQ / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Hits:
             avg: AVG((SQC_ICACHE_HITS / $denom))
             min: MIN((SQC_ICACHE_HITS / $denom))
             max: MAX((SQC_ICACHE_HITS / $denom))
-            unit: (Hits  + $normUnit)
+            unit: (Hits + $normUnit)
           Misses - Non Duplicated:
             avg: AVG((SQC_ICACHE_MISSES / $denom))
             min: MIN((SQC_ICACHE_MISSES / $denom))
             max: MAX((SQC_ICACHE_MISSES / $denom))
-            unit: (Misses  + $normUnit)
+            unit: (Misses + $normUnit)
           Misses - Duplicated:
             avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom))
             min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom))
             max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom))
-            unit: (Misses  + $normUnit)
+            unit: (Misses + $normUnit)
           Cache Hit Rate:
             avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
               + SQC_ICACHE_MISSES_DUPLICATE)))
@@ -9784,22 +9878,22 @@ panels:
             avg: AVG((SQC_ICACHE_REQ / $denom))
             min: MIN((SQC_ICACHE_REQ / $denom))
             max: MAX((SQC_ICACHE_REQ / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Hits:
             avg: AVG((SQC_ICACHE_HITS / $denom))
             min: MIN((SQC_ICACHE_HITS / $denom))
             max: MAX((SQC_ICACHE_HITS / $denom))
-            unit: (Hits  + $normUnit)
+            unit: (Hits + $normUnit)
           Misses - Non Duplicated:
             avg: AVG((SQC_ICACHE_MISSES / $denom))
             min: MIN((SQC_ICACHE_MISSES / $denom))
             max: MAX((SQC_ICACHE_MISSES / $denom))
-            unit: (Misses  + $normUnit)
+            unit: (Misses + $normUnit)
           Misses - Duplicated:
             avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom))
             min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom))
             max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom))
-            unit: (Misses  + $normUnit)
+            unit: (Misses + $normUnit)
           Cache Hit Rate:
             avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
               + SQC_ICACHE_MISSES_DUPLICATE)))
@@ -9819,22 +9913,22 @@ panels:
             avg: AVG((SQC_ICACHE_REQ / $denom))
             min: MIN((SQC_ICACHE_REQ / $denom))
             max: MAX((SQC_ICACHE_REQ / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Hits:
             avg: AVG((SQC_ICACHE_HITS / $denom))
             min: MIN((SQC_ICACHE_HITS / $denom))
             max: MAX((SQC_ICACHE_HITS / $denom))
-            unit: (Hits  + $normUnit)
+            unit: (Hits + $normUnit)
           Misses - Non Duplicated:
             avg: AVG((SQC_ICACHE_MISSES / $denom))
             min: MIN((SQC_ICACHE_MISSES / $denom))
             max: MAX((SQC_ICACHE_MISSES / $denom))
-            unit: (Misses  + $normUnit)
+            unit: (Misses + $normUnit)
           Misses - Duplicated:
             avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom))
             min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom))
             max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom))
-            unit: (Misses  + $normUnit)
+            unit: (Misses + $normUnit)
           Cache Hit Rate:
             avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
               + SQC_ICACHE_MISSES_DUPLICATE)))
@@ -9854,22 +9948,22 @@ panels:
             avg: AVG((SQC_ICACHE_REQ / $denom))
             min: MIN((SQC_ICACHE_REQ / $denom))
             max: MAX((SQC_ICACHE_REQ / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Hits:
             avg: AVG((SQC_ICACHE_HITS / $denom))
             min: MIN((SQC_ICACHE_HITS / $denom))
             max: MAX((SQC_ICACHE_HITS / $denom))
-            unit: (Hits  + $normUnit)
+            unit: (Hits + $normUnit)
           Misses - Non Duplicated:
             avg: AVG((SQC_ICACHE_MISSES / $denom))
             min: MIN((SQC_ICACHE_MISSES / $denom))
             max: MAX((SQC_ICACHE_MISSES / $denom))
-            unit: (Misses  + $normUnit)
+            unit: (Misses + $normUnit)
           Misses - Duplicated:
             avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom))
             min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom))
             max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom))
-            unit: (Misses  + $normUnit)
+            unit: (Misses + $normUnit)
           Cache Hit Rate:
             avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
               + SQC_ICACHE_MISSES_DUPLICATE)))
@@ -9889,22 +9983,22 @@ panels:
             avg: AVG((SQC_ICACHE_REQ / $denom))
             min: MIN((SQC_ICACHE_REQ / $denom))
             max: MAX((SQC_ICACHE_REQ / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Hits:
             avg: AVG((SQC_ICACHE_HITS / $denom))
             min: MIN((SQC_ICACHE_HITS / $denom))
             max: MAX((SQC_ICACHE_HITS / $denom))
-            unit: (Hits  + $normUnit)
+            unit: (Hits + $normUnit)
           Misses - Non Duplicated:
             avg: AVG((SQC_ICACHE_MISSES / $denom))
             min: MIN((SQC_ICACHE_MISSES / $denom))
             max: MAX((SQC_ICACHE_MISSES / $denom))
-            unit: (Misses  + $normUnit)
+            unit: (Misses + $normUnit)
           Misses - Duplicated:
             avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom))
             min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom))
             max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom))
-            unit: (Misses  + $normUnit)
+            unit: (Misses + $normUnit)
           Cache Hit Rate:
             avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
               + SQC_ICACHE_MISSES_DUPLICATE)))
@@ -9924,22 +10018,22 @@ panels:
             avg: AVG((SQC_ICACHE_REQ / $denom))
             min: MIN((SQC_ICACHE_REQ / $denom))
             max: MAX((SQC_ICACHE_REQ / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Hits:
             avg: AVG((SQC_ICACHE_HITS / $denom))
             min: MIN((SQC_ICACHE_HITS / $denom))
             max: MAX((SQC_ICACHE_HITS / $denom))
-            unit: (Hits  + $normUnit)
+            unit: (Hits + $normUnit)
           Misses - Non Duplicated:
             avg: AVG((SQC_ICACHE_MISSES / $denom))
             min: MIN((SQC_ICACHE_MISSES / $denom))
             max: MAX((SQC_ICACHE_MISSES / $denom))
-            unit: (Misses  + $normUnit)
+            unit: (Misses + $normUnit)
           Misses - Duplicated:
             avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom))
             min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom))
             max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom))
-            unit: (Misses  + $normUnit)
+            unit: (Misses + $normUnit)
           Cache Hit Rate:
             avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
               + SQC_ICACHE_MISSES_DUPLICATE)))
@@ -10005,25 +10099,27 @@ panels:
       plain: The number of bytes looked up in the L1I cache, as a percent of the peak
         theoretical bandwidth. Calculated as the ratio of L1I requests over the total
         L1I cycles.
-      rst: The number of bytes looked up in the L1I cache, as a percent of the peak  theoretical
-        bandwidth. Calculated as the ratio of L1I requests over the  :ref:`total L1I
+      rst: The number of bytes looked up in the L1I cache, as a percent of the peak theoretical
+        bandwidth. Calculated as the ratio of L1I requests over the :ref:`total L1I
         cycles <total-l1i-cycles>`.
       unit: Percent
     Cache Hit Rate:
       plain: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded
         line the cache. Calculated as the ratio of the number of L1I requests that
         hit over the number of all L1I requests.
-      rst: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded  line
-        the cache. Calculated as the ratio of the number of L1I requests  that hit
+      rst: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded line
+        in the cache. Calculated as the ratio of the number of L1I requests that hit
         over the number of all L1I requests.
       unit: Percent
     L1I-L2 Bandwidth Utilization:
-      plain: "The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth\
-        \ achieved. Calculated as the ratio of the total number of requests from the\
-        \ L1I to the L2 cache over the total L1I-L2 interface cycles."
-      rst: "The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth\
-        \  achieved. Calculated as the ratio of the total number of requests from\
-        \  the L1I to the L2 cache over the  :ref:`total L1I-L2 interface cycles <total-l1i-cycles>`."
+      plain: >-
+        The percent of the peak theoretical L1I → L2 cache request bandwidth
+        achieved. Calculated as the ratio of the total number of requests from the
+        L1I to the L2 cache over the total L1I-L2 interface cycles.
+      rst: >-
+        The percent of the peak theoretical L1I → L2 cache request bandwidth
+        achieved. Calculated as the ratio of the total number of requests from
+        the L1I to the L2 cache over the :ref:`total L1I-L2 interface cycles <total-l1i-cycles>`.
       unit: Percent
     L1I-L2 Bandwidth:
       plain: Total number of bytes transferred across L1I - L2 interface divided by total duration.
@@ -10036,26 +10132,26 @@ panels:
     Hits:
       plain: The total number of L1I requests that hit on a previously loaded cache
         line, per normalization-unit.
-      rst: The total number of L1I requests that hit on a previously loaded cache  line,
+      rst: The total number of L1I requests that hit on a previously loaded cache line,
         per :ref:`normalization-unit <normalization-units>`.
       unit: Requests per normalization unit
     Misses - Non Duplicated:
       plain: The total number of L1I requests that missed on a cache line that were
         not already pending due to another request, per normalization-unit.
-      rst: The total number of L1I requests that missed on a cache line that  *were
-        not* already pending due to another request, per  :ref:`normalization-unit
-        <normalization-units>`. See note in  :ref:`desc-l1i-sol` for more detail.
+      rst: The total number of L1I requests that missed on a cache line that *were
+        not* already pending due to another request, per :ref:`normalization-unit
+        <normalization-units>`. See note in :ref:`desc-l1i-sol` for more detail.
       unit: Requests per normalization unit
     Misses - Duplicated:
       plain: The total number of L1I requests that missed on a cache line that were
         already pending due to another request, per normalization-unit.
-      rst: The total number of L1I requests that missed on a cache line that *were*  already
-        pending due to another request, per  :ref:`normalization-unit <normalization-units>`.
-        See note in  :ref:`desc-l1i-sol` for more detail.
+      rst: The total number of L1I requests that missed on a cache line that *were* already
+        pending due to another request, per :ref:`normalization-unit <normalization-units>`.
+        See note in :ref:`desc-l1i-sol` for more detail.
       unit: Requests per normalization unit
     Instruction Fetch Latency:
       plain: The average number of cycles spent to fetch instructions to a CU.
-      rst: The average number of cycles spent to fetch instructions to a  :doc:`CU
+      rst: The average number of cycles spent to fetch instructions to a :doc:`CU
         <compute-unit>`.
       unit: Cycles
 - id: 1400
@@ -10171,22 +10267,22 @@ panels:
             avg: AVG((SQC_DCACHE_REQ / $denom))
             min: MIN((SQC_DCACHE_REQ / $denom))
             max: MAX((SQC_DCACHE_REQ / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Hits:
             avg: AVG((SQC_DCACHE_HITS / $denom))
             min: MIN((SQC_DCACHE_HITS / $denom))
             max: MAX((SQC_DCACHE_HITS / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Misses - Non Duplicated:
             avg: AVG((SQC_DCACHE_MISSES / $denom))
             min: MIN((SQC_DCACHE_MISSES / $denom))
             max: MAX((SQC_DCACHE_MISSES / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Misses- Duplicated:
             avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom))
             min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom))
             max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Cache Hit Rate:
             avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
               + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
@@ -10205,58 +10301,58 @@ panels:
               + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
             max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
               + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Atomic Req:
             avg: AVG((SQC_DCACHE_ATOMIC / $denom))
             min: MIN((SQC_DCACHE_ATOMIC / $denom))
             max: MAX((SQC_DCACHE_ATOMIC / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req (1 DWord):
             avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom))
             min: MIN((SQC_DCACHE_REQ_READ_1 / $denom))
             max: MAX((SQC_DCACHE_REQ_READ_1 / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req (2 DWord):
             avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom))
             min: MIN((SQC_DCACHE_REQ_READ_2 / $denom))
             max: MAX((SQC_DCACHE_REQ_READ_2 / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req (4 DWord):
             avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom))
             min: MIN((SQC_DCACHE_REQ_READ_4 / $denom))
             max: MAX((SQC_DCACHE_REQ_READ_4 / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req (8 DWord):
             avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom))
             min: MIN((SQC_DCACHE_REQ_READ_8 / $denom))
             max: MAX((SQC_DCACHE_REQ_READ_8 / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req (16 DWord):
             avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom))
             min: MIN((SQC_DCACHE_REQ_READ_16 / $denom))
             max: MAX((SQC_DCACHE_REQ_READ_16 / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
         gfx941:
           Req:
             avg: AVG((SQC_DCACHE_REQ / $denom))
             min: MIN((SQC_DCACHE_REQ / $denom))
             max: MAX((SQC_DCACHE_REQ / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Hits:
             avg: AVG((SQC_DCACHE_HITS / $denom))
             min: MIN((SQC_DCACHE_HITS / $denom))
             max: MAX((SQC_DCACHE_HITS / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Misses - Non Duplicated:
             avg: AVG((SQC_DCACHE_MISSES / $denom))
             min: MIN((SQC_DCACHE_MISSES / $denom))
             max: MAX((SQC_DCACHE_MISSES / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Misses- Duplicated:
             avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom))
             min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom))
             max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Cache Hit Rate:
             avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
               + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
@@ -10275,58 +10371,58 @@ panels:
               + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
             max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
               + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Atomic Req:
             avg: AVG((SQC_DCACHE_ATOMIC / $denom))
             min: MIN((SQC_DCACHE_ATOMIC / $denom))
             max: MAX((SQC_DCACHE_ATOMIC / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req (1 DWord):
             avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom))
             min: MIN((SQC_DCACHE_REQ_READ_1 / $denom))
             max: MAX((SQC_DCACHE_REQ_READ_1 / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req (2 DWord):
             avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom))
             min: MIN((SQC_DCACHE_REQ_READ_2 / $denom))
             max: MAX((SQC_DCACHE_REQ_READ_2 / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req (4 DWord):
             avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom))
             min: MIN((SQC_DCACHE_REQ_READ_4 / $denom))
             max: MAX((SQC_DCACHE_REQ_READ_4 / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req (8 DWord):
             avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom))
             min: MIN((SQC_DCACHE_REQ_READ_8 / $denom))
             max: MAX((SQC_DCACHE_REQ_READ_8 / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req (16 DWord):
             avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom))
             min: MIN((SQC_DCACHE_REQ_READ_16 / $denom))
             max: MAX((SQC_DCACHE_REQ_READ_16 / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
         gfx940:
           Req:
             avg: AVG((SQC_DCACHE_REQ / $denom))
             min: MIN((SQC_DCACHE_REQ / $denom))
             max: MAX((SQC_DCACHE_REQ / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Hits:
             avg: AVG((SQC_DCACHE_HITS / $denom))
             min: MIN((SQC_DCACHE_HITS / $denom))
             max: MAX((SQC_DCACHE_HITS / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Misses - Non Duplicated:
             avg: AVG((SQC_DCACHE_MISSES / $denom))
             min: MIN((SQC_DCACHE_MISSES / $denom))
             max: MAX((SQC_DCACHE_MISSES / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Misses- Duplicated:
             avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom))
             min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom))
             max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Cache Hit Rate:
             avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
               + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
@@ -10345,58 +10441,58 @@ panels:
               + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
             max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
               + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Atomic Req:
             avg: AVG((SQC_DCACHE_ATOMIC / $denom))
             min: MIN((SQC_DCACHE_ATOMIC / $denom))
             max: MAX((SQC_DCACHE_ATOMIC / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req (1 DWord):
             avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom))
             min: MIN((SQC_DCACHE_REQ_READ_1 / $denom))
             max: MAX((SQC_DCACHE_REQ_READ_1 / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req (2 DWord):
             avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom))
             min: MIN((SQC_DCACHE_REQ_READ_2 / $denom))
             max: MAX((SQC_DCACHE_REQ_READ_2 / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req (4 DWord):
             avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom))
             min: MIN((SQC_DCACHE_REQ_READ_4 / $denom))
             max: MAX((SQC_DCACHE_REQ_READ_4 / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req (8 DWord):
             avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom))
             min: MIN((SQC_DCACHE_REQ_READ_8 / $denom))
             max: MAX((SQC_DCACHE_REQ_READ_8 / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req (16 DWord):
             avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom))
             min: MIN((SQC_DCACHE_REQ_READ_16 / $denom))
             max: MAX((SQC_DCACHE_REQ_READ_16 / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
         gfx942:
           Req:
             avg: AVG((SQC_DCACHE_REQ / $denom))
             min: MIN((SQC_DCACHE_REQ / $denom))
             max: MAX((SQC_DCACHE_REQ / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Hits:
             avg: AVG((SQC_DCACHE_HITS / $denom))
             min: MIN((SQC_DCACHE_HITS / $denom))
             max: MAX((SQC_DCACHE_HITS / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Misses - Non Duplicated:
             avg: AVG((SQC_DCACHE_MISSES / $denom))
             min: MIN((SQC_DCACHE_MISSES / $denom))
             max: MAX((SQC_DCACHE_MISSES / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Misses- Duplicated:
             avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom))
             min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom))
             max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Cache Hit Rate:
             avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
               + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
@@ -10415,58 +10511,58 @@ panels:
               + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
             max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
               + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Atomic Req:
             avg: AVG((SQC_DCACHE_ATOMIC / $denom))
             min: MIN((SQC_DCACHE_ATOMIC / $denom))
             max: MAX((SQC_DCACHE_ATOMIC / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req (1 DWord):
             avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom))
             min: MIN((SQC_DCACHE_REQ_READ_1 / $denom))
             max: MAX((SQC_DCACHE_REQ_READ_1 / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req (2 DWord):
             avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom))
             min: MIN((SQC_DCACHE_REQ_READ_2 / $denom))
             max: MAX((SQC_DCACHE_REQ_READ_2 / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req (4 DWord):
             avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom))
             min: MIN((SQC_DCACHE_REQ_READ_4 / $denom))
             max: MAX((SQC_DCACHE_REQ_READ_4 / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req (8 DWord):
             avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom))
             min: MIN((SQC_DCACHE_REQ_READ_8 / $denom))
             max: MAX((SQC_DCACHE_REQ_READ_8 / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req (16 DWord):
             avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom))
             min: MIN((SQC_DCACHE_REQ_READ_16 / $denom))
             max: MAX((SQC_DCACHE_REQ_READ_16 / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
         gfx950:
           Req:
             avg: AVG((SQC_DCACHE_REQ / $denom))
             min: MIN((SQC_DCACHE_REQ / $denom))
             max: MAX((SQC_DCACHE_REQ / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Hits:
             avg: AVG((SQC_DCACHE_HITS / $denom))
             min: MIN((SQC_DCACHE_HITS / $denom))
             max: MAX((SQC_DCACHE_HITS / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Misses - Non Duplicated:
             avg: AVG((SQC_DCACHE_MISSES / $denom))
             min: MIN((SQC_DCACHE_MISSES / $denom))
             max: MAX((SQC_DCACHE_MISSES / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Misses- Duplicated:
             avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom))
             min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom))
             max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Cache Hit Rate:
             avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
               + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
@@ -10485,58 +10581,58 @@ panels:
               + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
             max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
               + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Atomic Req:
             avg: AVG((SQC_DCACHE_ATOMIC / $denom))
             min: MIN((SQC_DCACHE_ATOMIC / $denom))
             max: MAX((SQC_DCACHE_ATOMIC / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req (1 DWord):
             avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom))
             min: MIN((SQC_DCACHE_REQ_READ_1 / $denom))
             max: MAX((SQC_DCACHE_REQ_READ_1 / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req (2 DWord):
             avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom))
             min: MIN((SQC_DCACHE_REQ_READ_2 / $denom))
             max: MAX((SQC_DCACHE_REQ_READ_2 / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req (4 DWord):
             avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom))
             min: MIN((SQC_DCACHE_REQ_READ_4 / $denom))
             max: MAX((SQC_DCACHE_REQ_READ_4 / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req (8 DWord):
             avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom))
             min: MIN((SQC_DCACHE_REQ_READ_8 / $denom))
             max: MAX((SQC_DCACHE_REQ_READ_8 / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req (16 DWord):
             avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom))
             min: MIN((SQC_DCACHE_REQ_READ_16 / $denom))
             max: MAX((SQC_DCACHE_REQ_READ_16 / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
         gfx908:
           Req:
             avg: AVG((SQC_DCACHE_REQ / $denom))
             min: MIN((SQC_DCACHE_REQ / $denom))
             max: MAX((SQC_DCACHE_REQ / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Hits:
             avg: AVG((SQC_DCACHE_HITS / $denom))
             min: MIN((SQC_DCACHE_HITS / $denom))
             max: MAX((SQC_DCACHE_HITS / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Misses - Non Duplicated:
             avg: AVG((SQC_DCACHE_MISSES / $denom))
             min: MIN((SQC_DCACHE_MISSES / $denom))
             max: MAX((SQC_DCACHE_MISSES / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Misses- Duplicated:
             avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom))
             min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom))
             max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Cache Hit Rate:
             avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
               + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
@@ -10555,37 +10651,37 @@ panels:
               + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
             max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
               + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Atomic Req:
             avg: AVG((SQC_DCACHE_ATOMIC / $denom))
             min: MIN((SQC_DCACHE_ATOMIC / $denom))
             max: MAX((SQC_DCACHE_ATOMIC / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req (1 DWord):
             avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom))
             min: MIN((SQC_DCACHE_REQ_READ_1 / $denom))
             max: MAX((SQC_DCACHE_REQ_READ_1 / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req (2 DWord):
             avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom))
             min: MIN((SQC_DCACHE_REQ_READ_2 / $denom))
             max: MAX((SQC_DCACHE_REQ_READ_2 / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req (4 DWord):
             avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom))
             min: MIN((SQC_DCACHE_REQ_READ_4 / $denom))
             max: MAX((SQC_DCACHE_REQ_READ_4 / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req (8 DWord):
             avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom))
             min: MIN((SQC_DCACHE_REQ_READ_8 / $denom))
             max: MAX((SQC_DCACHE_REQ_READ_8 / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req (16 DWord):
             avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom))
             min: MIN((SQC_DCACHE_REQ_READ_16 / $denom))
             max: MAX((SQC_DCACHE_REQ_READ_16 / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
   - metric_table:
       id: 1403
       title: Scalar L1D Cache - L2 Interface
@@ -10609,22 +10705,22 @@ panels:
             avg: AVG((SQC_TC_DATA_READ_REQ / $denom))
             min: MIN((SQC_TC_DATA_READ_REQ / $denom))
             max: MAX((SQC_TC_DATA_READ_REQ / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write Req:
             avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom))
             min: MIN((SQC_TC_DATA_WRITE_REQ / $denom))
             max: MAX((SQC_TC_DATA_WRITE_REQ / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Atomic Req:
             avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom))
             min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom))
             max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Stall Cycles:
             avg: AVG((SQC_TC_STALL / $denom))
             min: MIN((SQC_TC_STALL / $denom))
             max: MAX((SQC_TC_STALL / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
         gfx941:
           sL1D-L2 BW:
             avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
@@ -10638,22 +10734,22 @@ panels:
             avg: AVG((SQC_TC_DATA_READ_REQ / $denom))
             min: MIN((SQC_TC_DATA_READ_REQ / $denom))
             max: MAX((SQC_TC_DATA_READ_REQ / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write Req:
             avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom))
             min: MIN((SQC_TC_DATA_WRITE_REQ / $denom))
             max: MAX((SQC_TC_DATA_WRITE_REQ / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Atomic Req:
             avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom))
             min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom))
             max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Stall Cycles:
             avg: AVG((SQC_TC_STALL / $denom))
             min: MIN((SQC_TC_STALL / $denom))
             max: MAX((SQC_TC_STALL / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
         gfx940:
           sL1D-L2 BW:
             avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
@@ -10667,22 +10763,22 @@ panels:
             avg: AVG((SQC_TC_DATA_READ_REQ / $denom))
             min: MIN((SQC_TC_DATA_READ_REQ / $denom))
             max: MAX((SQC_TC_DATA_READ_REQ / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write Req:
             avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom))
             min: MIN((SQC_TC_DATA_WRITE_REQ / $denom))
             max: MAX((SQC_TC_DATA_WRITE_REQ / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Atomic Req:
             avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom))
             min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom))
             max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Stall Cycles:
             avg: AVG((SQC_TC_STALL / $denom))
             min: MIN((SQC_TC_STALL / $denom))
             max: MAX((SQC_TC_STALL / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
         gfx942:
           sL1D-L2 BW:
             avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
@@ -10696,22 +10792,22 @@ panels:
             avg: AVG((SQC_TC_DATA_READ_REQ / $denom))
             min: MIN((SQC_TC_DATA_READ_REQ / $denom))
             max: MAX((SQC_TC_DATA_READ_REQ / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write Req:
             avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom))
             min: MIN((SQC_TC_DATA_WRITE_REQ / $denom))
             max: MAX((SQC_TC_DATA_WRITE_REQ / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Atomic Req:
             avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom))
             min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom))
             max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Stall Cycles:
             avg: AVG((SQC_TC_STALL / $denom))
             min: MIN((SQC_TC_STALL / $denom))
             max: MAX((SQC_TC_STALL / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
         gfx950:
           sL1D-L2 BW:
             avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
@@ -10725,22 +10821,22 @@ panels:
             avg: AVG((SQC_TC_DATA_READ_REQ / $denom))
             min: MIN((SQC_TC_DATA_READ_REQ / $denom))
             max: MAX((SQC_TC_DATA_READ_REQ / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write Req:
             avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom))
             min: MIN((SQC_TC_DATA_WRITE_REQ / $denom))
             max: MAX((SQC_TC_DATA_WRITE_REQ / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Atomic Req:
             avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom))
             min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom))
             max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Stall Cycles:
             avg: AVG((SQC_TC_STALL / $denom))
             min: MIN((SQC_TC_STALL / $denom))
             max: MAX((SQC_TC_STALL / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
         gfx908:
           sL1D-L2 BW:
             avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
@@ -10754,146 +10850,151 @@ panels:
             avg: AVG((SQC_TC_DATA_READ_REQ / $denom))
             min: MIN((SQC_TC_DATA_READ_REQ / $denom))
             max: MAX((SQC_TC_DATA_READ_REQ / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write Req:
             avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom))
             min: MIN((SQC_TC_DATA_WRITE_REQ / $denom))
             max: MAX((SQC_TC_DATA_WRITE_REQ / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Atomic Req:
             avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom))
             min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom))
             max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Stall Cycles:
             avg: AVG((SQC_TC_STALL / $denom))
             min: MIN((SQC_TC_STALL / $denom))
             max: MAX((SQC_TC_STALL / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
   metrics_description:
     Bandwidth Utilization:
       plain: The number of bytes looked up in the sL1D cache, as a percent of the
         peak theoretical bandwidth. Calculated as the ratio of sL1D requests over
         the total sL1D cycles.
-      rst: The number of bytes looked up in the sL1D cache, as a percent of the peak  theoretical
-        bandwidth. Calculated as the ratio of sL1D requests over the  :ref:`total
+      rst: The number of bytes looked up in the sL1D cache, as a percent of the peak theoretical
+        bandwidth. Calculated as the ratio of sL1D requests over the :ref:`total
         sL1D cycles <total-sl1d-cycles>`.
       unit: Percent
     Cache Hit Rate:
       plain: Indicates the percent of sL1D requests that hit on a previously loaded
         line the cache. The ratio of the number of sL1D requests that hit over the
         number of all sL1D requests.
-      rst: Indicates the percent of sL1D requests that hit on a previously loaded  line
-        the cache. The ratio of the number of sL1D requests that hit  [#sl1d-cache]_
+      rst: Indicates the percent of sL1D requests that hit on a previously loaded line in
+        the cache. The ratio of the number of sL1D requests that hit [#sl1d-cache]_
         over the number of all sL1D requests.
       unit: Percent
     sL1D-L2 BW Utilization:
-      plain: The percentage of the peak theoretical sL1D - L2 interface bandwidth acheived.\
-        \ Caclulated as total number of bytes read from, written to, or atomically updated\
-        \ across the sL1D - L2 interface.
-      rst: The percentage of the peak theoretical sL1D - L2 interface bandwidth acheived.\
-        \ Caclulated as total number of bytes read from, written to, or atomically updated\
-        \ across the sL1D - L2 interface.
+      plain: The percentage of the peak theoretical sL1D - L2 interface bandwidth achieved.
+        Calculated as total number of bytes read from, written to, or atomically updated
+        across the sL1D - L2 interface.
+      rst: The percentage of the peak theoretical sL1D - L2 interface bandwidth achieved.
+        Calculated as total number of bytes read from, written to, or atomically updated
+        across the sL1D - L2 interface.
       unit: Percent
     sL1D-L2 BW:
-      plain: "The total number of bytes read from, written to, or atomically updated\
-        \ across the sL1D\u2194L2 interface, divided by total duration. Note that sL1D\
-        \ writes and atomics are typically unused on current CDNA accelerators, so\
-        \ in the majority of cases this can be interpreted as an sL1D\u2192L2 read\
-        \ bandwidth."
-      rst: "The total number of bytes read from, written to, or atomically updated\
-        \  across the sL1D\u2194:doc:`L2 <l2-cache>` interface, divided by total duration.\
-        \ Note that sL1D writes and atomics are typically\
-        \ unused on current CDNA accelerators, so in the  majority of cases this can\
-        \ be interpreted as an sL1D\u2192L2 read bandwidth."
+      plain: "The total number of bytes read from, written to, or atomically
+        updated across the sL1D\u2194L2 interface, divided by total
+        duration. Note that sL1D writes and atomics are typically unused
+        on current CDNA accelerators, so in the majority of cases this
+        can be interpreted as an
+        sL1D\u2192L2 read bandwidth."
+      rst: "The total number of bytes read from, written to, or atomically
+        updated across the sL1D\u2194:doc:`L2 <l2-cache>` interface, divided
+        by total duration. Note that sL1D writes and atomics are typically
+        unused on current CDNA accelerators, so in the majority of cases
+        this can be interpreted as an
+        sL1D\u2192L2 read bandwidth."
       unit: Gbps
     Req:
       plain: The total number of requests, of any size or type, made to the sL1D per
         normalization unit.
-      rst: The total number of requests, of any size or type, made to the sL1D per  :ref:`normalization
+      rst: The total number of requests, of any size or type, made to the sL1D per :ref:`normalization
         unit <normalization-units>`.
       unit: Requests per normalization unit
     Hits:
       plain: The total number of sL1D requests that hit on a previously loaded cache
         line, per normalization unit.
-      rst: The total number of sL1D requests that hit on a previously loaded cache  line,
+      rst: The total number of sL1D requests that hit on a previously loaded cache line,
         per :ref:`normalization unit <normalization-units>`.
       unit: Requests per normalization unit
     Misses - Non Duplicated:
-      plain: 'The total number of sL1D requests that missed on a cache line that was
-        not already pending due to another request, per normalization unit. '
-      rst: The total number of sL1D requests that missed on a cache line that *was  not*
-        already pending due to another request, per  :ref:`normalization unit <normalization-units>`.
-        See :ref:`desc-sl1d-sol`  for more detail.
+      plain: >-
+        The total number of sL1D requests that missed on a cache line that was
+        not already pending due to another request, per normalization unit.
+      rst: The total number of sL1D requests that missed on a cache line that *was not*
+        already pending due to another request, per :ref:`normalization unit <normalization-units>`.
+        See :ref:`desc-sl1d-sol` for more detail.
       unit: Requests per normalization unit
     Misses- Duplicated:
       plain: The total number of sL1D requests that missed on a cache line that was
         already pending due to another request, per normalization unit.
-      rst: The total number of sL1D requests that missed on a cache line that *was*  already
-        pending due to another request, per  :ref:`normalization unit <normalization-units>`.
-        See  :ref:`desc-sl1d-sol` for more detail.
+      rst: The total number of sL1D requests that missed on a cache line that *was* already
+        pending due to another request, per :ref:`normalization unit <normalization-units>`.
+        See :ref:`desc-sl1d-sol` for more detail.
       unit: Requests per normalization unit
     Read Req (Total):
       plain: The total number of sL1D read requests of any size, per normalization
         unit.
-      rst: The total number of sL1D read requests of any size, per  :ref:`normalization
+      rst: The total number of sL1D read requests of any size, per :ref:`normalization
         unit <normalization-units>`.
       unit: Requests per normalization unit
     Atomic Req:
       plain: The total number of atomic requests from sL1D to the L2, per normalization
         unit. Typically unused on current CDNA accelerators.
-      rst: The total number of atomic requests from sL1D to the  :doc:`L2 <l2-cache>`,
-        per  :ref:`normalization unit <normalization-units>`. Typically unused on  current
+      rst: The total number of atomic requests from sL1D to the :doc:`L2 <l2-cache>`,
+        per :ref:`normalization unit <normalization-units>`. Typically unused on current
         CDNA accelerators.
       unit: Requests per normalization unit
     Read Req (1 DWord):
       plain: The total number of sL1D read requests made for a single dword of data
         (4B), per normalization unit.
-      rst: The total number of sL1D read requests made for a single dword of data  (4B),
+      rst: The total number of sL1D read requests made for a single dword of data (4B),
         per :ref:`normalization unit <normalization-units>`.
       unit: Requests per normalization unit
     Read Req (2 DWord):
       plain: The total number of sL1D read requests made for a two dwords of data
         (8B), per normalization unit.
-      rst: The total number of sL1D read requests made for a two dwords of data  (8B),
+      rst: The total number of sL1D read requests made for two dwords of data (8B),
         per :ref:`normalization unit <normalization-units>`.
       unit: Requests per normalization unit
     Read Req (4 DWord):
       plain: The total number of sL1D read requests made for a four dwords of data
         (16B), per normalization unit.
-      rst: The total number of sL1D read requests made for a four dwords of data  (16B),
+      rst: The total number of sL1D read requests made for four dwords of data (16B),
         per :ref:`normalization unit <normalization-units>`.
       unit: Requests per normalization unit
     Read Req (8 DWord):
       plain: The total number of sL1D read requests made for a eight dwords of data
         (32B), per normalization unit.
-      rst: The total number of sL1D read requests made for a eight dwords of data  (32B),
+      rst: The total number of sL1D read requests made for eight dwords of data (32B),
         per :ref:`normalization unit <normalization-units>`.
       unit: Requests per normalization unit
     Read Req (16 DWord):
       plain: The total number of sL1D read requests made for a sixteen dwords of data
         (64B), per normalization unit.
-      rst: The total number of sL1D read requests made for a sixteen dwords of data  (64B),
+      rst: The total number of sL1D read requests made for sixteen dwords of data (64B),
         per :ref:`normalization unit <normalization-units>`.
       unit: Requests per normalization unit
     Read Req:
       plain: The total number of read requests from sL1D to the L2 per normalization
         unit.
-      rst: The total number of read requests from sL1D to the :doc:`L2 <l2-cache>`,  per
+      rst: The total number of read requests from sL1D to the :doc:`L2 <l2-cache>`, per
         :ref:`normalization unit <normalization-units>`.
       unit: Requests per normalization unit
     Write Req:
       plain: The total number of write requests from sL1D to the L2, per normalization
         unit. Typically unused on current CDNA accelerators.
-      rst: The total number of write requests from sL1D to the :doc:`L2 <l2-cache>`,  per
-        :ref:`normalization unit <normalization-units>`. Typically unused on  current
+      rst: The total number of write requests from sL1D to the :doc:`L2 <l2-cache>`, per
+        :ref:`normalization unit <normalization-units>`. Typically unused on current
         CDNA accelerators.
       unit: Requests per normalization unit
     Stall Cycles:
-      plain: "The total number of cycles the sL1D\u2194L2 interface was stalled, per\
-        \ normalization unit."
-      rst: "The total number of cycles the sL1D\u2194  :doc:`L2 <l2-cache>` interface\
-        \ was stalled, per  :ref:`normalization unit <normalization-units>`."
+      plain: "The total number of cycles the
+        sL1D\u2194L2 interface was stalled, per
+        normalization unit."
+      rst: "The total number of cycles the sL1D\u2194 :doc:`L2 <l2-cache>`
+        interface was stalled, per
+        :ref:`normalization unit <normalization-units>`."
       unit: Cycles per normalization unit
 - id: 1500
   title: Address Processing Unit and Data Return Path (TA/TD)
@@ -11178,287 +11279,287 @@ panels:
             avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom))
             min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom))
             max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Global/Generic Instructions:
             avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
             min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
             max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Global/Generic Read Instructions:
             avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
             min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
             max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Global/Generic Write Instructions:
             avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
             min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
             max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Global/Generic Atomic Instructions:
             avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
             min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
             max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Spill/Stack Instructions:
             avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
             min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
             max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Spill/Stack Read Instructions:
             avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
             min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
             max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Spill/Stack Write Instructions:
             avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
             min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
             max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Spill/Stack Atomic Instructions:
             avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
             min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            max: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
+            max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
         gfx941:
           Total Instructions:
             avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom))
             min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom))
             max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Global/Generic Instructions:
             avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
             min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
             max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Global/Generic Read Instructions:
             avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
             min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
             max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Global/Generic Write Instructions:
             avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
             min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
             max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Global/Generic Atomic Instructions:
             avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
             min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
             max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Spill/Stack Instructions:
             avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
             min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
             max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Spill/Stack Read Instructions:
             avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
             min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
             max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Spill/Stack Write Instructions:
             avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
             min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
             max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Spill/Stack Atomic Instructions:
             avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
             min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
             max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
         gfx940:
           Total Instructions:
             avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom))
             min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom))
             max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Global/Generic Instructions:
             avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
             min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
             max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Global/Generic Read Instructions:
             avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
             min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
             max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Global/Generic Write Instructions:
             avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
             min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
             max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Global/Generic Atomic Instructions:
             avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
             min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
             max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Spill/Stack Instructions:
             avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
             min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
             max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Spill/Stack Read Instructions:
             avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
             min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
             max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Spill/Stack Write Instructions:
             avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
             min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
             max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Spill/Stack Atomic Instructions:
             avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
             min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
             max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
         gfx942:
           Total Instructions:
             avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom))
             min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom))
             max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Global/Generic Instructions:
             avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
             min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
             max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Global/Generic Read Instructions:
             avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
             min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
             max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Global/Generic Write Instructions:
             avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
             min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
             max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Global/Generic Atomic Instructions:
             avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
             min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
             max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Spill/Stack Instructions:
             avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
             min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
             max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Spill/Stack Read Instructions:
             avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
             min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
             max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Spill/Stack Write Instructions:
             avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
             min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
             max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Spill/Stack Atomic Instructions:
             avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
             min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
             max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
         gfx950:
           Total Instructions:
             avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom))
             min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom))
             max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Global/Generic Instructions:
             avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
             min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
             max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Global/Generic Read Instructions:
             avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
             min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
             max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Global/Generic Read Instructions for LDS:
             avg: AVG((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom))
             min: MIN((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom))
             max: MAX((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Global/Generic Write Instructions:
             avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
             min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
             max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Global/Generic Atomic Instructions:
             avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
             min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
             max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Spill/Stack Instructions:
             avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
             min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
             max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Spill/Stack Read Instructions:
             avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
             min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
             max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Spill/Stack Read Instructions for LDS:
             avg: AVG((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom))
             min: MIN((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom))
             max: MAX((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Spill/Stack Write Instructions:
             avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
             min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
             max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Spill/Stack Atomic Instructions:
             avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
             min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
             max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
         gfx908:
           Total Instructions:
             avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom))
             min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom))
             max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Global/Generic Instructions:
             avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
             min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
             max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Global/Generic Read Instructions:
             avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
             min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
             max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Global/Generic Write Instructions:
             avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
             min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
             max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Global/Generic Atomic Instructions:
             avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
             min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
             max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Spill/Stack Instructions:
             avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
             min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
             max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Spill/Stack Read Instructions:
             avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
             min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
             max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Spill/Stack Write Instructions:
             avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
             min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
             max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Spill/Stack Atomic Instructions:
             avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
             min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
             max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
   - metric_table:
       id: 1503
       title: Spill and stack metrics
@@ -11474,97 +11575,97 @@ panels:
             avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
             min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
             max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Spill/Stack Coalesced Read:
             avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
             min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
             max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Spill/Stack Coalesced Write:
             avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
             min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
             max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
         gfx941:
           Spill/Stack Total Cycles:
             avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
             min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
             max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Spill/Stack Coalesced Read:
             avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
             min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
             max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Spill/Stack Coalesced Write:
             avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
             min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
             max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
         gfx940:
           Spill/Stack Total Cycles:
             avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
             min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
             max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Spill/Stack Coalesced Read:
             avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
             min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
             max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Spill/Stack Coalesced Write:
             avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
             min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
             max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
         gfx942:
           Spill/Stack Total Cycles:
             avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
             min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
             max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Spill/Stack Coalesced Read:
             avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
             min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
             max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Spill/Stack Coalesced Write:
             avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
             min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
             max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
         gfx950:
           Spill/Stack Total Cycles:
             avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
             min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
             max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Spill/Stack Coalesced Read:
             avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
             min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
             max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Spill/Stack Coalesced Write:
             avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
             min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
             max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
         gfx908:
           Spill/Stack Total Cycles:
             avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
             min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
             max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Spill/Stack Coalesced Read:
             avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
             min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
             max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           Spill/Stack Coalesced Write:
             avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
             min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
             max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
   - metric_table:
       id: 1504
       title: Vector L1 data-return path or Texture Data (TD)
@@ -11595,7 +11696,7 @@ panels:
             avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom))
             min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom))
             max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Read Instructions:
             avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
               / $denom))
@@ -11603,17 +11704,17 @@ panels:
               / $denom))
             max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
               / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Write Instructions:
             avg: AVG((TD_STORE_WAVEFRONT_sum / $denom))
             min: MIN((TD_STORE_WAVEFRONT_sum / $denom))
             max: MAX((TD_STORE_WAVEFRONT_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Atomic Instructions:
             avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom))
             min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom))
             max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
         gfx941:
           Data-Return Busy:
             avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
@@ -11634,7 +11735,7 @@ panels:
             avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom))
             min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom))
             max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Read Instructions:
             avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
               / $denom))
@@ -11642,17 +11743,17 @@ panels:
               / $denom))
             max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
               / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Write Instructions:
             avg: AVG((TD_STORE_WAVEFRONT_sum / $denom))
             min: MIN((TD_STORE_WAVEFRONT_sum / $denom))
             max: MAX((TD_STORE_WAVEFRONT_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Atomic Instructions:
             avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom))
             min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom))
             max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
         gfx940:
           Data-Return Busy:
             avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
@@ -11673,7 +11774,7 @@ panels:
             avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom))
             min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom))
             max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Read Instructions:
             avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
               / $denom))
@@ -11681,17 +11782,17 @@ panels:
               / $denom))
             max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
               / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Write Instructions:
             avg: AVG((TD_STORE_WAVEFRONT_sum / $denom))
             min: MIN((TD_STORE_WAVEFRONT_sum / $denom))
             max: MAX((TD_STORE_WAVEFRONT_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Atomic Instructions:
             avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom))
             min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom))
             max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
         gfx942:
           Data-Return Busy:
             avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
@@ -11712,7 +11813,7 @@ panels:
             avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom))
             min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom))
             max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Read Instructions:
             avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
               / $denom))
@@ -11720,17 +11821,17 @@ panels:
               / $denom))
             max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
               / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Write Instructions:
             avg: AVG((TD_STORE_WAVEFRONT_sum / $denom))
             min: MIN((TD_STORE_WAVEFRONT_sum / $denom))
             max: MAX((TD_STORE_WAVEFRONT_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Atomic Instructions:
             avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom))
             min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom))
             max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
         gfx950:
           Data-Return Busy:
             avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
@@ -11751,7 +11852,7 @@ panels:
             avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom))
             min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom))
             max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Read Instructions:
             avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
               / $denom))
@@ -11759,22 +11860,22 @@ panels:
               / $denom))
             max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
               / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Write Instructions:
             avg: AVG((TD_STORE_WAVEFRONT_sum / $denom))
             min: MIN((TD_STORE_WAVEFRONT_sum / $denom))
             max: MAX((TD_STORE_WAVEFRONT_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Atomic Instructions:
             avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom))
             min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom))
             max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Write Ack Instructions:
             avg: AVG((TD_WRITE_ACKT_WAVEFRONT_sum / $denom))
             min: MIN((TD_WRITE_ACKT_WAVEFRONT_sum / $denom))
             max: MAX((TD_WRITE_ACKT_WAVEFRONT_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
         gfx908:
           Data-Return Busy:
             avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
@@ -11790,7 +11891,7 @@ panels:
             avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom))
             min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom))
             max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Read Instructions:
             avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
               / $denom))
@@ -11798,40 +11899,40 @@ panels:
               / $denom))
             max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
               / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Write Instructions:
             avg: AVG((TD_STORE_WAVEFRONT_sum / $denom))
             min: MIN((TD_STORE_WAVEFRONT_sum / $denom))
             max: MAX((TD_STORE_WAVEFRONT_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
           Atomic Instructions:
             avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom))
             min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom))
             max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom))
-            unit: (Instructions  + $normUnit)
+            unit: (Instructions + $normUnit)
   metrics_description:
     Address Processing Unit Busy:
       plain: Percent of the total CU cycles the address processor was busy
-      rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address  processor
+      rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor
         was busy
       unit: Percent
     Address Stall:
       plain: Percent of the total CU cycles the address processor was stalled from
         sending address requests further into the vL1D pipeline.
-      rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address  processor
-        was stalled from sending address requests further into the vL1D  pipeline
+      rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor
+        was stalled from sending address requests further into the vL1D pipeline
       unit: Percent
     Data Stall:
       plain: Percent of the total CU cycles the address processor was stalled from
         sending write/atomic data further into the vL1D pipeline.
-      rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address  processor
-        was stalled from sending write/atomic data further into the  vL1D pipeline
+      rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor
+        was stalled from sending write/atomic data further into the vL1D pipeline
       unit: Percent
     "Data-Processor \u2192 Address Stall":
       plain: Percent of total CU cycles the address processor was stalled waiting
         to send command data to the data processor.
-      rst: Percent of :ref:`total CU cycles <total-cu-cycles>` the address processor  was
-        stalled waiting to send command data to the  :ref:`data processor <desc-td>`
+      rst: Percent of :ref:`total CU cycles <total-cu-cycles>` the address processor was
+        stalled waiting to send command data to the :ref:`data processor <desc-td>`
       unit: Percent
     Total Instructions:
       plain: The total number of memory instructions executed by the address processer
@@ -11842,136 +11943,136 @@ panels:
     Global/Generic Instructions:
       plain: The total number of global & generic memory instructions executed on
         all compute units on the accelerator, per normalization unit.
-      rst: The total number of global & generic memory instructions executed on all  :doc:`compute
-        units <compute-unit>` on the accelerator, per  :ref:`normalization unit <normalization-units>`.
+      rst: The total number of global & generic memory instructions executed on all :doc:`compute
+        units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
       unit: Instructions per normalization unit
     Global/Generic Read Instructions:
       plain: The total number of global & generic memory read instructions executed
         on all compute units on the accelerator, per normalization unit.
       rst: The total number of global & generic memory read instructions executed
-        on  all :doc:`compute units <compute-unit>` on the accelerator, per  :ref:`normalization
+        on all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
         unit <normalization-units>`.
       unit: Instructions per normalization unit
     Global/Generic Write Instructions:
       plain: The total number of global & generic memory write instructions executed
         on all compute units on the accelerator, per normalization unit.
-      rst: The total number of global & generic memory write instructions executed  on
-        all :doc:`compute units <compute-unit>` on the accelerator, per  :ref:`normalization
+      rst: The total number of global & generic memory write instructions executed on
+        all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
         unit <normalization-units>`.
       unit: Instructions per normalization unit
     Global/Generic Atomic Instructions:
       plain: The total number of global & generic memory atomic (with and without
         return) instructions executed on all compute units on the accelerator, per
         normalization unit.
-      rst: The total number of global & generic memory atomic (with and without  return)
-        instructions executed on all :doc:`compute units <compute-unit>`  on the accelerator,
+      rst: The total number of global & generic memory atomic (with and without return)
+        instructions executed on all :doc:`compute units <compute-unit>` on the accelerator,
         per :ref:`normalization unit <normalization-units>`.
       unit: Instructions per normalization unit
     Spill/Stack Instructions:
       plain: The total number of spill/stack memory instructions executed on all compute
         units on the accelerator, per normalization unit.
-      rst: The total number of spill/stack memory instructions executed on all  :doc:`compute
-        units <compute-unit>` on the accelerator, per  :ref:`normalization unit <normalization-units>`.
+      rst: The total number of spill/stack memory instructions executed on all :doc:`compute
+        units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
       unit: Instructions per normalization unit
     Spill/Stack Read Instructions:
       plain: The total number of spill/stack memory read instructions executed on
         all compute units on the accelerator, per normalization unit.
-      rst: The total number of spill/stack memory read instructions executed on all  :doc:`compute
-        units <compute-unit>` on the accelerator, per  :ref:`normalization unit <normalization-units>`.
+      rst: The total number of spill/stack memory read instructions executed on all :doc:`compute
+        units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
       unit: Instructions per normalization unit
     Spill/Stack Write Instructions:
       plain: The total number of spill/stack memory write instructions executed on
         all compute units on the accelerator, per normalization unit.
-      rst: The total number of spill/stack memory write instructions executed on all  :doc:`compute
-        units <compute-unit>` on the accelerator, per  :ref:`normalization unit <normalization-units>`.
+      rst: The total number of spill/stack memory write instructions executed on all :doc:`compute
+        units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
       unit: Instructions per normalization unit
     Spill/Stack Atomic Instructions:
       plain: The total number of spill/stack memory atomic (with and without return)
         instructions executed on all compute units on the accelerator, per normalization
         unit. Typically unused as these memory operations are typically used to implement
         thread-local storage.
-      rst: The total number of spill/stack memory atomic (with and without return)  instructions
-        executed on all :doc:`compute units <compute-unit>` on the  accelerator, per
-        :ref:`normalization unit <normalization-units>`.  Typically unused as these
-        memory operations are typically used to  implement thread-local storage.
+      rst: The total number of spill/stack memory atomic (with and without return) instructions
+        executed on all :doc:`compute units <compute-unit>` on the accelerator, per
+        :ref:`normalization unit <normalization-units>`. Typically unused as these
+        memory operations are typically used to implement thread-local storage.
       unit: Instructions per normalization unit
     Spill/Stack Total Cycles:
       plain: The number of cycles the address processing unit spent working on spill/stack
         instructions, per normalization unit.
-      rst: The number of cycles the address processing unit spent working on  spill/stack
-        instructions, per  :ref:`normalization unit <normalization-units>`.
+      rst: The number of cycles the address processing unit spent working on spill/stack
+        instructions, per :ref:`normalization unit <normalization-units>`.
       unit: Cycles per normalization unit
     Spill/Stack Coalesced Read:
       plain: The number of cycles the address processing unit spent working on coalesced
         spill/stack read instructions, per normalization unit.
-      rst: The number of cycles the address processing unit spent working on  coalesced
-        spill/stack read instructions, per  :ref:`normalization unit <normalization-units>`.
+      rst: The number of cycles the address processing unit spent working on coalesced
+        spill/stack read instructions, per :ref:`normalization unit <normalization-units>`.
       unit: Cycles per normalization unit
     Spill/Stack Coalesced Write:
       plain: The number of cycles the address processing unit spent working on coalesced
         spill/stack write instructions, per normalization unit.
-      rst: The number of cycles the address processing unit spent working on  coalesced
-        spill/stack write instructions, per  :ref:`normalization unit <normalization-units>`.
+      rst: The number of cycles the address processing unit spent working on coalesced
+        spill/stack write instructions, per :ref:`normalization unit <normalization-units>`.
       unit: Cycles per normalization unit
     Data-Return Busy:
       plain: Percent of the total CU cycles the data-return unit was busy processing
         or waiting on data to return to the CU.
-      rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return  unit
-        was busy processing or waiting on data to return to the  :doc:`CU <compute-unit>`.
+      rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return unit
+        was busy processing or waiting on data to return to the :doc:`CU <compute-unit>`.
       unit: Percent
     "Cache RAM \u2192 Data-Return Stall":
       plain: Percent of the total CU cycles the data-return unit was stalled on data
         to be returned from the vL1D Cache RAM.
-      rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return  unit
-        was stalled on data to be returned from the  :ref:`vL1D Cache RAM <desc-tc>`.
+      rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return unit
+        was stalled on data to be returned from the :ref:`vL1D Cache RAM <desc-tc>`.
       unit: Percent
     "Workgroup manager \u2192 Data-Return Stall":
       plain: Percent of the total CU cycles the data-return unit was stalled by the
         workgroup manager due to initialization of registers as a part of launching
         new workgroups.
-      rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return  unit
-        was stalled by the :ref:`workgroup manager <desc-spi>` due to  initialization
+      rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return unit
+        was stalled by the :ref:`workgroup manager <desc-spi>` due to initialization
         of registers as a part of launching new workgroups.
       unit: Percent
     Coalescable Instructions:
       plain: The number of instructions submitted to the data-return unit by the address
         processor that were found to be coalescable, per normalization unit.
-      rst: The number of instructions submitted to the  :ref:`data-return unit <desc-td>`
-        by the  :ref:`address processor <desc-ta>` that were found to be coalescable,
-        per  :ref:`normalization unit <normalization-units>`.
+      rst: The number of instructions submitted to the :ref:`data-return unit <desc-td>`
+        by the :ref:`address processor <desc-ta>` that were found to be coalescable,
+        per :ref:`normalization unit <normalization-units>`.
       unit: Instructions per normalization unit
     Read Instructions:
       plain: The number of read instructions submitted to the data-return unit by
         the address processor summed over all compute units on the accelerator, per
         normalization unit. This is expected to be the sum of global/generic and spill/stack
         reads in the address processor.
-      rst: The number of read instructions submitted to the  :ref:`data-return unit
-        <desc-td>` by the  :ref:`address processor <desc-ta>` summed over all  :doc:`compute
-        units <compute-unit>` on the accelerator, per  :ref:`normalization unit <normalization-units>`.
-        This is expected to be  the sum of global/generic and spill/stack reads in
-        the  :ref:`address processor <desc-ta>`.
+      rst: The number of read instructions submitted to the :ref:`data-return unit
+        <desc-td>` by the :ref:`address processor <desc-ta>` summed over all :doc:`compute
+        units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+        This is expected to be the sum of global/generic and spill/stack reads in
+        the :ref:`address processor <desc-ta>`.
       unit: Instructions per normalization unit
     Write Instructions:
       plain: The number of store instructions submitted to the data-return unit by
         the address processor summed over all compute units on the accelerator, per
         normalization unit. This is expected to be the sum of global/generic and spill/stack
         stores in the address processor.
-      rst: The number of store instructions submitted to the  :ref:`data-return unit
-        <desc-td>` by the  :ref:`address processor <desc-ta>` summed over all  :doc:`compute
-        units <compute-unit>` on the accelerator, per  :ref:`normalization unit <normalization-units>`.
-        This is expected to be  the sum of global/generic and spill/stack stores counted
-        by the  :ref:`vL1D cache-front-end <ta-instruction-counts>`.
+      rst: The number of store instructions submitted to the :ref:`data-return unit
+        <desc-td>` by the :ref:`address processor <desc-ta>` summed over all :doc:`compute
+        units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+        This is expected to be the sum of global/generic and spill/stack stores counted
+        by the :ref:`vL1D cache-front-end <ta-instruction-counts>`.
       unit: Instructions per normalization unit
     Atomic Instructions:
       plain: The number of atomic instructions submitted to the data-return unit by
         the address processor summed over all compute units on the accelerator, per
         normalization unit. This is expected to be the sum of global/generic and spill/stack
         atomics in the address processor.
-      rst: The number of atomic instructions submitted to the  :ref:`data-return unit
-        <desc-td>` by the  :ref:`address processor <desc-ta>` summed over all  :doc:`compute
-        units <compute-unit>` on the accelerator, per  :ref:`normalization unit <normalization-units>`.
-        This is expected to be  the sum of global/generic and spill/stack atomics
-        in the  :ref:`address processor <desc-ta>`.
+      rst: The number of atomic instructions submitted to the :ref:`data-return unit
+        <desc-td>` by the :ref:`address processor <desc-ta>` summed over all :doc:`compute
+        units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
+        This is expected to be the sum of global/generic and spill/stack atomics
+        in the :ref:`address processor <desc-ta>`.
       unit: Instructions per normalization unit
     Write Ack Instructions:
       plain: The total number of write acknowledgements submitted by data-return
@@ -12243,17 +12344,17 @@ panels:
             avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom))
             min: MIN((TCP_TOTAL_ACCESSES_sum / $denom))
             max: MAX((TCP_TOTAL_ACCESSES_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req:
             avg: AVG((TCP_TOTAL_READ_sum / $denom))
             min: MIN((TCP_TOTAL_READ_sum / $denom))
             max: MAX((TCP_TOTAL_READ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write Req:
             avg: AVG((TCP_TOTAL_WRITE_sum / $denom))
             min: MIN((TCP_TOTAL_WRITE_sum / $denom))
             max: MAX((TCP_TOTAL_WRITE_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Atomic Req:
             avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
               / $denom))
@@ -12261,7 +12362,7 @@ panels:
               / $denom))
             max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
               / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Cache BW:
             avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp)))
             min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp)))
@@ -12285,7 +12386,7 @@ panels:
             avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
             min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
             max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Cache Hits:
             avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
               + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
@@ -12296,7 +12397,7 @@ panels:
             max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
               + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
               / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Invalidations:
             avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
             min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
@@ -12314,12 +12415,12 @@ panels:
             avg: AVG((TCP_TCC_READ_REQ_sum / $denom))
             min: MIN((TCP_TCC_READ_REQ_sum / $denom))
             max: MAX((TCP_TCC_READ_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           L1-L2 Write:
             avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom))
             min: MIN((TCP_TCC_WRITE_REQ_sum / $denom))
             max: MAX((TCP_TCC_WRITE_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           L1-L2 Atomic:
             avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
               / $denom))
@@ -12327,7 +12428,7 @@ panels:
               / $denom))
             max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
               / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           L1 Access Latency:
             avg: AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum
               != 0) else None))
@@ -12363,17 +12464,17 @@ panels:
             avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom))
             min: MIN((TCP_TOTAL_ACCESSES_sum / $denom))
             max: MAX((TCP_TOTAL_ACCESSES_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req:
             avg: AVG((TCP_TOTAL_READ_sum / $denom))
             min: MIN((TCP_TOTAL_READ_sum / $denom))
             max: MAX((TCP_TOTAL_READ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write Req:
             avg: AVG((TCP_TOTAL_WRITE_sum / $denom))
             min: MIN((TCP_TOTAL_WRITE_sum / $denom))
             max: MAX((TCP_TOTAL_WRITE_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Atomic Req:
             avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
               / $denom))
@@ -12381,7 +12482,7 @@ panels:
               / $denom))
             max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
               / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Cache BW:
             avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
             min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
@@ -12405,7 +12506,7 @@ panels:
             avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
             min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
             max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Cache Hits:
             avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
               + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
@@ -12416,7 +12517,7 @@ panels:
             max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
               + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
               / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Invalidations:
             avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
             min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
@@ -12437,12 +12538,12 @@ panels:
             avg: AVG((TCP_TCC_READ_REQ_sum / $denom))
             min: MIN((TCP_TCC_READ_REQ_sum / $denom))
             max: MAX((TCP_TCC_READ_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           L1-L2 Write:
             avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom))
             min: MIN((TCP_TCC_WRITE_REQ_sum / $denom))
             max: MAX((TCP_TCC_WRITE_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           L1-L2 Atomic:
             avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
               / $denom))
@@ -12450,23 +12551,23 @@ panels:
               / $denom))
             max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
               / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
         gfx940:
           Total Req:
             avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom))
             min: MIN((TCP_TOTAL_ACCESSES_sum / $denom))
             max: MAX((TCP_TOTAL_ACCESSES_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req:
             avg: AVG((TCP_TOTAL_READ_sum / $denom))
             min: MIN((TCP_TOTAL_READ_sum / $denom))
             max: MAX((TCP_TOTAL_READ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write Req:
             avg: AVG((TCP_TOTAL_WRITE_sum / $denom))
             min: MIN((TCP_TOTAL_WRITE_sum / $denom))
             max: MAX((TCP_TOTAL_WRITE_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Atomic Req:
             avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
               / $denom))
@@ -12474,7 +12575,7 @@ panels:
               / $denom))
             max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
               / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Cache BW:
             avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
             min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
@@ -12498,7 +12599,7 @@ panels:
             avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
             min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
             max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Cache Hits:
             avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
               + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
@@ -12509,7 +12610,7 @@ panels:
             max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
               + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
               / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Invalidations:
             avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
             min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
@@ -12530,12 +12631,12 @@ panels:
             avg: AVG((TCP_TCC_READ_REQ_sum / $denom))
             min: MIN((TCP_TCC_READ_REQ_sum / $denom))
             max: MAX((TCP_TCC_READ_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           L1-L2 Write:
             avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom))
             min: MIN((TCP_TCC_WRITE_REQ_sum / $denom))
             max: MAX((TCP_TCC_WRITE_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           L1-L2 Atomic:
             avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
               / $denom))
@@ -12543,23 +12644,23 @@ panels:
               / $denom))
             max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
               / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
         gfx942:
           Total Req:
             avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom))
             min: MIN((TCP_TOTAL_ACCESSES_sum / $denom))
             max: MAX((TCP_TOTAL_ACCESSES_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req:
             avg: AVG((TCP_TOTAL_READ_sum / $denom))
             min: MIN((TCP_TOTAL_READ_sum / $denom))
             max: MAX((TCP_TOTAL_READ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write Req:
             avg: AVG((TCP_TOTAL_WRITE_sum / $denom))
             min: MIN((TCP_TOTAL_WRITE_sum / $denom))
             max: MAX((TCP_TOTAL_WRITE_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Atomic Req:
             avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
               / $denom))
@@ -12567,7 +12668,7 @@ panels:
               / $denom))
             max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
               / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Cache BW:
             avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
             min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
@@ -12591,7 +12692,7 @@ panels:
             avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
             min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
             max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Cache Hits:
             avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
               + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
@@ -12602,7 +12703,7 @@ panels:
             max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
               + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
               / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Invalidations:
             avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
             min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
@@ -12623,12 +12724,12 @@ panels:
             avg: AVG((TCP_TCC_READ_REQ_sum / $denom))
             min: MIN((TCP_TCC_READ_REQ_sum / $denom))
             max: MAX((TCP_TCC_READ_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           L1-L2 Write:
             avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom))
             min: MIN((TCP_TCC_WRITE_REQ_sum / $denom))
             max: MAX((TCP_TCC_WRITE_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           L1-L2 Atomic:
             avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
               / $denom))
@@ -12636,23 +12737,23 @@ panels:
               / $denom))
             max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
               / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
         gfx950:
           Total Req:
             avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom))
             min: MIN((TCP_TOTAL_ACCESSES_sum / $denom))
             max: MAX((TCP_TOTAL_ACCESSES_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req:
             avg: AVG((TCP_TOTAL_READ_sum / $denom))
             min: MIN((TCP_TOTAL_READ_sum / $denom))
             max: MAX((TCP_TOTAL_READ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write Req:
             avg: AVG((TCP_TOTAL_WRITE_sum / $denom))
             min: MIN((TCP_TOTAL_WRITE_sum / $denom))
             max: MAX((TCP_TOTAL_WRITE_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Atomic Req:
             avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
               / $denom))
@@ -12660,7 +12761,7 @@ panels:
               / $denom))
             max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
               / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Cache BW:
             avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
             min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
@@ -12684,7 +12785,7 @@ panels:
             avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
             min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
             max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Cache Hits:
             avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
               + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
@@ -12695,7 +12796,7 @@ panels:
             max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
               + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
               / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Invalidations:
             avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
             min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
@@ -12716,32 +12817,32 @@ panels:
             avg: AVG((TCP_TAGRAM0_REQ_sum / $denom))
             min: MIN((TCP_TAGRAM0_REQ_sum / $denom))
             max: MAX((TCP_TAGRAM0_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Tag RAM 1 Req:
             avg: AVG((TCP_TAGRAM1_REQ_sum / $denom))
             min: MIN((TCP_TAGRAM1_REQ_sum / $denom))
             max: MAX((TCP_TAGRAM1_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Tag RAM 2 Req:
             avg: AVG((TCP_TAGRAM2_REQ_sum / $denom))
             min: MIN((TCP_TAGRAM2_REQ_sum / $denom))
             max: MAX((TCP_TAGRAM2_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Tag RAM 3 Req:
             avg: AVG((TCP_TAGRAM3_REQ_sum / $denom))
             min: MIN((TCP_TAGRAM3_REQ_sum / $denom))
             max: MAX((TCP_TAGRAM3_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           L1-L2 Read:
             avg: AVG((TCP_TCC_READ_REQ_sum / $denom))
             min: MIN((TCP_TCC_READ_REQ_sum / $denom))
             max: MAX((TCP_TCC_READ_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           L1-L2 Write:
             avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom))
             min: MIN((TCP_TCC_WRITE_REQ_sum / $denom))
             max: MAX((TCP_TCC_WRITE_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           L1-L2 Atomic:
             avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
               / $denom))
@@ -12749,38 +12850,38 @@ panels:
               / $denom))
             max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
               / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           L1 Access Latency:
             avg: AVG((TCP_TCP_LATENCY_sum / $denom))
             min: MIN((TCP_TCP_LATENCY_sum / $denom))
             max: MAX((TCP_TCP_LATENCY_sum / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           L1-L2 Read Latency:
             avg: AVG((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
             min: MIN((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
             max: MAX((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
           L1-L2 Write Latency:
             avg: AVG((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
             min: MIN((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
             max: MAX((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
-            unit: (Cycles  + $normUnit)
+            unit: (Cycles + $normUnit)
         gfx908:
           Total Req:
             avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom))
             min: MIN((TCP_TOTAL_ACCESSES_sum / $denom))
             max: MAX((TCP_TOTAL_ACCESSES_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req:
             avg: AVG((TCP_TOTAL_READ_sum / $denom))
             min: MIN((TCP_TOTAL_READ_sum / $denom))
             max: MAX((TCP_TOTAL_READ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write Req:
             avg: AVG((TCP_TOTAL_WRITE_sum / $denom))
             min: MIN((TCP_TOTAL_WRITE_sum / $denom))
             max: MAX((TCP_TOTAL_WRITE_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Atomic Req:
             avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
               / $denom))
@@ -12788,7 +12889,7 @@ panels:
               / $denom))
             max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
               / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Cache BW:
             avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp)))
             min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp)))
@@ -12812,7 +12913,7 @@ panels:
             avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
             min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
             max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Cache Hits:
             avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
               + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
@@ -12823,7 +12924,7 @@ panels:
             max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
               + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
               / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Invalidations:
             avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
             min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
@@ -12841,12 +12942,12 @@ panels:
             avg: AVG((TCP_TCC_READ_REQ_sum / $denom))
             min: MIN((TCP_TCC_READ_REQ_sum / $denom))
             max: MAX((TCP_TCC_READ_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           L1-L2 Write:
             avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom))
             min: MIN((TCP_TCC_WRITE_REQ_sum / $denom))
             max: MAX((TCP_TCC_WRITE_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           L1-L2 Atomic:
             avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
               / $denom))
@@ -12854,7 +12955,7 @@ panels:
               / $denom))
             max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
               / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           L1 Access Latency:
             avg: AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum
               != 0) else None))
@@ -12904,84 +13005,84 @@ panels:
             avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom))
             min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom))
             max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           UC - Read:
             xfer: Read
             coherency: UC
             avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom))
             min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom))
             max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           CC - Read:
             xfer: Read
             coherency: CC
             avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom))
             min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom))
             max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           RW - Read:
             xfer: Read
             coherency: RW
             avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom))
             min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom))
             max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           RW - Write:
             xfer: Write
             coherency: RW
             avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom))
             min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom))
             max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           NC - Write:
             xfer: Write
             coherency: NC
             avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom))
             min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom))
             max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           UC - Write:
             xfer: Write
             coherency: UC
             avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom))
             min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom))
             max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           CC - Write:
             xfer: Write
             coherency: CC
             avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom))
             min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom))
             max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           NC - Atomic:
             xfer: Atomic
             coherency: NC
             avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
             min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
             max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           UC - Atomic:
             xfer: Atomic
             coherency: UC
             avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
             min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
             max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           CC - Atomic:
             xfer: Atomic
             coherency: CC
             avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
             min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
             max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           RW - Atomic:
             xfer: Atomic
             coherency: RW
             avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
             min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
             max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
         gfx941:
           NC - Read:
             xfer: Read
@@ -12989,84 +13090,84 @@ panels:
             avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom))
             min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom))
             max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           UC - Read:
             xfer: Read
             coherency: UC
             avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom))
             min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom))
             max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           CC - Read:
             xfer: Read
             coherency: CC
             avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom))
             min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom))
             max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           RW - Read:
             xfer: Read
             coherency: RW
             avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom))
             min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom))
             max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           RW - Write:
             xfer: Write
             coherency: RW
             avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom))
             min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom))
             max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           NC - Write:
             xfer: Write
             coherency: NC
             avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom))
             min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom))
             max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           UC - Write:
             xfer: Write
             coherency: UC
             avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom))
             min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom))
             max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           CC - Write:
             xfer: Write
             coherency: CC
             avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom))
             min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom))
             max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           NC - Atomic:
             xfer: Atomic
             coherency: NC
             avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
             min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
             max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           UC - Atomic:
             xfer: Atomic
             coherency: UC
             avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
             min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
             max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           CC - Atomic:
             xfer: Atomic
             coherency: CC
             avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
             min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
             max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           RW - Atomic:
             xfer: Atomic
             coherency: RW
             avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
             min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
             max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
         gfx940:
           NC - Read:
             xfer: Read
@@ -13074,84 +13175,84 @@ panels:
             avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom))
             min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom))
             max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           UC - Read:
             xfer: Read
             coherency: UC
             avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom))
             min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom))
             max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           CC - Read:
             xfer: Read
             coherency: CC
             avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom))
             min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom))
             max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           RW - Read:
             xfer: Read
             coherency: RW
             avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom))
             min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom))
             max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           RW - Write:
             xfer: Write
             coherency: RW
             avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom))
             min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom))
             max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           NC - Write:
             xfer: Write
             coherency: NC
             avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom))
             min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom))
             max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           UC - Write:
             xfer: Write
             coherency: UC
             avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom))
             min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom))
             max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           CC - Write:
             xfer: Write
             coherency: CC
             avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom))
             min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom))
             max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           NC - Atomic:
             xfer: Atomic
             coherency: NC
             avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
             min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
             max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           UC - Atomic:
             xfer: Atomic
             coherency: UC
             avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
             min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
             max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           CC - Atomic:
             xfer: Atomic
             coherency: CC
             avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
             min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
             max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           RW - Atomic:
             xfer: Atomic
             coherency: RW
             avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
             min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
             max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
         gfx942:
           NC - Read:
             xfer: Read
@@ -13159,84 +13260,84 @@ panels:
             avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom))
             min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom))
             max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           UC - Read:
             xfer: Read
             coherency: UC
             avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom))
             min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom))
             max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           CC - Read:
             xfer: Read
             coherency: CC
             avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom))
             min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom))
             max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           RW - Read:
             xfer: Read
             coherency: RW
             avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom))
             min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom))
             max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           RW - Write:
             xfer: Write
             coherency: RW
             avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom))
             min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom))
             max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           NC - Write:
             xfer: Write
             coherency: NC
             avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom))
             min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom))
             max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           UC - Write:
             xfer: Write
             coherency: UC
             avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom))
             min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom))
             max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           CC - Write:
             xfer: Write
             coherency: CC
             avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom))
             min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom))
             max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           NC - Atomic:
             xfer: Atomic
             coherency: NC
             avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
             min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
             max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           UC - Atomic:
             xfer: Atomic
             coherency: UC
             avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
             min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
             max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           CC - Atomic:
             xfer: Atomic
             coherency: CC
             avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
             min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
             max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           RW - Atomic:
             xfer: Atomic
             coherency: RW
             avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
             min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
             max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
         gfx950:
           NC - Read:
             xfer: Read
@@ -13244,84 +13345,84 @@ panels:
             avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom))
             min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom))
             max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           UC - Read:
             xfer: Read
             coherency: UC
             avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom))
             min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom))
             max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           CC - Read:
             xfer: Read
             coherency: CC
             avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom))
             min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom))
             max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           RW - Read:
             xfer: Read
             coherency: RW
             avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom))
             min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom))
             max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           RW - Write:
             xfer: Write
             coherency: RW
             avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom))
             min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom))
             max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           NC - Write:
             xfer: Write
             coherency: NC
             avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom))
             min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom))
             max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           UC - Write:
             xfer: Write
             coherency: UC
             avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom))
             min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom))
             max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           CC - Write:
             xfer: Write
             coherency: CC
             avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom))
             min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom))
             max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           NC - Atomic:
             xfer: Atomic
             coherency: NC
             avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
             min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
             max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           UC - Atomic:
             xfer: Atomic
             coherency: UC
             avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
             min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
             max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           CC - Atomic:
             xfer: Atomic
             coherency: CC
             avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
             min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
             max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           RW - Atomic:
             xfer: Atomic
             coherency: RW
             avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
             min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
             max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
         gfx908:
           NC - Read:
             xfer: Read
@@ -13329,84 +13430,84 @@ panels:
             avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom))
             min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom))
             max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           UC - Read:
             xfer: Read
             coherency: UC
             avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom))
             min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom))
             max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           CC - Read:
             xfer: Read
             coherency: CC
             avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom))
             min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom))
             max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           RW - Read:
             xfer: Read
             coherency: RW
             avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom))
             min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom))
             max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           RW - Write:
             xfer: Write
             coherency: RW
             avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom))
             min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom))
             max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           NC - Write:
             xfer: Write
             coherency: NC
             avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom))
             min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom))
             max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           UC - Write:
             xfer: Write
             coherency: UC
             avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom))
             min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom))
             max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           CC - Write:
             xfer: Write
             coherency: CC
             avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom))
             min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom))
             max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           NC - Atomic:
             xfer: Atomic
             coherency: NC
             avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
             min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
             max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           UC - Atomic:
             xfer: Atomic
             coherency: UC
             avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
             min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
             max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           CC - Atomic:
             xfer: Atomic
             coherency: CC
             avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
             min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
             max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           RW - Atomic:
             xfer: Atomic
             coherency: RW
             avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
             min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
             max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
   - metric_table:
       id: 1605
       title: L1 Unified Translation Cache (UTCL1)
@@ -13656,8 +13757,8 @@ panels:
     Hit rate:
       plain: The ratio of the number of vL1D cache line requests that hit in vL1D
         cache over the total number of cache line requests to the vL1D Cache RAM.
-      rst: The ratio of the number of vL1D cache line requests that hit [#vl1d-hit]_  in
-        vL1D cache over the total number of cache line requests to the  :ref:`vL1D
+      rst: The ratio of the number of vL1D cache line requests that hit [#vl1d-hit]_ in
+        vL1D cache over the total number of cache line requests to the :ref:`vL1D
         Cache RAM <desc-tc>`.
       unit: Percent
     Bandwidth Utilization:
@@ -13667,47 +13768,47 @@ panels:
         requested multiplied by the cache line size. This value does not consider
         partial requests, so for instance, if only a single value is requested in
         a cache line, the data movement will still be counted as a full cache line.
-      rst: The number of bytes looked up in the vL1D cache as a result of  :ref:`VMEM
-        <desc-vmem>` instructions, as a percent of the peak  theoretical bandwidth
-        achievable on the specific accelerator. The number  of bytes is calculated
-        as the number of cache lines requested multiplied  by the cache line size.
-        This value does not consider partial requests, so  for instance, if only a
-        single value is requested in a cache line, the  data movement will still be
+      rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
+        <desc-vmem>` instructions, as a percent of the peak theoretical bandwidth
+        achievable on the specific accelerator. The number of bytes is calculated
+        as the number of cache lines requested multiplied by the cache line size.
+        This value does not consider partial requests, so for instance, if only a
+        single value is requested in a cache line, the data movement will still be
         counted as a full cache line.
       unit: Percent
     Utilization:
       plain: Indicates how busy the vL1D Cache RAM was during the kernel execution.
         The number of cycles where the vL1D Cache RAM is actively processing any request
         divided by the number of cycles where the vL1D is active.
-      rst: Indicates how busy the :ref:`vL1D Cache RAM <desc-tc>` was during the  kernel
-        execution. The number of cycles where the vL1D Cache RAM is  actively processing
-        any request divided by the number of cycles where the  vL1D is active [#vl1d-activity]_.
+      rst: Indicates how busy the :ref:`vL1D Cache RAM <desc-tc>` was during the kernel
+        execution. The number of cycles where the vL1D Cache RAM is actively processing
+        any request divided by the number of cycles where the vL1D is active [#vl1d-activity]_.
       unit: Percent
     Coalescing:
       plain: Indicates how well memory instructions were coalesced by the address
         processing unit, ranging from uncoalesced (25%) to fully coalesced (100%).
         Calculated as the average number of thread-requests generated per instruction
         divided by the ideal number of thread-requests per instruction.
-      rst: Indicates how well memory instructions were coalesced by the  :ref:`address
-        processing unit <desc-ta>`, ranging from uncoalesced (25%)  to fully coalesced
-        (100%). Calculated as the average number of  :ref:`thread-requests <thread-requests>`
-        generated per instruction  divided by the ideal number of thread-requests
+      rst: Indicates how well memory instructions were coalesced by the :ref:`address
+        processing unit <desc-ta>`, ranging from uncoalesced (25%) to fully coalesced
+        (100%). Calculated as the average number of :ref:`thread-requests <thread-requests>`
+        generated per instruction divided by the ideal number of thread-requests
         per instruction.
       unit: Percent
     Stalled on L2 Data:
       plain: The ratio of the number of cycles where the vL1D is stalled waiting for
         requested data to return from the L2 cache divided by the number of cycles
         where the vL1D is active.
-      rst: The ratio of the number of cycles where the vL1D is stalled waiting for  requested
-        data to return from the :doc:`L2 cache <l2-cache>` divided by  the number
+      rst: The ratio of the number of cycles where the vL1D is stalled waiting for requested
+        data to return from the :doc:`L2 cache <l2-cache>` divided by the number
         of cycles where the vL1D is active [#vl1d-activity]_.
       unit: Percent
     Stalled on L2 Req:
       plain: The ratio of the number of cycles where the vL1D is stalled waiting to
         issue a request for data to the L2 cache divided by the number of cycles where
         the vL1D is active.
-      rst: The ratio of the number of cycles where the vL1D is stalled waiting to  issue
-        a request for data to the :doc:`L2 cache <l2-cache>` divided by the  number
+      rst: The ratio of the number of cycles where the vL1D is stalled waiting to issue
+        a request for data to the :doc:`L2 cache <l2-cache>` divided by the number
         of cycles where the vL1D is active [#vl1d-activity]_.
       unit: Percent
     Tag RAM Stall (Read):
@@ -13715,53 +13816,53 @@ panels:
         requests with conflicting tags being looked up concurrently, divided by the
         number of cycles where the vL1D is active.
       rst: The ratio of the number of cycles where the vL1D is stalled due to Read
-        requests with conflicting tags being looked up  concurrently, divided by the
-        number of cycles where the  vL1D is active [#vl1d-activity]_.
+        requests with conflicting tags being looked up concurrently, divided by the
+        number of cycles where the vL1D is active [#vl1d-activity]_.
       unit: Percent
     Tag RAM Stall (Write):
       plain: The ratio of the number of cycles where the vL1D is stalled due to Write
         requests with conflicting tags being looked up concurrently, divided by the
         number of cycles where the vL1D is active.
       rst: The ratio of the number of cycles where the vL1D is stalled due to Write
-        requests with conflicting tags being looked up  concurrently, divided by the
-        number of cycles where the  vL1D is active [#vl1d-activity]_.
+        requests with conflicting tags being looked up concurrently, divided by the
+        number of cycles where the vL1D is active [#vl1d-activity]_.
       unit: Percent
     Tag RAM Stall (Atomic):
       plain: The ratio of the number of cycles where the vL1D is stalled due to Atomic
         requests with conflicting tags being looked up concurrently, divided by the
         number of cycles where the vL1D is active.
       rst: The ratio of the number of cycles where the vL1D is stalled due to Atomic
-        requests with conflicting tags being looked up  concurrently, divided by the
-        number of cycles where the  vL1D is active [#vl1d-activity]_.
+        requests with conflicting tags being looked up concurrently, divided by the
+        number of cycles where the vL1D is active [#vl1d-activity]_.
       unit: Percent
     Total Req:
       plain: The total number of incoming requests from the address processing unit
         after coalescing.
-      rst: The total number of incoming requests from the  :ref:`address processing
+      rst: The total number of incoming requests from the :ref:`address processing
         unit <desc-ta>` after coalescing.
       unit: Requests
     Read Req:
       plain: The total number of incoming read requests from the address processing
         unit after coalescing per normalization unit.
-      rst: The total number of incoming read requests from the  :ref:`address processing
-        unit <desc-ta>` after coalescing per  :ref:`normalization unit <normalization-units>`
+      rst: The total number of incoming read requests from the :ref:`address processing
+        unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`.
       unit: Requests per normalization unit
     Write Req:
       plain: The total number of incoming write requests from the address processing
         unit after coalescing per normalization unit.
-      rst: The total number of incoming write requests from the  :ref:`address processing
-        unit <desc-ta>` after coalescing per  :ref:`normalization unit <normalization-units>`
+      rst: The total number of incoming write requests from the :ref:`address processing
+        unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`.
       unit: Requests per normalization unit
     Atomic Req:
       plain: The total number of incoming atomic requests from the address processing
         unit after coalescing per normalization unit.
-      rst: The total number of incoming atomic requests from the  :ref:`address processing
-        unit <desc-ta>` after coalescing per  :ref:`normalization unit <normalization-units>`
+      rst: The total number of incoming atomic requests from the :ref:`address processing
+        unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`.
       unit: Requests per normalization unit
     Cache BW:
       plain: The number of bytes looked up in the vL1D cache as a result of VMEM instructions
         divided by total duration. The number of bytes is calculated as the number of
-        cache lines requested multiplied by the cache line size.  This value does
+        cache lines requested multiplied by the cache line size. This value does
         not consider partial requests, so for instance, if only a single value is
         requested in a cache line, the data movement will still be counted as a full
         cache line.
@@ -13769,14 +13870,14 @@ panels:
         <desc-vmem>` instructions divided by total duration. The
         number of bytes is calculated as the number of cache lines requested multiplied
         by the cache line size. This value does not consider partial requests, so
-        for  instance, if only a single value is requested in a cache line, the data movement
+        for instance, if only a single value is requested in a cache line, the data movement
         will still be counted as a full cache line.
       unit: Gbps
     Cache Hit Rate:
       plain: The ratio of the number of vL1D cache line requests that hit in vL1D
         cache over the total number of cache line requests to the vL1D Cache RAM.
-      rst: The ratio of the number of vL1D cache line requests that hit in vL1D  cache
-        over the total number of cache line requests to the  :ref:`vL1D Cache RAM
+      rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache
+        over the total number of cache line requests to the :ref:`vL1D Cache RAM
         <desc-tc>`.
       unit: Percent
     Cache Accesses:
@@ -13787,17 +13888,17 @@ panels:
       plain: The number of cache accesses minus the number of outgoing requests to
         the L2 cache, that is, the number of cache line requests serviced by the vL1D
         Cache RAM per normalization unit.
-      rst: The number of cache accesses minus the number of outgoing requests to the  :doc:`L2
-        cache <l2-cache>`, that is, the number of cache line requests  serviced by
-        the :ref:`vL1D Cache RAM <desc-tc>` per  :ref:`normalization unit <normalization-units>`.
+      rst: The number of cache accesses minus the number of outgoing requests to the :doc:`L2
+        cache <l2-cache>`, that is, the number of cache line requests serviced by
+        the :ref:`vL1D Cache RAM <desc-tc>` per :ref:`normalization unit <normalization-units>`.
       unit: Cache lines per normalization unit
     Invalidations:
       plain: The number of times the vL1D was issued a write-back invalidate command
         during the kernel's execution per normalization unit. This may be triggered
         by, for instance, the buffer_wbinvl1 instruction.
-      rst: The number of times the vL1D was issued a write-back invalidate command  during
-        the kernel's execution per  :ref:`normalization unit <normalization-units>`.  This
-        may be triggered  by, for instance, the ``buffer_wbinvl1`` instruction.
+      rst: The number of times the vL1D was issued a write-back invalidate command during
+        the kernel's execution per :ref:`normalization unit <normalization-units>`. This
+        may be triggered by, for instance, the ``buffer_wbinvl1`` instruction.
       unit: Invalidations per normalization unit
     L1-L2 BW:
       plain: The number of bytes transferred across the vL1D-L2 interface as a result
@@ -13806,34 +13907,34 @@ panels:
         This value does not consider partial requests, so for instance, if only a
         single value is requested in a cache line, the data movement will still be
         counted as a full cache line.
-      rst: The number of bytes transferred across the vL1D-L2 interface as a result  of
+      rst: The number of bytes transferred across the vL1D-L2 interface as a result of
         :ref:`VMEM <desc-vmem>` instructions, divided by total duration.
-        The number of bytes is  calculated as the number of cache lines requested
-        multiplied by the cache  line size. This value does not consider partial requests,
-        so for  instance, if only a single value is requested in a cache line, the
-        data  movement will still be counted as a full cache line.
+        The number of bytes is calculated as the number of cache lines requested
+        multiplied by the cache line size. This value does not consider partial requests,
+        so for instance, if only a single value is requested in a cache line, the
+        data movement will still be counted as a full cache line.
       unit: Gbps
     L1-L2 Read:
       plain: The number of read requests for a vL1D cache line that were not satisfied
         by the vL1D and must be retrieved from the to the L2 Cache per normalization
         unit.
-      rst: The number of read requests for a vL1D cache line that were not satisfied  by
-        the vL1D and must be retrieved from the to the  :doc:`L2 Cache <l2-cache>`
-        per  :ref:`normalization unit <normalization-units>`.
+      rst: The number of read requests for a vL1D cache line that were not satisfied by
+        the vL1D and must be retrieved from the :doc:`L2 Cache <l2-cache>`
+        per :ref:`normalization unit <normalization-units>`.
       unit: Requests per normalization unit
     L1-L2 Write:
       plain: The number of write requests to a vL1D cache line that were sent through
         the vL1D to the L2 cache, per normalization unit.
-      rst: The number of write requests to a vL1D cache line that were sent through  the
-        vL1D to the :doc:`L2 cache <l2-cache>`, per  :ref:`normalization unit <normalization-units>`.
+      rst: The number of write requests to a vL1D cache line that were sent through the
+        vL1D to the :doc:`L2 cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`.
       unit: Requests per normalization unit
     L1-L2 Atomic:
       plain: The number of atomic requests that are sent through the vL1D to the L2
         cache, per normalization unit. This includes requests for atomics with, and
         without return.
-      rst: The number of atomic requests that are sent through the vL1D to the  :doc:`L2
-        cache <l2-cache>`, per  :ref:`normalization unit <normalization-units>`. This
-        includes requests  for atomics with, and without return.
+      rst: The number of atomic requests that are sent through the vL1D to the :doc:`L2
+        cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`. This
+        includes requests for atomics with, and without return.
       unit: Requests per normalization unit
     L1 Access Latency:
       plain: Calculated as the average number of cycles that a vL1D cache line request
@@ -13845,17 +13946,17 @@ panels:
       plain: Calculated as the average number of cycles that the vL1D cache took to
         issue and receive read requests from the L2 Cache. This number also includes
         requests for atomics with return values.
-      rst: Calculated as the average number of cycles that the vL1D cache took to  issue
-        and receive read requests from the :doc:`L2 Cache <l2-cache>`. This  number
+      rst: Calculated as the average number of cycles that the vL1D cache took to issue
+        and receive read requests from the :doc:`L2 Cache <l2-cache>`. This number
         also includes requests for atomics with return values.
       unit: Cycles
     L1-L2 Write Latency:
       plain: Calculated as the average number of cycles that the vL1D cache took to
         issue and receive acknowledgement of a write request to the L2 Cache. This
         number also includes requests for atomics without return values.
-      rst: Calculated as the average number of cycles that the vL1D cache took to  issue
-        and receive acknowledgement of a write request to the  :doc:`L2 Cache <l2-cache>`.
-        This number also includes requests for  atomics without return values.
+      rst: Calculated as the average number of cycles that the vL1D cache took to issue
+        and receive acknowledgement of a write request to the :doc:`L2 Cache <l2-cache>`.
+        This number also includes requests for atomics without return values.
       unit: Cycles
     NC - Read:
       plain: Total read requests with NC mtype from this TCP to all TCCs Sum over
@@ -13878,7 +13979,8 @@ panels:
     RW - Read:
       plain: Total read requests with RW mtype from this TCP to all TCCs Sum over
         TCP instances per normalization unit.
-      rst: ''
+      rst: Total read requests with RW mtype from this TCP to all TCCs Sum over
+        TCP instances per normalization unit.
       unit: Requests per normalization unit
     RW - Write:
       plain: Total write requests with RW mtype from this TCP to all TCCs Sum over
@@ -13948,18 +14050,20 @@ panels:
       unit: Requests per normalization unit
     Translation Misses:
       plain: The total number of translation requests that missed in the UTCL1 due
-        to  translation not being present in the cache, per normalization unit.
-      rst: The total number of translation requests that missed in the UTCL1 due to  translation
-        not being present in the cache, per  :ref:`normalization unit <normalization-units>`.
+        to translation not being present in the cache, per normalization unit.
+      rst: The total number of translation requests that missed in the UTCL1 due to translation
+        not being present in the cache, per :ref:`normalization unit <normalization-units>`.
       unit: unit
     Permission Misses:
-      plain: "The total number of translation requests that missed in the UTCL1 due\
-        \ to a permission error, per normalization unit. This is unused and expected\
-        \ to be zero in most configurations for modern CDNA\u2122 accelerators."
-      rst: "The total number of translation requests that missed in the UTCL1 due\
-        \ to  a permission error, per :ref:`normalization unit <normalization-units>`.\
-        \  This is unused and expected to be zero in most configurations for modern\
-        \  CDNA\u2122 accelerators."
+      plain: >-
+        The total number of translation requests that missed in the UTCL1 due
+        to a permission error, per normalization unit. This is unused and expected
+        to be zero in most configurations for modern CDNA™ accelerators.
+      rst: >-
+        The total number of translation requests that missed in the UTCL1 due
+        to a permission error, per :ref:`normalization unit <normalization-units>`.
+        This is unused and expected to be zero in most configurations for modern
+        CDNA™ accelerators.
       unit: Requests per normalization unit
 - id: 1700
   title: L2 Cache
@@ -14161,7 +14265,7 @@ panels:
               * 32)) / $denom))
             max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
               * 32)) / $denom))
-            unit: (Bytes  + $normUnit)
+            unit: (Bytes + $normUnit)
           HBM Write and Atomic Traffic:
             avg: AVG((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
               != 0) else None))
@@ -14633,7 +14737,7 @@ panels:
               * 64)) / $denom))
             max: MAX((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
               * 64)) / $denom))
-            unit: (Bytes  + $normUnit)
+            unit: (Bytes + $normUnit)
           HBM Read Traffic:
             avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
               != 0) else None))
@@ -14665,7 +14769,7 @@ panels:
               * 32)) / $denom))
             max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
               * 32)) / $denom))
-            unit: (Bytes  + $normUnit)
+            unit: (Bytes + $normUnit)
           HBM Write and Atomic Traffic:
             avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
               != 0) else None))
@@ -14742,32 +14846,32 @@ panels:
             avg: AVG((TCC_REQ_sum / $denom))
             min: MIN((TCC_REQ_sum / $denom))
             max: MAX((TCC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req:
             avg: AVG((TCC_READ_sum / $denom))
             min: MIN((TCC_READ_sum / $denom))
             max: MAX((TCC_READ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write Req:
             avg: AVG((TCC_WRITE_sum / $denom))
             min: MIN((TCC_WRITE_sum / $denom))
             max: MAX((TCC_WRITE_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Atomic Req:
             avg: AVG((TCC_ATOMIC_sum / $denom))
             min: MIN((TCC_ATOMIC_sum / $denom))
             max: MAX((TCC_ATOMIC_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Streaming Req:
             avg: AVG((TCC_STREAMING_REQ_sum / $denom))
             min: MIN((TCC_STREAMING_REQ_sum / $denom))
             max: MAX((TCC_STREAMING_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Probe Req:
             avg: AVG((TCC_PROBE_sum / $denom))
             min: MIN((TCC_PROBE_sum / $denom))
             max: MAX((TCC_PROBE_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Cache Hit:
             avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
               + TCC_MISS_sum) != 0) else None))
@@ -14780,17 +14884,17 @@ panels:
             avg: AVG((TCC_HIT_sum / $denom))
             min: MIN((TCC_HIT_sum / $denom))
             max: MAX((TCC_HIT_sum / $denom))
-            unit: (Hits  + $normUnit)
+            unit: (Hits + $normUnit)
           Misses:
             avg: AVG((TCC_MISS_sum / $denom))
             min: MIN((TCC_MISS_sum / $denom))
             max: MAX((TCC_MISS_sum / $denom))
-            unit: (Misses  + $normUnit)
+            unit: (Misses + $normUnit)
           Writeback:
             avg: AVG((TCC_WRITEBACK_sum / $denom))
             min: MIN((TCC_WRITEBACK_sum / $denom))
             max: MAX((TCC_WRITEBACK_sum / $denom))
-            unit: (Cachelines  + $normUnit)
+            unit: (Cachelines + $normUnit)
           Writeback (Internal):
             avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom))
             min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom))
@@ -14815,22 +14919,22 @@ panels:
             avg: AVG((TCC_NC_REQ_sum / $denom))
             min: MIN((TCC_NC_REQ_sum / $denom))
             max: MAX((TCC_NC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           UC Req:
             avg: AVG((TCC_UC_REQ_sum / $denom))
             min: MIN((TCC_UC_REQ_sum / $denom))
             max: MAX((TCC_UC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           CC Req:
             avg: AVG((TCC_CC_REQ_sum / $denom))
             min: MIN((TCC_CC_REQ_sum / $denom))
             max: MAX((TCC_CC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           RW Req:
             avg: AVG((TCC_RW_REQ_sum / $denom))
             min: MIN((TCC_RW_REQ_sum / $denom))
             max: MAX((TCC_RW_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
         gfx941:
           Bandwidth:
             avg: AVG((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
@@ -14841,32 +14945,32 @@ panels:
             avg: AVG((TCC_REQ_sum / $denom))
             min: MIN((TCC_REQ_sum / $denom))
             max: MAX((TCC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req:
             avg: AVG((TCC_READ_sum / $denom))
             min: MIN((TCC_READ_sum / $denom))
             max: MAX((TCC_READ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write Req:
             avg: AVG((TCC_WRITE_sum / $denom))
             min: MIN((TCC_WRITE_sum / $denom))
             max: MAX((TCC_WRITE_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Atomic Req:
             avg: AVG((TCC_ATOMIC_sum / $denom))
             min: MIN((TCC_ATOMIC_sum / $denom))
             max: MAX((TCC_ATOMIC_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Streaming Req:
             avg: AVG((TCC_STREAMING_REQ_sum / $denom))
             min: MIN((TCC_STREAMING_REQ_sum / $denom))
             max: MAX((TCC_STREAMING_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Probe Req:
             avg: AVG((TCC_PROBE_sum / $denom))
             min: MIN((TCC_PROBE_sum / $denom))
             max: MAX((TCC_PROBE_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Cache Hit:
             avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
               + TCC_MISS_sum) != 0) else None))
@@ -14879,17 +14983,17 @@ panels:
             avg: AVG((TCC_HIT_sum / $denom))
             min: MIN((TCC_HIT_sum / $denom))
             max: MAX((TCC_HIT_sum / $denom))
-            unit: (Hits  + $normUnit)
+            unit: (Hits + $normUnit)
           Misses:
             avg: AVG((TCC_MISS_sum / $denom))
             min: MIN((TCC_MISS_sum / $denom))
             max: MAX((TCC_MISS_sum / $denom))
-            unit: (Misses  + $normUnit)
+            unit: (Misses + $normUnit)
           Writeback:
             avg: AVG((TCC_WRITEBACK_sum / $denom))
             min: MIN((TCC_WRITEBACK_sum / $denom))
             max: MAX((TCC_WRITEBACK_sum / $denom))
-            unit: (Cachelines  + $normUnit)
+            unit: (Cachelines + $normUnit)
           Writeback (Internal):
             avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom))
             min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom))
@@ -14914,22 +15018,22 @@ panels:
             avg: AVG((TCC_NC_REQ_sum / $denom))
             min: MIN((TCC_NC_REQ_sum / $denom))
             max: MAX((TCC_NC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           UC Req:
             avg: AVG((TCC_UC_REQ_sum / $denom))
             min: MIN((TCC_UC_REQ_sum / $denom))
             max: MAX((TCC_UC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           CC Req:
             avg: AVG((TCC_CC_REQ_sum / $denom))
             min: MIN((TCC_CC_REQ_sum / $denom))
             max: MAX((TCC_CC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           RW Req:
             avg: AVG((TCC_RW_REQ_sum / $denom))
             min: MIN((TCC_RW_REQ_sum / $denom))
             max: MAX((TCC_RW_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
         gfx940:
           Bandwidth:
             avg: AVG((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
@@ -14940,32 +15044,32 @@ panels:
             avg: AVG((TCC_REQ_sum / $denom))
             min: MIN((TCC_REQ_sum / $denom))
             max: MAX((TCC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req:
             avg: AVG((TCC_READ_sum / $denom))
             min: MIN((TCC_READ_sum / $denom))
             max: MAX((TCC_READ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write Req:
             avg: AVG((TCC_WRITE_sum / $denom))
             min: MIN((TCC_WRITE_sum / $denom))
             max: MAX((TCC_WRITE_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Atomic Req:
             avg: AVG((TCC_ATOMIC_sum / $denom))
             min: MIN((TCC_ATOMIC_sum / $denom))
             max: MAX((TCC_ATOMIC_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Streaming Req:
             avg: AVG((TCC_STREAMING_REQ_sum / $denom))
             min: MIN((TCC_STREAMING_REQ_sum / $denom))
             max: MAX((TCC_STREAMING_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Probe Req:
             avg: AVG((TCC_PROBE_sum / $denom))
             min: MIN((TCC_PROBE_sum / $denom))
             max: MAX((TCC_PROBE_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Cache Hit:
             avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
               + TCC_MISS_sum) != 0) else None))
@@ -14978,17 +15082,17 @@ panels:
             avg: AVG((TCC_HIT_sum / $denom))
             min: MIN((TCC_HIT_sum / $denom))
             max: MAX((TCC_HIT_sum / $denom))
-            unit: (Hits  + $normUnit)
+            unit: (Hits + $normUnit)
           Misses:
             avg: AVG((TCC_MISS_sum / $denom))
             min: MIN((TCC_MISS_sum / $denom))
             max: MAX((TCC_MISS_sum / $denom))
-            unit: (Misses  + $normUnit)
+            unit: (Misses + $normUnit)
           Writeback:
             avg: AVG((TCC_WRITEBACK_sum / $denom))
             min: MIN((TCC_WRITEBACK_sum / $denom))
             max: MAX((TCC_WRITEBACK_sum / $denom))
-            unit: (Cachelines  + $normUnit)
+            unit: (Cachelines + $normUnit)
           Writeback (Internal):
             avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom))
             min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom))
@@ -15013,22 +15117,22 @@ panels:
             avg: AVG((TCC_NC_REQ_sum / $denom))
             min: MIN((TCC_NC_REQ_sum / $denom))
             max: MAX((TCC_NC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           UC Req:
             avg: AVG((TCC_UC_REQ_sum / $denom))
             min: MIN((TCC_UC_REQ_sum / $denom))
             max: MAX((TCC_UC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           CC Req:
             avg: AVG((TCC_CC_REQ_sum / $denom))
             min: MIN((TCC_CC_REQ_sum / $denom))
             max: MAX((TCC_CC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           RW Req:
             avg: AVG((TCC_RW_REQ_sum / $denom))
             min: MIN((TCC_RW_REQ_sum / $denom))
             max: MAX((TCC_RW_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
         gfx942:
           Bandwidth:
             avg: AVG((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
@@ -15039,32 +15143,32 @@ panels:
             avg: AVG((TCC_REQ_sum / $denom))
             min: MIN((TCC_REQ_sum / $denom))
             max: MAX((TCC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req:
             avg: AVG((TCC_READ_sum / $denom))
             min: MIN((TCC_READ_sum / $denom))
             max: MAX((TCC_READ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write Req:
             avg: AVG((TCC_WRITE_sum / $denom))
             min: MIN((TCC_WRITE_sum / $denom))
             max: MAX((TCC_WRITE_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Atomic Req:
             avg: AVG((TCC_ATOMIC_sum / $denom))
             min: MIN((TCC_ATOMIC_sum / $denom))
             max: MAX((TCC_ATOMIC_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Streaming Req:
             avg: AVG((TCC_STREAMING_REQ_sum / $denom))
             min: MIN((TCC_STREAMING_REQ_sum / $denom))
             max: MAX((TCC_STREAMING_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Probe Req:
             avg: AVG((TCC_PROBE_sum / $denom))
             min: MIN((TCC_PROBE_sum / $denom))
             max: MAX((TCC_PROBE_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Cache Hit:
             avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
               + TCC_MISS_sum) != 0) else None))
@@ -15077,17 +15181,17 @@ panels:
             avg: AVG((TCC_HIT_sum / $denom))
             min: MIN((TCC_HIT_sum / $denom))
             max: MAX((TCC_HIT_sum / $denom))
-            unit: (Hits  + $normUnit)
+            unit: (Hits + $normUnit)
           Misses:
             avg: AVG((TCC_MISS_sum / $denom))
             min: MIN((TCC_MISS_sum / $denom))
             max: MAX((TCC_MISS_sum / $denom))
-            unit: (Misses  + $normUnit)
+            unit: (Misses + $normUnit)
           Writeback:
             avg: AVG((TCC_WRITEBACK_sum / $denom))
             min: MIN((TCC_WRITEBACK_sum / $denom))
             max: MAX((TCC_WRITEBACK_sum / $denom))
-            unit: (Cachelines  + $normUnit)
+            unit: (Cachelines + $normUnit)
           Writeback (Internal):
             avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom))
             min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom))
@@ -15112,22 +15216,22 @@ panels:
             avg: AVG((TCC_NC_REQ_sum / $denom))
             min: MIN((TCC_NC_REQ_sum / $denom))
             max: MAX((TCC_NC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           UC Req:
             avg: AVG((TCC_UC_REQ_sum / $denom))
             min: MIN((TCC_UC_REQ_sum / $denom))
             max: MAX((TCC_UC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           CC Req:
             avg: AVG((TCC_CC_REQ_sum / $denom))
             min: MIN((TCC_CC_REQ_sum / $denom))
             max: MAX((TCC_CC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           RW Req:
             avg: AVG((TCC_RW_REQ_sum / $denom))
             min: MIN((TCC_RW_REQ_sum / $denom))
             max: MAX((TCC_RW_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
         gfx950:
           Bandwidth:
             avg: AVG((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
@@ -15153,42 +15257,42 @@ panels:
             avg: AVG((TCC_REQ_sum / $denom))
             min: MIN((TCC_REQ_sum / $denom))
             max: MAX((TCC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req:
             avg: AVG((TCC_READ_sum / $denom))
             min: MIN((TCC_READ_sum / $denom))
             max: MAX((TCC_READ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write Req:
             avg: AVG((TCC_WRITE_sum / $denom))
             min: MIN((TCC_WRITE_sum / $denom))
             max: MAX((TCC_WRITE_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Atomic Req:
             avg: AVG((TCC_ATOMIC_sum / $denom))
             min: MIN((TCC_ATOMIC_sum / $denom))
             max: MAX((TCC_ATOMIC_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Streaming Req:
             avg: AVG((TCC_STREAMING_REQ_sum / $denom))
             min: MIN((TCC_STREAMING_REQ_sum / $denom))
             max: MAX((TCC_STREAMING_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Bypasss Req:
             avg: AVG((TCC_BYPASS_REQ_sum / $denom))
             min: MIN((TCC_BYPASS_REQ_sum / $denom))
             max: MAX((TCC_BYPASS_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Probe Req:
             avg: AVG((TCC_PROBE_sum / $denom))
             min: MIN((TCC_PROBE_sum / $denom))
             max: MAX((TCC_PROBE_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Input Buffer Req:
             avg: AVG((TCC_IB_REQ_sum / $denom))
             min: MIN((TCC_IB_REQ_sum / $denom))
             max: MAX((TCC_IB_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Cache Hit:
             avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
               + TCC_MISS_sum) != 0) else None))
@@ -15201,17 +15305,17 @@ panels:
             avg: AVG((TCC_HIT_sum / $denom))
             min: MIN((TCC_HIT_sum / $denom))
             max: MAX((TCC_HIT_sum / $denom))
-            unit: (Hits  + $normUnit)
+            unit: (Hits + $normUnit)
           Misses:
             avg: AVG((TCC_MISS_sum / $denom))
             min: MIN((TCC_MISS_sum / $denom))
             max: MAX((TCC_MISS_sum / $denom))
-            unit: (Misses  + $normUnit)
+            unit: (Misses + $normUnit)
           Writeback:
             avg: AVG((TCC_WRITEBACK_sum / $denom))
             min: MIN((TCC_WRITEBACK_sum / $denom))
             max: MAX((TCC_WRITEBACK_sum / $denom))
-            unit: (Cachelines  + $normUnit)
+            unit: (Cachelines + $normUnit)
           Writeback (Internal):
             avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom))
             min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom))
@@ -15236,22 +15340,22 @@ panels:
             avg: AVG((TCC_NC_REQ_sum / $denom))
             min: MIN((TCC_NC_REQ_sum / $denom))
             max: MAX((TCC_NC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           UC Req:
             avg: AVG((TCC_UC_REQ_sum / $denom))
             min: MIN((TCC_UC_REQ_sum / $denom))
             max: MAX((TCC_UC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           CC Req:
             avg: AVG((TCC_CC_REQ_sum / $denom))
             min: MIN((TCC_CC_REQ_sum / $denom))
             max: MAX((TCC_CC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           RW Req:
             avg: AVG((TCC_RW_REQ_sum / $denom))
             min: MIN((TCC_RW_REQ_sum / $denom))
             max: MAX((TCC_RW_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
         gfx908:
           Bandwidth:
             avg: AVG((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp))
@@ -15262,32 +15366,32 @@ panels:
             avg: AVG((TCC_REQ_sum / $denom))
             min: MIN((TCC_REQ_sum / $denom))
             max: MAX((TCC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Req:
             avg: AVG((TCC_READ_sum / $denom))
             min: MIN((TCC_READ_sum / $denom))
             max: MAX((TCC_READ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write Req:
             avg: AVG((TCC_WRITE_sum / $denom))
             min: MIN((TCC_WRITE_sum / $denom))
             max: MAX((TCC_WRITE_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Atomic Req:
             avg: AVG((TCC_ATOMIC_sum / $denom))
             min: MIN((TCC_ATOMIC_sum / $denom))
             max: MAX((TCC_ATOMIC_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Streaming Req:
             avg: AVG((TCC_STREAMING_REQ_sum / $denom))
             min: MIN((TCC_STREAMING_REQ_sum / $denom))
             max: MAX((TCC_STREAMING_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Probe Req:
             avg: AVG((TCC_PROBE_sum / $denom))
             min: MIN((TCC_PROBE_sum / $denom))
             max: MAX((TCC_PROBE_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Cache Hit:
             avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
               + TCC_MISS_sum) != 0) else None))
@@ -15300,12 +15404,12 @@ panels:
             avg: AVG((TCC_HIT_sum / $denom))
             min: MIN((TCC_HIT_sum / $denom))
             max: MAX((TCC_HIT_sum / $denom))
-            unit: (Hits  + $normUnit)
+            unit: (Hits + $normUnit)
           Misses:
             avg: AVG((TCC_MISS_sum / $denom))
             min: MIN((TCC_MISS_sum / $denom))
             max: MAX((TCC_MISS_sum / $denom))
-            unit: (Misses  + $normUnit)
+            unit: (Misses + $normUnit)
           Writeback:
             avg: AVG((TCC_WRITEBACK_sum / $denom))
             min: MIN((TCC_WRITEBACK_sum / $denom))
@@ -15335,22 +15439,22 @@ panels:
             avg: AVG((TCC_NC_REQ_sum / $denom))
             min: MIN((TCC_NC_REQ_sum / $denom))
             max: MAX((TCC_NC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           UC Req:
             avg: AVG((TCC_UC_REQ_sum / $denom))
             min: MIN((TCC_UC_REQ_sum / $denom))
             max: MAX((TCC_UC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           CC Req:
             avg: AVG((TCC_CC_REQ_sum / $denom))
             min: MIN((TCC_CC_REQ_sum / $denom))
             max: MAX((TCC_CC_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           RW Req:
             avg: AVG((TCC_RW_REQ_sum / $denom))
             min: MIN((TCC_RW_REQ_sum / $denom))
             max: MAX((TCC_RW_REQ_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
   - metric_table:
       id: 1704
       title: L2 Cache Stalls
@@ -15537,175 +15641,175 @@ panels:
             avg: AVG((TCC_EA_RDREQ_32B_sum / $denom))
             min: MIN((TCC_EA_RDREQ_32B_sum / $denom))
             max: MAX((TCC_EA_RDREQ_32B_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read (64B):
             avg: AVG(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom))
             min: MIN(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom))
             max: MAX(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read (Uncached):
             avg: AVG((TCC_EA_RD_UNCACHED_32B_sum / $denom))
             min: MIN((TCC_EA_RD_UNCACHED_32B_sum / $denom))
             max: MAX((TCC_EA_RD_UNCACHED_32B_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           HBM Read:
             avg: AVG((TCC_EA_RDREQ_DRAM_sum / $denom))
             min: MIN((TCC_EA_RDREQ_DRAM_sum / $denom))
             max: MAX((TCC_EA_RDREQ_DRAM_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Remote Read:
             avg: AVG((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom))
             min: MIN((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom))
             max: MAX((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write and Atomic (32B):
             avg: AVG(MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom), 0))
             min: MIN(MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom), 0))
             max: MAX(MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom), 0))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write and Atomic (Uncached):
             avg: AVG((TCC_EA_WR_UNCACHED_32B_sum / $denom))
             min: MIN((TCC_EA_WR_UNCACHED_32B_sum / $denom))
             max: MAX((TCC_EA_WR_UNCACHED_32B_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write and Atomic (64B):
             avg: AVG((TCC_EA_WRREQ_64B_sum / $denom))
             min: MIN((TCC_EA_WRREQ_64B_sum / $denom))
             max: MAX((TCC_EA_WRREQ_64B_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           HBM Write and Atomic:
             avg: AVG((TCC_EA_WRREQ_DRAM_sum / $denom))
             min: MIN((TCC_EA_WRREQ_DRAM_sum / $denom))
             max: MAX((TCC_EA_WRREQ_DRAM_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Remote Write and Atomic:
             avg: AVG((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom))
             min: MIN((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom))
             max: MAX((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Atomic:
             avg: AVG((TCC_EA_ATOMIC_sum / $denom))
             min: MIN((TCC_EA_ATOMIC_sum / $denom))
             max: MAX((TCC_EA_ATOMIC_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
         gfx941:
           Read (32B):
             avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom))
             min: MIN((TCC_EA0_RDREQ_32B_sum / $denom))
             max: MAX((TCC_EA0_RDREQ_32B_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read (64B):
             avg: AVG(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
             min: MIN(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
             max: MAX(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read (Uncached):
             avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
             min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
             max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           HBM Read:
             avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom))
             min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom))
             max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Remote Read:
             avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
             min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
             max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write and Atomic (32B):
             avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
             min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
             max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write and Atomic (Uncached):
             avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
             min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
             max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write and Atomic (64B):
             avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom))
             min: MIN((TCC_EA0_WRREQ_64B_sum / $denom))
             max: MAX((TCC_EA0_WRREQ_64B_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           HBM Write and Atomic:
             avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom))
             min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom))
             max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Remote Write and Atomic:
             avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
             min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
             max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Atomic:
             avg: AVG((TCC_EA0_ATOMIC_sum / $denom))
             min: MIN((TCC_EA0_ATOMIC_sum / $denom))
             max: MAX((TCC_EA0_ATOMIC_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
         gfx940:
           Read (32B):
             avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom))
             min: MIN((TCC_EA0_RDREQ_32B_sum / $denom))
             max: MAX((TCC_EA0_RDREQ_32B_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read (64B):
             avg: AVG(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
             min: MIN(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
             max: MAX(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read (Uncached):
             avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
             min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
             max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           HBM Read:
             avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom))
             min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom))
             max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Remote Read:
             avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
             min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
             max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write and Atomic (32B):
             avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
             min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
             max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write and Atomic (Uncached):
             avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
             min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
             max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write and Atomic (64B):
             avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom))
             min: MIN((TCC_EA0_WRREQ_64B_sum / $denom))
             max: MAX((TCC_EA0_WRREQ_64B_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           HBM Write and Atomic:
             avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom))
             min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom))
             max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Remote Write and Atomic:
             avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
             min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
             max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Atomic:
             avg: AVG((TCC_EA0_ATOMIC_sum / $denom))
             min: MIN((TCC_EA0_ATOMIC_sum / $denom))
             max: MAX((TCC_EA0_ATOMIC_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
         gfx942:
           Read (32B):
             avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom))
             min: MIN((TCC_EA0_RDREQ_32B_sum / $denom))
             max: MAX((TCC_EA0_RDREQ_32B_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read (64B):
             avg: AVG(MAX(((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) /
               $denom), 0))
@@ -15713,88 +15817,88 @@ panels:
               $denom), 0))
             max: MAX(MAX(((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) /
               $denom), 0))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read (128B):
             avg: AVG(((TCC_BUBBLE_sum) / $denom))
             min: MIN(((TCC_BUBBLE_sum) / $denom))
             max: MAX(((TCC_BUBBLE_sum) / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read (Uncached):
             avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
             min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
             max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           HBM Read:
             avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom))
             min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom))
             max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Remote Read:
             avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
             min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
             max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write and Atomic (32B):
             avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
             min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
             max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write and Atomic (Uncached):
             avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
             min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
             max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write and Atomic (64B):
             avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom))
             min: MIN((TCC_EA0_WRREQ_64B_sum / $denom))
             max: MAX((TCC_EA0_WRREQ_64B_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           HBM Write and Atomic:
             avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom))
             min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom))
             max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Remote Write and Atomic:
             avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
             min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
             max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Atomic:
             avg: AVG((TCC_EA0_ATOMIC_sum / $denom))
             min: MIN((TCC_EA0_ATOMIC_sum / $denom))
             max: MAX((TCC_EA0_ATOMIC_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
         gfx950:
           Read (32B):
             avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom))
             min: MIN((TCC_EA0_RDREQ_32B_sum / $denom))
             max: MAX((TCC_EA0_RDREQ_32B_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read (64B):
             avg: AVG((TCC_EA0_RDREQ_64B_sum / $denom))
             min: MIN((TCC_EA0_RDREQ_64B_sum / $denom))
             max: MAX((TCC_EA0_RDREQ_64B_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read (128B):
             avg: AVG((TCC_EA0_RDREQ_128B_sum / $denom))
             min: MIN((TCC_EA0_RDREQ_128B_sum / $denom))
             max: MAX((TCC_EA0_RDREQ_128B_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read (Uncached):
             avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
             min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
             max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           HBM Read:
             avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom))
             min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom))
             max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Remote Read:
             avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
             min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
             max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read Bandwidth - PCIe:
             avg: AVG(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
             min: MIN(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
@@ -15802,39 +15906,39 @@ panels:
             unit: Gbps
           "Read Bandwidth - Infinity Fabric\u2122":
             avg: AVG(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-            min: MIN(TCC_EA0_RDREQ_GMI_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
-            max: MAX(TCC_EA0_RDREQ_GMI_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
+            min: MIN(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+            max: MAX(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
             unit: Gbps
           Read Bandwidth - HBM:
-            avg: AVG(TCC_EA0_RDREQ_DRAM_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
-            min: MIN(TCC_EA0_RDREQ_DRAM_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
-            max: MAX(TCC_EA0_RDREQ_DRAM_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
+            avg: AVG(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+            min: MIN(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+            max: MAX(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
             unit: Gbps
           Write and Atomic (32B):
             avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
             min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
             max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write and Atomic (Uncached):
             avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
             min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
             max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write and Atomic (64B):
             avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom))
             min: MIN((TCC_EA0_WRREQ_64B_sum / $denom))
             max: MAX((TCC_EA0_WRREQ_64B_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           HBM Write and Atomic:
             avg: AVG((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
             min: MIN((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
             max: MAX((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Remote Write and Atomic:
             avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
             min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
             max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write Bandwidth - PCIe:
             avg: AVG(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
             min: MIN(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
@@ -15842,101 +15946,101 @@ panels:
             unit: Gbps
           "Write Bandwidth - Infinity Fabric\u2122":
             avg: AVG(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-            min: MIN(TCC_EA0_WRREQ_WRITE_GMI_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
-            max: MAX(TCC_EA0_WRREQ_WRITE_GMI_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
+            min: MIN(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+            max: MAX(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
             unit: Gbps
           Write Bandwidth - HBM:
-            avg: AVG(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
-            min: MIN(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
-            max: MAX(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
+            avg: AVG(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+            min: MIN(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+            max: MAX(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
             unit: Gbps
           Atomic:
             avg: AVG((TCC_EA0_ATOMIC_sum / $denom))
             min: MIN((TCC_EA0_ATOMIC_sum / $denom))
             max: MAX((TCC_EA0_ATOMIC_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Atomic - HBM:
             avg: AVG((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom))
             min: MIN((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom))
             max: MAX((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Atomic Bandwidth - PCIe:
-            avg: AVG(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
-            min: MIN(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
-            max: MAX(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
+            avg: AVG(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+            min: MIN(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+            max: MAX(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
             unit: Gbps
           "Atomic Bandwidth - Infinity Fabric\u2122":
-            avg: AVG(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
-            min: MIN(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
-            max: MAX(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
+            avg: AVG(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+            min: MIN(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+            max: MAX(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
             unit: Gbps
           Atomic Bandwidth - HBM:
-            avg: AVG(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
-            min: MIN(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
-            max: MAX(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum  * 32/ (End_Timestamp - Start_Timestamp))
+            avg: AVG(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+            min: MIN(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
+            max: MAX(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
             unit: Gbps
         gfx908:
           Read (32B):
             avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom))
             min: MIN((TCC_EA0_RDREQ_32B_sum / $denom))
             max: MAX((TCC_EA0_RDREQ_32B_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read (64B):
             avg: AVG(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
             min: MIN(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
             max: MAX(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Read (Uncached):
             avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
             min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
             max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           HBM Read:
             avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom))
             min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom))
             max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Remote Read:
             avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
             min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
             max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write and Atomic (32B):
             avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
             min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
             max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write and Atomic (Uncached):
             avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
             min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
             max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Write and Atomic (64B):
             avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom))
             min: MIN((TCC_EA0_WRREQ_64B_sum / $denom))
             max: MAX((TCC_EA0_WRREQ_64B_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           HBM Write and Atomic:
             avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom))
             min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom))
             max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Remote Write and Atomic:
             avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
             min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
             max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
           Atomic:
             avg: AVG((TCC_EA0_ATOMIC_sum / $denom))
             min: MIN((TCC_EA0_ATOMIC_sum / $denom))
             max: MAX((TCC_EA0_ATOMIC_sum / $denom))
-            unit: (Req  + $normUnit)
+            unit: (Req + $normUnit)
   metrics_description:
     Utilization:
       plain: The ratio of the number of cycles an L2 channel was active, summed over
         all L2 channels on the accelerator over the total L2 cycles.
-      rst: The ratio of the  :ref:`number of cycles an L2 channel was active, summed
-        over all L2 channels on the accelerator <total-active-l2-cycles>`  over the
+      rst: The ratio of the :ref:`number of cycles an L2 channel was active, summed
+        over all L2 channels on the accelerator <total-active-l2-cycles>` over the
         :ref:`total L2 cycles <total-l2-cycles>`.
       unit: Percent
     Peak Bandwidth:
@@ -15946,30 +16050,30 @@ panels:
         cache line size. This value does not consider partial requests, so e.g., if
         only a single value is requested in a cache line, the data movement will still
         be counted as a full cache line.
-      rst: The number of bytes looked up in the L2 cache, as a percent of the peak  theoretical
-        bandwidth achievable on the specific accelerator. The number  of bytes is
-        calculated as the number of cache lines requested multiplied  by the cache
-        line size. This value does not consider partial requests, so  e.g., if only
-        a single value is requested in a cache line, the data  movement will still
+      rst: The number of bytes looked up in the L2 cache, as a percent of the peak theoretical
+        bandwidth achievable on the specific accelerator. The number of bytes is
+        calculated as the number of cache lines requested multiplied by the cache
+        line size. This value does not consider partial requests, so e.g., if only
+        a single value is requested in a cache line, the data movement will still
         be counted as a full cache line.
       unit: Percent
     Hit Rate:
       plain: The ratio of the number of L2 cache line requests that hit in the L2
         cache over the total number of incoming cache line requests to the L2 cache.
-      rst: The ratio of the number of L2 cache line requests that hit in the L2  cache
+      rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
         over the total number of incoming cache line requests to the L2 cache.
       unit: Percent
     L2-Fabric Read BW:
       plain: The number of bytes read by the L2 over the Infinity Fabric interface
         per unit time.
-      rst: The number of bytes read by the L2 over the  :ref:`Infinity Fabric interface
+      rst: The number of bytes read by the L2 over the :ref:`Infinity Fabric interface
         <l2-fabric>` per unit time.
       unit: GB/s
     L2-Fabric Write and Atomic BW:
       plain: The number of bytes sent by the L2 over the Infinity Fabric interface
         by write and atomic operations per unit time.
-      rst: The number of bytes sent by the L2 over the  :ref:`Infinity Fabric interface
-        <l2-fabric>` by write and atomic  operations per unit time.
+      rst: The number of bytes sent by the L2 over the :ref:`Infinity Fabric interface
+        <l2-fabric>` by write and atomic operations per unit time.
       unit: GB/s
     HBM Bandwidth:
       plain: Maximum theoretical bandwidth of the accelerator's local high-bandwidth
@@ -15990,9 +16094,9 @@ panels:
         both counted as a single request), so this metric only approximates the percent
         of the L2-Fabric Read bandwidth directed to the local HBM.
       rst: The percent of read requests generated by the L2 cache that are routed
-        to  the accelerator's local high-bandwidth memory (HBM). This breakdown does  not
-        consider the *size* of the request (meaning that 32B and 64B requests  are
-        both counted as a single request), so this metric only *approximates*  the
+        to the accelerator's local high-bandwidth memory (HBM). This breakdown does not
+        consider the *size* of the request (meaning that 32B and 64B requests are
+        both counted as a single request), so this metric only *approximates* the
         percent of the L2-Fabric Read bandwidth directed to the local HBM.
       unit: Percent
     Remote Read Traffic:
@@ -16003,11 +16107,11 @@ panels:
         are both counted as a single request), so this metric only approximates the
         percent of the L2-Fabric Read bandwidth directed to a remote location.
       rst: The percent of read requests generated by the L2 cache that are routed
-        to  any memory location other than the accelerator's local high-bandwidth  memory
-        (HBM) -- for example, the CPU's DRAM or a remote accelerator's  HBM. This
-        breakdown does not consider the *size* of the request (meaning  that 32B and
-        64B requests are both counted as a single request), so this  metric only *approximates*
-        the percent of the L2-Fabric Read bandwidth  directed to a remote location.
+        to any memory location other than the accelerator's local high-bandwidth memory
+        (HBM) -- for example, the CPU's DRAM or a remote accelerator's HBM. This
+        breakdown does not consider the *size* of the request (meaning that 32B and
+        64B requests are both counted as a single request), so this metric only *approximates*
+        the percent of the L2-Fabric Read bandwidth directed to a remote location.
       unit: Percent
     Uncached Read Traffic:
       plain: The percent of read requests generated by the L2 cache that are reading
@@ -16018,14 +16122,14 @@ panels:
         the size of the request (i.e., 32B and 64B requests are both counted as a
         single request), so this metric only approximates the percent of the L2-Fabric
         read bandwidth directed to an uncached memory location.
-      rst: The percent of read requests generated by the L2 cache that are reading  from
-        an :ref:`uncached memory allocation <memory-type>`. Note, as  described in
-        the :ref:`request flow <l2-request-flow>` section, a single  64B read request
-        is typically counted as two uncached read requests. So,  it is possible for
-        the Uncached Read Traffic to reach up to 200% of the  total number of read
-        requests. This breakdown does not consider the  *size* of the request (i.e.,
-        32B and 64B requests are both counted as a  single request), so this metric
-        only *approximates* the percent of the  L2-Fabric read bandwidth directed
+      rst: The percent of read requests generated by the L2 cache that are reading from
+        an :ref:`uncached memory allocation <memory-type>`. Note, as described in
+        the :ref:`request flow <l2-request-flow>` section, a single 64B read request
+        is typically counted as two uncached read requests. So, it is possible for
+        the Uncached Read Traffic to reach up to 200% of the total number of read
+        requests. This breakdown does not consider the *size* of the request (i.e.,
+        32B and 64B requests are both counted as a single request), so this metric
+        only *approximates* the percent of the L2-Fabric read bandwidth directed
         to an uncached memory location.
       unit: Percent
     Write and Atomic BW:
@@ -16034,12 +16138,12 @@ panels:
         such as the MI2XX, requests are only considered atomic by Infinity Fabric
         if they are targeted at non-write-cacheable memory, for example, fine-grained
         memory allocations or uncached memory allocations on the MI2XX.
-      rst: The total number of bytes written by the L2 over Infinity Fabric by write  and
+      rst: The total number of bytes written by the L2 over Infinity Fabric by write and
         atomic operations divided by total duration. Note
-        that on current  CDNA accelerators, such as the :ref:`MI2XX <mixxx-note>`,
-        requests are  only considered *atomic* by Infinity Fabric if they are targeted
-        at  non-write-cacheable memory, for example,  :ref:`fine-grained memory <memory-type>`
-        allocations or  :ref:`uncached memory <memory-type>` allocations on the  MI2XX.
+        that on current CDNA accelerators, such as the :ref:`MI2XX <mixxx-note>`,
+        requests are only considered *atomic* by Infinity Fabric if they are targeted
+        at non-write-cacheable memory, for example, :ref:`fine-grained memory <memory-type>`
+        allocations or :ref:`uncached memory <memory-type>` allocations on the MI2XX.
       unit: Gbps
     HBM Write and Atomic Traffic:
       plain: The percent of write and atomic requests generated by the L2 cache that
@@ -16050,14 +16154,14 @@ panels:
         HBM. Note that on current CDNA accelerators, such as the MI2XX, requests are
         only considered atomic by Infinity Fabric if they are targeted at fine-grained
         memory allocations or uncached memory allocations.
-      rst: The percent of write and atomic requests generated by the L2 cache that  are
-        routed to the accelerator's local high-bandwidth memory (HBM). This  breakdown
-        does not consider the *size* of the request (meaning that 32B  and 64B requests
-        are both counted as a single request), so this metric  only *approximates*
-        the percent of the L2-Fabric Write and Atomic  bandwidth directed to the local
-        HBM. Note that on current CDNA  accelerators, such as the :ref:`MI2XX <mixxx-note>`,
-        requests are only  considered *atomic* by Infinity Fabric if they are targeted
-        at  :ref:`fine-grained memory <memory-type>` allocations or  :ref:`uncached
+      rst: The percent of write and atomic requests generated by the L2 cache that are
+        routed to the accelerator's local high-bandwidth memory (HBM). This breakdown
+        does not consider the *size* of the request (meaning that 32B and 64B requests
+        are both counted as a single request), so this metric only *approximates*
+        the percent of the L2-Fabric Write and Atomic bandwidth directed to the local
+        HBM. Note that on current CDNA accelerators, such as the :ref:`MI2XX <mixxx-note>`,
+        requests are only considered *atomic* by Infinity Fabric if they are targeted
+        at :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached
         memory <memory-type>` allocations.
       unit: Percent
     Remote Write and Atomic Traffic:
@@ -16071,14 +16175,14 @@ panels:
         atomic by Infinity Fabric if they are targeted at fine-grained memory allocations
         or uncached memory allocations.
       rst: The percent of read requests generated by the L2 cache that are routed
-        to  any memory location other than the accelerator's local high-bandwidth  memory
-        (HBM) -- for example, the CPU's DRAM or a remote accelerator's  HBM. This
-        breakdown does not consider the *size* of the request (meaning  that 32B and
-        64B requests are both counted as a single request), so this  metric only *approximates*
-        the percent of the L2-Fabric Read bandwidth  directed to a remote location.
-        Note that on current CDNA  accelerators, such as the :ref:`MI2XX <mixxx-note>`,
-        requests are only  considered *atomic* by Infinity Fabric if they are targeted
-        at  :ref:`fine-grained memory <memory-type>` allocations or  :ref:`uncached
+        to any memory location other than the accelerator's local high-bandwidth memory
+        (HBM) -- for example, the CPU's DRAM or a remote accelerator's HBM. This
+        breakdown does not consider the *size* of the request (meaning that 32B and
+        64B requests are both counted as a single request), so this metric only *approximates*
+        the percent of the L2-Fabric Write and Atomic bandwidth directed to a remote location.
+        Note that on current CDNA accelerators, such as the :ref:`MI2XX <mixxx-note>`,
+        requests are only considered *atomic* by Infinity Fabric if they are targeted
+        at :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached
         memory <memory-type>` allocations.
       unit: Percent
     Atomic Traffic:
@@ -16090,14 +16194,14 @@ panels:
         such as the MI2XX, requests are only considered atomic by Infinity Fabric
         if they are targeted at fine-grained memory allocations or uncached memory
         allocations.
-      rst: The percent of write requests generated by the L2 cache that are atomic  requests
-        to *any* memory location. This breakdown does not consider the  *size* of
-        the request (meaning that 32B and 64B requests are both counted  as a single
-        request), so this metric only *approximates* the percent of  the L2-Fabric
-        Read bandwidth directed to a remote location. Note that on  current CDNA accelerators,
-        such as the :ref:`MI2XX <mixxx-note>`,  requests are only considered *atomic*
-        by Infinity Fabric if they are  targeted at :ref:`fine-grained memory <memory-type>`
-        allocations or  :ref:`uncached memory <memory-type>` allocations.
+      rst: The percent of write requests generated by the L2 cache that are atomic requests
+        to *any* memory location. This breakdown does not consider the *size* of
+        the request (meaning that 32B and 64B requests are both counted as a single
+        request), so this metric only *approximates* the percent of the L2-Fabric
+        Write and Atomic bandwidth used by atomic requests. Note that on current CDNA accelerators,
+        such as the :ref:`MI2XX <mixxx-note>`, requests are only considered *atomic*
+        by Infinity Fabric if they are targeted at :ref:`fine-grained memory <memory-type>`
+        allocations or :ref:`uncached memory <memory-type>` allocations.
       unit: Percent
     Uncached Write and Atomic Traffic:
       plain: The percent of write and atomic requests generated by the L2 cache that
@@ -16105,16 +16209,16 @@ panels:
         the size of the request (meaning that 32B and 64B requests are both counted
         as a single request), so this metric only approximates the percent of the
         L2-Fabric read bandwidth directed to uncached memory allocations.
-      rst: The percent of write and atomic requests generated by the L2 cache that  are
-        targeting :ref:`uncached memory allocations <memory-type>`. This  breakdown
-        does not consider the *size* of the request (meaning that 32B  and 64B requests
-        are both counted as a single request), so this metric  only *approximates*
-        the percent of the L2-Fabric read bandwidth directed  to uncached memory allocations.
+      rst: The percent of write and atomic requests generated by the L2 cache that are
+        targeting :ref:`uncached memory allocations <memory-type>`. This breakdown
+        does not consider the *size* of the request (meaning that 32B and 64B requests
+        are both counted as a single request), so this metric only *approximates*
+        the percent of the L2-Fabric Write and Atomic bandwidth directed to uncached memory allocations.
       unit: Percent
     Read Latency:
       plain: The time-averaged number of cycles read requests spent in Infinity Fabric
         before data was returned to the L2.
-      rst: The time-averaged number of cycles read requests spent in Infinity Fabric  before
+      rst: The time-averaged number of cycles read requests spent in Infinity Fabric before
         data was returned to the L2.
       unit: Cycles
     Write and Atomic Latency:
@@ -16138,10 +16242,10 @@ panels:
         for example, if only a single value is requested in a cache line, the data
         movement will still be counted as a full cache line.
       rst: The number of bytes looked up in the L2 cache, divided by total duration.
-        The number of bytes is  calculated as the number of cache lines requested
+        The number of bytes is calculated as the number of cache lines requested
         multiplied by the cache line size. This value does
         not consider partial requests, so for example, if only a single value is
-        requested in a cache line, the data movement  will still be counted as a full
+        requested in a cache line, the data movement will still be counted as a full
         cache line.
       unit: Gbps
     Read Bandwidth:
@@ -16165,12 +16269,12 @@ panels:
     Req:
       plain: The total number of incoming requests to the L2 from all clients for
         all request types, per normalization unit.
-      rst: The total number of incoming requests to the L2 from all clients for all  request
+      rst: The total number of incoming requests to the L2 from all clients for all request
         types, per :ref:`normalization unit <normalization-units>`.
       unit: Requests per normalization unit
     Read Req:
       plain: The total number of read requests to the L2 from all clients.
-      rst: 'The total number of read requests to the L2 from all clients.  '
+      rst: The total number of read requests to the L2 from all clients.
       unit: Requests per normalization unit
     Write Req:
       plain: The total number of write requests to the L2 from all clients.
@@ -16188,11 +16292,11 @@ panels:
         however on an MI2XX this corresponds to non-temporal load or stores. The L2
         cache attempts to evict streaming requests before normal requests when the
         L2 is at capacity.
-      rst: The total number of incoming requests to the L2 that are marked as  *streaming*.
-        The exact meaning of this may differ depending on the  targeted accelerator,
-        however on an :ref:`MI2XX <mixxx-note>` this  corresponds to  `non-temporal
-        load or stores <https://clang.llvm.org/docs/LanguageExtensions.html#non-temporal-load-store-builtins>`_.  The
-        L2 cache attempts to evict *streaming* requests before normal  requests when
+      rst: The total number of incoming requests to the L2 that are marked as *streaming*.
+        The exact meaning of this may differ depending on the targeted accelerator,
+        however on an :ref:`MI2XX <mixxx-note>` this corresponds to `non-temporal
+        load or stores <https://clang.llvm.org/docs/LanguageExtensions.html#non-temporal-load-store-builtins>`_. The
+        L2 cache attempts to evict *streaming* requests before normal requests when
         the L2 is at capacity.
       unit: Requests per normalization unit
     Probe Req:
@@ -16200,30 +16304,30 @@ panels:
         the accelerator. On an MI2XX, probe requests may be generated by, for example,
         writes to fine-grained device memory or by writes to coarse-grained device
         memory.
-      rst: The number of coherence probe requests made to the L2 cache from outside  the
-        accelerator. On an :ref:`MI2XX <mixxx-note>`, probe requests may be  generated
-        by, for example, writes to  :ref:`fine-grained device <memory-type>` memory
-        or by writes to  :ref:`coarse-grained <memory-type>` device memory.
+      rst: The number of coherence probe requests made to the L2 cache from outside the
+        accelerator. On an :ref:`MI2XX <mixxx-note>`, probe requests may be generated
+        by, for example, writes to :ref:`fine-grained device <memory-type>` memory
+        or by writes to :ref:`coarse-grained <memory-type>` device memory.
       unit: Requests per normalization unit
     Cache Hit:
       plain: The ratio of the number of L2 cache line requests that hit in the L2
         cache over the total number of incoming cache line requests to the L2 cache.
-      rst: The ratio of the number of L2 cache line requests that hit in the L2  cache
-        over the total number of incoming cache line requests to the L2  cache.
+      rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
+        over the total number of incoming cache line requests to the L2 cache.
       unit: Percent
     Hits:
       plain: The total number of requests to the L2 from all clients that hit in the
         cache. As noted in the Speed-of-Light section, this includes hit-on-miss requests.
-      rst: The total number of requests to the L2 from all clients that hit in the  cache.
-        As noted in the :ref:`Speed-of-Light <l2-sol>` section, this  includes hit-on-miss
+      rst: The total number of requests to the L2 from all clients that hit in the cache.
+        As noted in the :ref:`Speed-of-Light <l2-sol>` section, this includes hit-on-miss
         requests.
       unit: Requests per normalization unit
     Misses:
       plain: The total number of requests to the L2 from all clients that miss in
         the cache. As noted in the Speed-of-Light section, these do not include hit-on-miss
         requests.
-      rst: The total number of requests to the L2 from all clients that miss in the  cache.
-        As noted in the :ref:`Speed-of-Light <l2-sol>` section, these do  not include
+      rst: The total number of requests to the L2 from all clients that miss in the cache.
+        As noted in the :ref:`Speed-of-Light <l2-sol>` section, these do not include
         hit-on-miss requests.
       unit: Requests per normalization unit
     Writeback:
@@ -16231,107 +16335,107 @@ panels:
         Write-backs may occur due to user code (such as HIP kernel calls to _threadfence_system
         or atomic built-ins) by the command processor's memory acquire/release fences,
         or for other internal hardware reasons.
-      rst: The total number of L2 cache lines written back to memory for any reason.  Write-backs
-        may occur due to user code (such as HIP kernel calls to  ``__threadfence_system``
-        or atomic built-ins) by the  :doc:`command processor <command-processor>`'s
-        memory acquire/release  fences, or for other internal hardware reasons.
+      rst: The total number of L2 cache lines written back to memory for any reason. Write-backs
+        may occur due to user code (such as HIP kernel calls to ``__threadfence_system``
+        or atomic built-ins), by the :doc:`command processor <command-processor>`'s
+        memory acquire/release fences, or for other internal hardware reasons.
       unit: Cache lines per normalization unit
     Writeback (Internal):
       plain: The total number of L2 cache lines written back to memory for internal
         hardware reasons, per normalization unit.
-      rst: The total number of L2 cache lines written back to memory for internal  hardware
+      rst: The total number of L2 cache lines written back to memory for internal hardware
         reasons, per :ref:`normalization unit <normalization-units>`.
       unit: Cache lines per normalization unit
     Writeback (vL1D Req):
       plain: The total number of L2 cache lines written back to memory due to requests
         initiated by the vL1D cache, per normalization unit.
-      rst: The total number of L2 cache lines written back to memory due to requests  initiated
-        by the :doc:`vL1D cache <vector-l1-cache>`, per  :ref:`normalization unit
+      rst: The total number of L2 cache lines written back to memory due to requests initiated
+        by the :doc:`vL1D cache <vector-l1-cache>`, per :ref:`normalization unit
         <normalization-units>`.
       unit: Cache lines per normalization unit
     Evict (Internal):
       plain: The total number of L2 cache lines evicted from the cache due to capacity
         limits, per normalization unit.
-      rst: The total number of L2 cache lines evicted from the cache due to capacity  limits,
+      rst: The total number of L2 cache lines evicted from the cache due to capacity limits,
         per :ref:`normalization unit <normalization-units>`.
       unit: Cache lines per normalization unit
     Evict (vL1D Req):
       plain: The total number of L2 cache lines evicted from the cache due to invalidation
         requests initiated by the vL1D cache, per normalization unit.
-      rst: The total number of L2 cache lines evicted from the cache due to  invalidation
-        requests initiated by the  :doc:`vL1D cache <vector-l1-cache>`, per  :ref:`normalization
+      rst: The total number of L2 cache lines evicted from the cache due to invalidation
+        requests initiated by the :doc:`vL1D cache <vector-l1-cache>`, per :ref:`normalization
         unit <normalization-units>`.
       unit: Cache lines per normalization unit
     NC Req:
       plain: The total number of requests to the L2 to Not-hardware-Coherent (NC)
         memory allocations, per normalization unit.
-      rst: The total number of requests to the L2 to Not-hardware-Coherent (NC)  memory
-        allocations, per :ref:`normalization unit <normalization-units>`.  See the
+      rst: The total number of requests to the L2 to Not-hardware-Coherent (NC) memory
+        allocations, per :ref:`normalization unit <normalization-units>`. See the
         :ref:`memory-type` for more information.
       unit: Requests per normalization unit
     UC Req:
       plain: The total number of requests to the L2 that go to Uncached (UC) memory
         allocations.
-      rst: The total number of requests to the L2 that go to Uncached (UC) memory  allocations.
+      rst: The total number of requests to the L2 that go to Uncached (UC) memory allocations.
         See the :ref:`memory-type` for more information.
       unit: Requests per normalization unit
     CC Req:
       plain: The total number of requests to the L2 that go to Coherently Cacheable
         (CC) memory allocations.
       rst: The total number of requests to the L2 that go to Coherently Cacheable
-        (CC)  memory allocations. See the :ref:`memory-type` for more information.
+        (CC) memory allocations. See the :ref:`memory-type` for more information.
       unit: Requests per normalization unit
     RW Req:
       plain: The total number of requests to the L2 that go to Read-Write coherent
         memory (RW) allocations.
-      rst: The total number of requests to the L2 that go to Read-Write coherent memory  (RW)
+      rst: The total number of requests to the L2 that go to Read-Write coherent memory (RW)
         allocations. See the :ref:`memory-type` for more information.
       unit: Requests per normalization unit
     Write - Credit Starvation:
       plain: The number of cycles the L2-Fabric interface was stalled on write or
         atomic requests to any memory location because too many write/atomic requests
         were currently in flight, as a percent of the total active L2 cycles.
-      rst: The number of cycles the L2-Fabric interface was stalled on write or  atomic
-        requests to any memory location because too many write/atomic  requests were
-        currently in flight, as a percent of the  :ref:`total active L2 cycles <total-active-l2-cycles>`.
+      rst: The number of cycles the L2-Fabric interface was stalled on write or atomic
+        requests to any memory location because too many write/atomic requests were
+        currently in flight, as a percent of the :ref:`total active L2 cycles <total-active-l2-cycles>`.
       unit: Percent
     Read (32B):
       plain: The total number of L2 requests to Infinity Fabric to read 32B of data
         from any memory location, per normalization unit.
-      rst: The total number of L2 requests to Infinity Fabric to read 32B of data  from
-        any memory location, per  :ref:`normalization unit <normalization-units>`.
-        See  :ref:`l2-request-flow` for more detail. Typically unused on CDNA  accelerators.
+      rst: The total number of L2 requests to Infinity Fabric to read 32B of data from
+        any memory location, per :ref:`normalization unit <normalization-units>`.
+        See :ref:`l2-request-flow` for more detail. Typically unused on CDNA accelerators.
       unit: Requests per normalization unit
     Read (64B):
       plain: The total number of L2 requests to Infinity Fabric to read 64B of data
         from any memory location, per normalization unit.
-      rst: The total number of L2 requests to Infinity Fabric to read 64B of data  from
-        any memory location, per  :ref:`normalization unit <normalization-units>`.
-        See  :ref:`l2-request-flow` for more detail.
+      rst: The total number of L2 requests to Infinity Fabric to read 64B of data from
+        any memory location, per :ref:`normalization unit <normalization-units>`.
+        See :ref:`l2-request-flow` for more detail.
       unit: Requests per normalization unit
     Read (Uncached):
       plain: The total number of L2 requests to Infinity Fabric to read uncached data
         from any memory location, per normalization unit. 64B requests for uncached
         data are counted as two 32B uncached data requests.
-      rst: The total number of L2 requests to Infinity Fabric to read  :ref:`uncached
-        data <memory-type>` from any memory location, per  :ref:`normalization unit
-        <normalization-units>`. 64B requests for  uncached data are counted as two
-        32B uncached data requests. See  :ref:`l2-request-flow` for more detail.
+      rst: The total number of L2 requests to Infinity Fabric to read :ref:`uncached
+        data <memory-type>` from any memory location, per :ref:`normalization unit
+        <normalization-units>`. 64B requests for uncached data are counted as two
+        32B uncached data requests. See :ref:`l2-request-flow` for more detail.
       unit: Requests per normalization unit
     HBM Read:
       plain: The total number of L2 requests to Infinity Fabric to read 32B or 64B
         of data from the accelerator's local HBM, per normalization unit.
-      rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of  data
-        from the accelerator's local HBM, per  :ref:`normalization unit <normalization-units>`.
-        See  :ref:`l2-request-flow` for more detail.
+      rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of data
+        from the accelerator's local HBM, per :ref:`normalization unit <normalization-units>`.
+        See :ref:`l2-request-flow` for more detail.
       unit: Requests per normalization unit
     Remote Read:
       plain: The total number of L2 requests to Infinity Fabric to read 32B or 64B
         of data from any source other than the accelerator's local HBM, per normalization
         unit.
-      rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of  data
-        from any source other than the accelerator's local HBM, per  :ref:`normalization
-        unit <normalization-units>`. See  :ref:`l2-request-flow` for more detail.
+      rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of data
+        from any source other than the accelerator's local HBM, per :ref:`normalization
+        unit <normalization-units>`. See :ref:`l2-request-flow` for more detail.
       unit: Requests per normalization unit
     Read Bandwidth - PCIe:
       plain: Total number of bytes due to L2 read requests due to PCIe traffic, divided by total duration.
@@ -16348,39 +16452,39 @@ panels:
     Write and Atomic (32B):
       plain: The total number of L2 requests to Infinity Fabric to write or atomically
         update 32B of data to any memory location, per normalization unit.
-      rst: The total number of L2 requests to Infinity Fabric to write or atomically  update
-        32B of data to any memory location, per  :ref:`normalization unit <normalization-units>`.
-        See  :ref:`l2-request-flow` for more detail.
+      rst: The total number of L2 requests to Infinity Fabric to write or atomically update
+        32B of data to any memory location, per :ref:`normalization unit <normalization-units>`.
+        See :ref:`l2-request-flow` for more detail.
       unit: Requests per normalization unit
     Write and Atomic (Uncached):
       plain: The total number of L2 requests to Infinity Fabric to write or atomically
         update 32B or 64B of uncached data, per normalization unit.
-      rst: The total number of L2 requests to Infinity Fabric to write or atomically  update
-        32B or 64B of :ref:`uncached data <memory-type>`, per  :ref:`normalization
-        unit <normalization-units>`. See  :ref:`l2-request-flow` for more detail.
+      rst: The total number of L2 requests to Infinity Fabric to write or atomically update
+        32B or 64B of :ref:`uncached data <memory-type>`, per :ref:`normalization
+        unit <normalization-units>`. See :ref:`l2-request-flow` for more detail.
       unit: Requests per normalization unit
     Write and Atomic (64B):
       plain: The total number of L2 requests to Infinity Fabric to write or atomically
         update 64B of data in any memory location, per normalization unit.
-      rst: The total number of L2 requests to Infinity Fabric to write or atomically  update
-        64B of data in any memory location, per  :ref:`normalization unit <normalization-units>`.
-        See  :ref:`l2-request-flow` for more detail.
+      rst: The total number of L2 requests to Infinity Fabric to write or atomically update
+        64B of data in any memory location, per :ref:`normalization unit <normalization-units>`.
+        See :ref:`l2-request-flow` for more detail.
       unit: Requests per normalization unit
     HBM Write and Atomic:
       plain: The total number of L2 requests to Infinity Fabric to write or atomically
         update 32B or 64B of data in the accelerator's local HBM, per normalization
         unit.
-      rst: The total number of L2 requests to Infinity Fabric to write or atomically  update
-        32B or 64B of data in the accelerator's local HBM, per  :ref:`normalization
-        unit <normalization-units>`. See  :ref:`l2-request-flow` for more detail.  plain
+      rst: The total number of L2 requests to Infinity Fabric to write or atomically update
+        32B or 64B of data in the accelerator's local HBM, per :ref:`normalization
+        unit <normalization-units>`. See :ref:`l2-request-flow` for more detail.
       unit: Requests per normalization unit
     Remote Write and Atomic:
       plain: The total number of L2 requests to Infinity Fabric to write or atomically
         update 32B or 64B of data in any memory location other than the accelerator's
         local HBM, per normalization unit.
-      rst: The total number of L2 requests to Infinity Fabric to write or atomically  update
-        32B or 64B of data in any memory location other than the  accelerator's local
-        HBM, per  :ref:`normalization unit <normalization-units>`. See  :ref:`l2-request-flow`
+      rst: The total number of L2 requests to Infinity Fabric to write or atomically update
+        32B or 64B of data in any memory location other than the accelerator's local
+        HBM, per :ref:`normalization unit <normalization-units>`. See :ref:`l2-request-flow`
         for more detail.
       unit: Requests per normalization unit
     Write Bandwidth - PCIe:
@@ -16414,23 +16518,25 @@ panels:
         MI2XX, requests are only considered atomic by Infinity Fabric if they are
         targeted at non-write-cacheable memory, such as fine-grained memory allocations
         or uncached memory allocations on the MI2XX.
-      rst: The total number of L2 requests to Infinity Fabric to atomically update  32B
-        or 64B of data in any memory location, per  :ref:`normalization unit <normalization-units>`.
-        See  :ref:`l2-request-flow` for more detail. Note that on current CDNA  accelerators,
-        such as the :ref:`MI2XX <mixxx-note>`, requests are only  considered *atomic*
-        by Infinity Fabric if they are targeted at  non-write-cacheable memory, such
-        as  :ref:`fine-grained memory <memory-type>` allocations or  :ref:`uncached
+      rst: The total number of L2 requests to Infinity Fabric to atomically update 32B
+        or 64B of data in any memory location, per :ref:`normalization unit <normalization-units>`.
+        See :ref:`l2-request-flow` for more detail. Note that on current CDNA accelerators,
+        such as the :ref:`MI2XX <mixxx-note>`, requests are only considered *atomic*
+        by Infinity Fabric if they are targeted at non-write-cacheable memory, such
+        as :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached
         memory <memory-type>` allocations on the MI2XX.
       unit: Requests per normalization unit
     Read Stall:
-      plain: "The ratio of the total number of cycles the L2-Fabric interface was\
-        \ stalled on a read request to any destination (local HBM, remote PCIe\xAE\
-        \ connected accelerator or CPU, or remote Infinity Fabric connected accelerator\
-        \ or CPU) over the total active L2 cycles."
-      rst: "The ratio of the total number of cycles the L2-Fabric interface was stalled\
-        \ on a read request to any destination (local HBM, remote PCIe\xAE connected\
-        \ accelerator or CPU, or remote Infinity Fabric connected accelerator [#inf]_\
-        \ or CPU) over the :ref:`total active L2 cycles <total-active-l2-cycles>`."
+      plain: |-
+        The ratio of the total number of cycles the L2-Fabric interface was
+        stalled on a read request to any destination (local HBM, remote PCIe®
+        connected accelerator or CPU, or remote Infinity Fabric connected accelerator
+        or CPU) over the total active L2 cycles.
+      rst: |-
+        The ratio of the total number of cycles the L2-Fabric interface was stalled
+        on a read request to any destination (local HBM, remote PCIe® connected
+        accelerator or CPU, or remote Infinity Fabric connected accelerator [#inf]_
+        or CPU) over the :ref:`total active L2 cycles <total-active-l2-cycles>`.
       unit: Percent
     Write Stall:
       plain: The ratio of the total number of cycles the L2-Fabric interface was stalled
@@ -17616,8 +17722,8 @@ panels:
       plain: The percent of total number of requests to the L2 from all clients that
         hit in the cache. As noted in the Speed-of-Light section, this includes hit-on-miss
         requests.
-      rst: The total number of requests to the L2 from all clients that hit in the  cache.
-        As noted in the :ref:`Speed-of-Light <l2-sol>` section, this  includes hit-on-miss
+      rst: The percent of total number of requests to the L2 from all clients that hit in the cache.
+        As noted in the :ref:`Speed-of-Light <l2-sol>` section, this includes hit-on-miss
         requests.
       unit: Percent
 - id: 2100
diff --git a/projects/rocprofiler-compute/utils/unified_sets.yaml b/projects/rocprofiler-compute/tools/unified_sets.yaml
similarity index 99%
rename from projects/rocprofiler-compute/utils/unified_sets.yaml
rename to projects/rocprofiler-compute/tools/unified_sets.yaml
index 43b1da6dce..f94aa22435 100644
--- a/projects/rocprofiler-compute/utils/unified_sets.yaml
+++ b/projects/rocprofiler-compute/tools/unified_sets.yaml
@@ -173,4 +173,4 @@ sets:
       - 7.1.6
       - 7.1.7
       - 7.1.8
-      - 7.1.9
\ No newline at end of file
+      - 7.1.9
diff --git a/projects/rocprofiler-compute/utils/update-coverage.sh b/projects/rocprofiler-compute/tools/update-coverage.sh
similarity index 99%
rename from projects/rocprofiler-compute/utils/update-coverage.sh
rename to projects/rocprofiler-compute/tools/update-coverage.sh
index 6ab6f93cd1..8506e4a0fe 100755
--- a/projects/rocprofiler-compute/utils/update-coverage.sh
+++ b/projects/rocprofiler-compute/tools/update-coverage.sh
@@ -78,4 +78,4 @@ echo "1. git add $COVERAGE_FILE"
 echo "2. git commit -m 'Update coverage: $COVERAGE_INFO'"
 echo "3. Proceed with push to develop"
 echo "4. CDash upload will happen automatically on push"
-echo ""
\ No newline at end of file
+echo ""
diff --git a/projects/rocprofiler-compute/utils/update_license.py b/projects/rocprofiler-compute/tools/update_license.py
similarity index 100%
rename from projects/rocprofiler-compute/utils/update_license.py
rename to projects/rocprofiler-compute/tools/update_license.py
diff --git a/projects/rocprofiler-compute/utils/ver_check.py b/projects/rocprofiler-compute/tools/ver_check.py
similarity index 100%
rename from projects/rocprofiler-compute/utils/ver_check.py
rename to projects/rocprofiler-compute/tools/ver_check.py
diff --git a/projects/rocprofiler-compute/utils/autogen_hash.yaml b/projects/rocprofiler-compute/utils/autogen_hash.yaml
deleted file mode 100644
index ca0c10f7ea..0000000000
--- a/projects/rocprofiler-compute/utils/autogen_hash.yaml
+++ /dev/null
@@ -1,116 +0,0 @@
-# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py
-src/rocprof_compute_soc/analysis_configs/gfx908/0000_top_stats.yaml: 401770cff804c6e51b78dff61390d8b5977598a2b09c6601ac593653e912535b
-src/rocprof_compute_soc/analysis_configs/gfx90a/0000_top_stats.yaml: 401770cff804c6e51b78dff61390d8b5977598a2b09c6601ac593653e912535b
-src/rocprof_compute_soc/analysis_configs/gfx940/0000_top_stats.yaml: 401770cff804c6e51b78dff61390d8b5977598a2b09c6601ac593653e912535b
-src/rocprof_compute_soc/analysis_configs/gfx941/0000_top_stats.yaml: 401770cff804c6e51b78dff61390d8b5977598a2b09c6601ac593653e912535b
-src/rocprof_compute_soc/analysis_configs/gfx942/0000_top_stats.yaml: 401770cff804c6e51b78dff61390d8b5977598a2b09c6601ac593653e912535b
-src/rocprof_compute_soc/analysis_configs/gfx950/0000_top_stats.yaml: 401770cff804c6e51b78dff61390d8b5977598a2b09c6601ac593653e912535b
-src/rocprof_compute_soc/analysis_configs/gfx908/0100_system_info.yaml: b883dc360890c8d4fae49542b3362fa341598b86198cc7f2b9b9a3cf987f9576
-src/rocprof_compute_soc/analysis_configs/gfx90a/0100_system_info.yaml: b883dc360890c8d4fae49542b3362fa341598b86198cc7f2b9b9a3cf987f9576
-src/rocprof_compute_soc/analysis_configs/gfx940/0100_system_info.yaml: b883dc360890c8d4fae49542b3362fa341598b86198cc7f2b9b9a3cf987f9576
-src/rocprof_compute_soc/analysis_configs/gfx941/0100_system_info.yaml: b883dc360890c8d4fae49542b3362fa341598b86198cc7f2b9b9a3cf987f9576
-src/rocprof_compute_soc/analysis_configs/gfx942/0100_system_info.yaml: b883dc360890c8d4fae49542b3362fa341598b86198cc7f2b9b9a3cf987f9576
-src/rocprof_compute_soc/analysis_configs/gfx950/0100_system_info.yaml: b883dc360890c8d4fae49542b3362fa341598b86198cc7f2b9b9a3cf987f9576
-src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml: 2103e9d6123f473f1cb18b71c046f197b5d1d873563c4aad4933d7361255f0c1
-src/rocprof_compute_soc/analysis_configs/gfx90a/0200_system_speed_of_light.yaml: e9f552ee72849dc9c4ab14fee77ecc2681f4bcf610a8649c55365ab7eea7aafc
-src/rocprof_compute_soc/analysis_configs/gfx940/0200_system_speed_of_light.yaml: 70716745e727d3a7e6fa706d34c346f796c241c485516da52e0c694386b3cf57
-src/rocprof_compute_soc/analysis_configs/gfx941/0200_system_speed_of_light.yaml: a1d4f1f712755f6369d3a350eadcd5b0fcd90b5c0cab8be691c24bb860d90ba5
-src/rocprof_compute_soc/analysis_configs/gfx942/0200_system_speed_of_light.yaml: 70716745e727d3a7e6fa706d34c346f796c241c485516da52e0c694386b3cf57
-src/rocprof_compute_soc/analysis_configs/gfx950/0200_system_speed_of_light.yaml: a2cb003c74c0a75b9fe690da4e21b46e78fdb2f3233fc4753bca9276e93d60b0
-src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml: 190c31ddc0bc713dba8b508faf13f0630b268ed15a0d9206f30998a0a071136f
-src/rocprof_compute_soc/analysis_configs/gfx90a/0300_memory_chart.yaml: 8eeb4bb544eebd59aa10b51c1149ee4d015c76073c9a35e673210d9740fbf808
-src/rocprof_compute_soc/analysis_configs/gfx940/0300_memory_chart.yaml: 249e9ae0445de0927827ec14d20f946a07d50d92fd56e1993bbe0c17eb65bd51
-src/rocprof_compute_soc/analysis_configs/gfx941/0300_memory_chart.yaml: 249e9ae0445de0927827ec14d20f946a07d50d92fd56e1993bbe0c17eb65bd51
-src/rocprof_compute_soc/analysis_configs/gfx942/0300_memory_chart.yaml: 249e9ae0445de0927827ec14d20f946a07d50d92fd56e1993bbe0c17eb65bd51
-src/rocprof_compute_soc/analysis_configs/gfx950/0300_memory_chart.yaml: 643b31ffa43bc3613d6f90b0c23d95093d0d0aa5bc8e72d9a0fbc1b739a08b67
-src/rocprof_compute_soc/analysis_configs/gfx908/0400_roofline.yaml: ba5b48696acc5c014e2332570855edfa3c0daebb1c2765e6877fa7db3e0cd587
-src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline.yaml: a768b0cb265efcbef39a0e3174c2cb17abb8ff961236c441c9b17a92e547c580
-src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline.yaml: caa23f7cd9eaee6204c2f48e22d80b520ba5e0efd6e3697c0e2856024e7c0c8c
-src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline.yaml: 7260566a8e13aca53975210aca25ba7fd1d7e1a6cabaeb7ae3d23e140bc62662
-src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline.yaml: 58272f5d1136489255a7bf9c6ade720a0567b17ab58dc58ae796597ae4d73ce5
-src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml: 290ff38832460c5f845b78ad0dea4ae8fab9a6affe53ea637eaceed074d31a57
-src/rocprof_compute_soc/analysis_configs/gfx908/0500_command_processor_cpc_cpf.yaml: da1c2997d42d66da2aa069caa741cf9eade124357c56e4290333de2f3e0412bb
-src/rocprof_compute_soc/analysis_configs/gfx90a/0500_command_processor_cpc_cpf.yaml: da1c2997d42d66da2aa069caa741cf9eade124357c56e4290333de2f3e0412bb
-src/rocprof_compute_soc/analysis_configs/gfx940/0500_command_processor_cpc_cpf.yaml: da1c2997d42d66da2aa069caa741cf9eade124357c56e4290333de2f3e0412bb
-src/rocprof_compute_soc/analysis_configs/gfx941/0500_command_processor_cpc_cpf.yaml: da1c2997d42d66da2aa069caa741cf9eade124357c56e4290333de2f3e0412bb
-src/rocprof_compute_soc/analysis_configs/gfx942/0500_command_processor_cpc_cpf.yaml: da1c2997d42d66da2aa069caa741cf9eade124357c56e4290333de2f3e0412bb
-src/rocprof_compute_soc/analysis_configs/gfx950/0500_command_processor_cpc_cpf.yaml: 6eb8acab3abb4183868470a4bd8ee97bf8a426f5faeca46aab0d9000c1700f76
-src/rocprof_compute_soc/analysis_configs/gfx908/0600_workgroup_manager_spi.yaml: a866f2dc5e30404a914f0cfa10f73fc2463007ae553c4655b8e47bcdbd76f8ac
-src/rocprof_compute_soc/analysis_configs/gfx90a/0600_workgroup_manager_spi.yaml: a866f2dc5e30404a914f0cfa10f73fc2463007ae553c4655b8e47bcdbd76f8ac
-src/rocprof_compute_soc/analysis_configs/gfx940/0600_workgroup_manager_spi.yaml: a866f2dc5e30404a914f0cfa10f73fc2463007ae553c4655b8e47bcdbd76f8ac
-src/rocprof_compute_soc/analysis_configs/gfx941/0600_workgroup_manager_spi.yaml: a866f2dc5e30404a914f0cfa10f73fc2463007ae553c4655b8e47bcdbd76f8ac
-src/rocprof_compute_soc/analysis_configs/gfx942/0600_workgroup_manager_spi.yaml: a866f2dc5e30404a914f0cfa10f73fc2463007ae553c4655b8e47bcdbd76f8ac
-src/rocprof_compute_soc/analysis_configs/gfx950/0600_workgroup_manager_spi.yaml: 7fe5d39165fd1100de7f89639cf6b8b1ffdcba46f86063d2040bee3bc14dc032
-src/rocprof_compute_soc/analysis_configs/gfx908/0700_wavefront.yaml: 1d22ca4540dbc884ede6d9071e7a2e0a1cd831d4eb2da6f29ad8a582907df848
-src/rocprof_compute_soc/analysis_configs/gfx90a/0700_wavefront.yaml: 1d22ca4540dbc884ede6d9071e7a2e0a1cd831d4eb2da6f29ad8a582907df848
-src/rocprof_compute_soc/analysis_configs/gfx940/0700_wavefront.yaml: 1d22ca4540dbc884ede6d9071e7a2e0a1cd831d4eb2da6f29ad8a582907df848
-src/rocprof_compute_soc/analysis_configs/gfx941/0700_wavefront.yaml: 1d22ca4540dbc884ede6d9071e7a2e0a1cd831d4eb2da6f29ad8a582907df848
-src/rocprof_compute_soc/analysis_configs/gfx942/0700_wavefront.yaml: 1d22ca4540dbc884ede6d9071e7a2e0a1cd831d4eb2da6f29ad8a582907df848
-src/rocprof_compute_soc/analysis_configs/gfx950/0700_wavefront.yaml: 9ce451e4e9099bb5e43e6e41e5621b469d849f1e4900a74f156337eed95b644d
-src/rocprof_compute_soc/analysis_configs/gfx908/1000_compute_units_instruction_mix.yaml: 4f70eb28dad079098fcc97813c59b02dc1bda06ceb5f7806a94b3b26184e47af
-src/rocprof_compute_soc/analysis_configs/gfx90a/1000_compute_units_instruction_mix.yaml: 84eee8712ebd101e593598098bd6f9e281b36f116d0f3eba6a415c418dbbb647
-src/rocprof_compute_soc/analysis_configs/gfx940/1000_compute_units_instruction_mix.yaml: 7d9ae6b30157645b0461abaf84aa9c793c87ed630a8a6611a34ae043cbcc4c5d
-src/rocprof_compute_soc/analysis_configs/gfx941/1000_compute_units_instruction_mix.yaml: 7d9ae6b30157645b0461abaf84aa9c793c87ed630a8a6611a34ae043cbcc4c5d
-src/rocprof_compute_soc/analysis_configs/gfx942/1000_compute_units_instruction_mix.yaml: 7d9ae6b30157645b0461abaf84aa9c793c87ed630a8a6611a34ae043cbcc4c5d
-src/rocprof_compute_soc/analysis_configs/gfx950/1000_compute_units_instruction_mix.yaml: a34553f977577980312b27005bfcd9c1e4c79f77c0c3dc4e023a17bf86169373
-src/rocprof_compute_soc/analysis_configs/gfx908/1100_compute_units_compute_pipeline.yaml: 472d6f872fb9f545940899824f87f88d4f7f7544ae11addd10da08ced0110f49
-src/rocprof_compute_soc/analysis_configs/gfx90a/1100_compute_units_compute_pipeline.yaml: 2eeac2474dce7ff3b03650575dd7ce92458db8f70a7958536ada892119d33c69
-src/rocprof_compute_soc/analysis_configs/gfx940/1100_compute_units_compute_pipeline.yaml: 4a25b6abf24f4a622fde1a3cfe65fe7236cf1e626fc2444667883997564cea1e
-src/rocprof_compute_soc/analysis_configs/gfx941/1100_compute_units_compute_pipeline.yaml: 4a25b6abf24f4a622fde1a3cfe65fe7236cf1e626fc2444667883997564cea1e
-src/rocprof_compute_soc/analysis_configs/gfx942/1100_compute_units_compute_pipeline.yaml: 4a25b6abf24f4a622fde1a3cfe65fe7236cf1e626fc2444667883997564cea1e
-src/rocprof_compute_soc/analysis_configs/gfx950/1100_compute_units_compute_pipeline.yaml: 4ef656938f8a9667ae872db522855856469accff9cb42bc0444b469346760dfd
-src/rocprof_compute_soc/analysis_configs/gfx908/1200_local_data_share_lds.yaml: f3f7a74e8b2915fe27eec7948f006f218a6b0a96c91b95cdff9e624b2c484bb2
-src/rocprof_compute_soc/analysis_configs/gfx90a/1200_local_data_share_lds.yaml: f3f7a74e8b2915fe27eec7948f006f218a6b0a96c91b95cdff9e624b2c484bb2
-src/rocprof_compute_soc/analysis_configs/gfx940/1200_local_data_share_lds.yaml: f3f7a74e8b2915fe27eec7948f006f218a6b0a96c91b95cdff9e624b2c484bb2
-src/rocprof_compute_soc/analysis_configs/gfx941/1200_local_data_share_lds.yaml: f3f7a74e8b2915fe27eec7948f006f218a6b0a96c91b95cdff9e624b2c484bb2
-src/rocprof_compute_soc/analysis_configs/gfx942/1200_local_data_share_lds.yaml: f3f7a74e8b2915fe27eec7948f006f218a6b0a96c91b95cdff9e624b2c484bb2
-src/rocprof_compute_soc/analysis_configs/gfx950/1200_local_data_share_lds.yaml: 6333e18126bde83da4c66fd967531d394bd22e69c08358096b27168a9dc11a30
-src/rocprof_compute_soc/analysis_configs/gfx908/1300_instruction_cache.yaml: f60b9c657bece161e34219f3ada4041107dc5ca3d248590ee3b67e7bd400ff54
-src/rocprof_compute_soc/analysis_configs/gfx90a/1300_instruction_cache.yaml: f60b9c657bece161e34219f3ada4041107dc5ca3d248590ee3b67e7bd400ff54
-src/rocprof_compute_soc/analysis_configs/gfx940/1300_instruction_cache.yaml: f60b9c657bece161e34219f3ada4041107dc5ca3d248590ee3b67e7bd400ff54
-src/rocprof_compute_soc/analysis_configs/gfx941/1300_instruction_cache.yaml: f60b9c657bece161e34219f3ada4041107dc5ca3d248590ee3b67e7bd400ff54
-src/rocprof_compute_soc/analysis_configs/gfx942/1300_instruction_cache.yaml: f60b9c657bece161e34219f3ada4041107dc5ca3d248590ee3b67e7bd400ff54
-src/rocprof_compute_soc/analysis_configs/gfx950/1300_instruction_cache.yaml: f60b9c657bece161e34219f3ada4041107dc5ca3d248590ee3b67e7bd400ff54
-src/rocprof_compute_soc/analysis_configs/gfx908/1400_scalar_l1_data_cache.yaml: 29fac4ea38e4a018baffc4a27a720b47078fd890c10da307655d40f693e6f0e7
-src/rocprof_compute_soc/analysis_configs/gfx90a/1400_scalar_l1_data_cache.yaml: 29fac4ea38e4a018baffc4a27a720b47078fd890c10da307655d40f693e6f0e7
-src/rocprof_compute_soc/analysis_configs/gfx940/1400_scalar_l1_data_cache.yaml: 29fac4ea38e4a018baffc4a27a720b47078fd890c10da307655d40f693e6f0e7
-src/rocprof_compute_soc/analysis_configs/gfx941/1400_scalar_l1_data_cache.yaml: 29fac4ea38e4a018baffc4a27a720b47078fd890c10da307655d40f693e6f0e7
-src/rocprof_compute_soc/analysis_configs/gfx942/1400_scalar_l1_data_cache.yaml: 29fac4ea38e4a018baffc4a27a720b47078fd890c10da307655d40f693e6f0e7
-src/rocprof_compute_soc/analysis_configs/gfx950/1400_scalar_l1_data_cache.yaml: 29fac4ea38e4a018baffc4a27a720b47078fd890c10da307655d40f693e6f0e7
-src/rocprof_compute_soc/analysis_configs/gfx908/1500_address_processing_unit_and_data_return_path_ta_td.yaml: 1e7717fcbd3c8cdf87d593a33f350ca240c1db8f8065a778cca926da1f517088
-src/rocprof_compute_soc/analysis_configs/gfx90a/1500_address_processing_unit_and_data_return_path_ta_td.yaml: 2bdb9d7b3bea1057b3baee29ba3b428b211808261063a97bc4b6b319f4a19fb3
-src/rocprof_compute_soc/analysis_configs/gfx940/1500_address_processing_unit_and_data_return_path_ta_td.yaml: 3180c2f3266be0ff44e01d73d247ca43ae2ee18ecaf61765f58849e36c701b19
-src/rocprof_compute_soc/analysis_configs/gfx941/1500_address_processing_unit_and_data_return_path_ta_td.yaml: 3180c2f3266be0ff44e01d73d247ca43ae2ee18ecaf61765f58849e36c701b19
-src/rocprof_compute_soc/analysis_configs/gfx942/1500_address_processing_unit_and_data_return_path_ta_td.yaml: 3180c2f3266be0ff44e01d73d247ca43ae2ee18ecaf61765f58849e36c701b19
-src/rocprof_compute_soc/analysis_configs/gfx950/1500_address_processing_unit_and_data_return_path_ta_td.yaml: 9e56cef5b066fb575a5c530bcf9400f1291dd8636b12c8a2244cdba1defafc9f
-src/rocprof_compute_soc/analysis_configs/gfx908/1600_vector_l1_data_cache.yaml: 360a9cd6df4e345a45f0660bc8df2003d5eb5dba2359d7e59c89933dc9fba94e
-src/rocprof_compute_soc/analysis_configs/gfx90a/1600_vector_l1_data_cache.yaml: 360a9cd6df4e345a45f0660bc8df2003d5eb5dba2359d7e59c89933dc9fba94e
-src/rocprof_compute_soc/analysis_configs/gfx940/1600_vector_l1_data_cache.yaml: 37c061bc9751828621a72aa6576596262b684fca7b764adbb991cd7eef58987d
-src/rocprof_compute_soc/analysis_configs/gfx941/1600_vector_l1_data_cache.yaml: 37c061bc9751828621a72aa6576596262b684fca7b764adbb991cd7eef58987d
-src/rocprof_compute_soc/analysis_configs/gfx942/1600_vector_l1_data_cache.yaml: 37c061bc9751828621a72aa6576596262b684fca7b764adbb991cd7eef58987d
-src/rocprof_compute_soc/analysis_configs/gfx950/1600_vector_l1_data_cache.yaml: ae0388f43813302969f51a80ac58678614b993f5163083a69e1c99811d730064
-src/rocprof_compute_soc/analysis_configs/gfx908/1700_l2_cache.yaml: 1991d3ec4bf1d534d32f35ffea23ebce3fe30d6114b48171acb5a8bc4446828f
-src/rocprof_compute_soc/analysis_configs/gfx90a/1700_l2_cache.yaml: d95bc3ecf5405a3687202c3ce29230838fa872bb3df04ff4e45d8e67822409af
-src/rocprof_compute_soc/analysis_configs/gfx940/1700_l2_cache.yaml: 89501317ffae421ce2ef272894c798274488fef68d12657143415dd8514be383
-src/rocprof_compute_soc/analysis_configs/gfx941/1700_l2_cache.yaml: ad882ce0748402eb6080528c583046014665e9a4436c7a26784fb11961320b02
-src/rocprof_compute_soc/analysis_configs/gfx942/1700_l2_cache.yaml: 8544fefbaf4a6885feecc43557e8efb31c2f79b9c5e7fd2d1e6be59951cd51dc
-src/rocprof_compute_soc/analysis_configs/gfx950/1700_l2_cache.yaml: b8d83bd17c09939240c01d598cd2f8961a6e3f3545663725737b2566d6544735
-src/rocprof_compute_soc/analysis_configs/gfx908/1800_l2_cache_per_channel.yaml: f5db15673a4be8b92f05a380738c5a10f68ca78ca2b1a9c31c19acae13d17f7b
-src/rocprof_compute_soc/analysis_configs/gfx90a/1800_l2_cache_per_channel.yaml: a0c53202fe9f68d5e1fa689ce0643c471ced7d47e007d8ccc68fba294f7f6a05
-src/rocprof_compute_soc/analysis_configs/gfx940/1800_l2_cache_per_channel.yaml: e184e3692eb0d641fb2e37fada0e58a6c4958553931d7c038b884e1e6986093f
-src/rocprof_compute_soc/analysis_configs/gfx941/1800_l2_cache_per_channel.yaml: e184e3692eb0d641fb2e37fada0e58a6c4958553931d7c038b884e1e6986093f
-src/rocprof_compute_soc/analysis_configs/gfx942/1800_l2_cache_per_channel.yaml: e184e3692eb0d641fb2e37fada0e58a6c4958553931d7c038b884e1e6986093f
-src/rocprof_compute_soc/analysis_configs/gfx950/1800_l2_cache_per_channel.yaml: 896d9af08778c5ecddc6d6961ae96b972a739c913ed9143e3f5fb2f7e878cb5e
-src/rocprof_compute_soc/analysis_configs/gfx908/2100_pc_sampling.yaml: 4f3af55040c40bee5f1fd88d83e2324d06e5dc462c0adc3e6d5b19b3f31af5e7
-src/rocprof_compute_soc/analysis_configs/gfx90a/2100_pc_sampling.yaml: 4f3af55040c40bee5f1fd88d83e2324d06e5dc462c0adc3e6d5b19b3f31af5e7
-src/rocprof_compute_soc/analysis_configs/gfx940/2100_pc_sampling.yaml: 4f3af55040c40bee5f1fd88d83e2324d06e5dc462c0adc3e6d5b19b3f31af5e7
-src/rocprof_compute_soc/analysis_configs/gfx941/2100_pc_sampling.yaml: 4f3af55040c40bee5f1fd88d83e2324d06e5dc462c0adc3e6d5b19b3f31af5e7
-src/rocprof_compute_soc/analysis_configs/gfx942/2100_pc_sampling.yaml: 4f3af55040c40bee5f1fd88d83e2324d06e5dc462c0adc3e6d5b19b3f31af5e7
-src/rocprof_compute_soc/analysis_configs/gfx950/2100_pc_sampling.yaml: 4f3af55040c40bee5f1fd88d83e2324d06e5dc462c0adc3e6d5b19b3f31af5e7
-src/rocprof_compute_soc/profile_configs/sets/gfx908_sets.yaml: ee28989e70d0537db8b0f0a4bc5499444b44ff0e73d3e7f2926943be11d0aeda
-src/rocprof_compute_soc/profile_configs/sets/gfx90a_sets.yaml: 9c9533174a3f7bd5c8e09ec998743c7bb2642c4ce3f818b546673be9cafc40a8
-src/rocprof_compute_soc/profile_configs/sets/gfx940_sets.yaml: 44cd2b32b050cafa73d0ead5703b82836edf25a057c21699046b6b8b8918b242
-src/rocprof_compute_soc/profile_configs/sets/gfx941_sets.yaml: 44cd2b32b050cafa73d0ead5703b82836edf25a057c21699046b6b8b8918b242
-src/rocprof_compute_soc/profile_configs/sets/gfx942_sets.yaml: 44cd2b32b050cafa73d0ead5703b82836edf25a057c21699046b6b8b8918b242
-src/rocprof_compute_soc/profile_configs/sets/gfx950_sets.yaml: 238d9dc8a98cfead3fc904885bfe413e5bcb4f1af31e9820cd640388bcd1e1c2
-docs/data/metrics_description.yaml: c2ddad7ef7973b128c1612e56cc6286e49c2f59af829b1795dc64b38c0ecfd61