From 71b9ea6ba06fc40c4098d8d23658e275ae5fe688 Mon Sep 17 00:00:00 2001
From: xuchen-amd <xuchen@amd.com>
Date: Wed, 14 Jan 2026 13:20:27 -0500
Subject: [PATCH] [rocprofiler-compute] improve config management system
 (#2359)

---
 .../.pre-commit-config.yaml                   |     2 +-
 .../profile_configs/sets/gfx908_sets.yaml     |     1 -
 .../profile_configs/sets/gfx90a_sets.yaml     |     1 -
 .../profile_configs/sets/gfx940_sets.yaml     |     1 -
 .../profile_configs/sets/gfx941_sets.yaml     |     1 -
 .../profile_configs/sets/gfx942_sets.yaml     |     1 -
 .../profile_configs/sets/gfx950_sets.yaml     |     1 -
 .../utils}/.config_hashes.json                |     2 +-
 .../utils}/hash_checker.py                    |    27 +-
 .../tests/test_autogen_config.py              |    98 +-
 .../tools/autogen_hash.yaml                   |     2 -
 .../tools/config_management/README.md         |   726 +-
 .../config_management/apply_config_deltas.py  |    69 +-
 .../config_management/config_workflow.yaml    |     2 +-
 .../generate_config_deltas.py                 |   497 +-
 .../tools/config_management/hash_manager.py   |     2 +-
 .../master_config_workflow_script.py          |  1171 +-
 .../metric_description_manager.py             |    22 +-
 .../parse_config_template.py                  |   239 +-
 .../tools/config_management/utils.py          |    52 -
 .../tools/config_management/utils_ruamel.py   |    92 +
 .../verify_against_config_template.py         |   440 +-
 .../rocprofiler-compute/tools/split_config.py |   307 -
 .../tools/unified_config.yaml                 | 17736 ----------------
 .../tools/unified_sets.yaml                   |   176 -
 25 files changed, 1407 insertions(+), 20261 deletions(-)
 rename projects/rocprofiler-compute/{tools/config_management => src/utils}/.config_hashes.json (99%)
 rename projects/rocprofiler-compute/{tools/config_management => src/utils}/hash_checker.py (88%)
 delete mode 100644 projects/rocprofiler-compute/tools/autogen_hash.yaml
 delete mode 100644 projects/rocprofiler-compute/tools/config_management/utils.py
 create mode 100644 projects/rocprofiler-compute/tools/config_management/utils_ruamel.py
 delete mode 100644 projects/rocprofiler-compute/tools/split_config.py
 delete mode 100644 projects/rocprofiler-compute/tools/unified_config.yaml
 delete mode 100644 projects/rocprofiler-compute/tools/unified_sets.yaml

diff --git a/projects/rocprofiler-compute/.pre-commit-config.yaml b/projects/rocprofiler-compute/.pre-commit-config.yaml
index 10c643321f..2b36616c96 100644
--- a/projects/rocprofiler-compute/.pre-commit-config.yaml
+++ b/projects/rocprofiler-compute/.pre-commit-config.yaml
@@ -23,7 +23,7 @@ repos:
     hooks:
       - id: hash-check
         name: Hash consistency check
-        entry: bash -lc 'cd projects/rocprofiler-compute && python3 tools/config_management/hash_checker.py'
+        entry: bash -lc 'cd projects/rocprofiler-compute && python3 src/utils/hash_checker.py'
         language: system
         pass_filenames: false
         stages: [pre-commit]
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx908_sets.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx908_sets.yaml
index 939d12f04d..88ac2bd087 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx908_sets.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx908_sets.yaml
@@ -1,4 +1,3 @@
-# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_sets.yaml. Generated by utils/split_config.py
 sets:
 - title: Compute Throughput Utilization
   set_option: compute_thruput_util
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx90a_sets.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx90a_sets.yaml
index 3a970342f2..c67cbd6718 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx90a_sets.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx90a_sets.yaml
@@ -1,4 +1,3 @@
-# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_sets.yaml. Generated by utils/split_config.py
 sets:
 - title: Compute Throughput Utilization
   set_option: compute_thruput_util
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx940_sets.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx940_sets.yaml
index b549f0fede..ab78f316c1 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx940_sets.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx940_sets.yaml
@@ -1,4 +1,3 @@
-# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_sets.yaml. Generated by utils/split_config.py
 sets:
 - title: Compute Throughput Utilization
   set_option: compute_thruput_util
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx941_sets.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx941_sets.yaml
index b549f0fede..ab78f316c1 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx941_sets.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx941_sets.yaml
@@ -1,4 +1,3 @@
-# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_sets.yaml. Generated by utils/split_config.py
 sets:
 - title: Compute Throughput Utilization
   set_option: compute_thruput_util
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx942_sets.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx942_sets.yaml
index b549f0fede..ab78f316c1 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx942_sets.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx942_sets.yaml
@@ -1,4 +1,3 @@
-# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_sets.yaml. Generated by utils/split_config.py
 sets:
 - title: Compute Throughput Utilization
   set_option: compute_thruput_util
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx950_sets.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx950_sets.yaml
index f93a0af246..177a8f9a7a 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx950_sets.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx950_sets.yaml
@@ -1,4 +1,3 @@
-# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_sets.yaml. Generated by utils/split_config.py
 sets:
 - title: Compute Throughput Utilization
   set_option: compute_thruput_util
diff --git a/projects/rocprofiler-compute/tools/config_management/.config_hashes.json b/projects/rocprofiler-compute/src/utils/.config_hashes.json
similarity index 99%
rename from projects/rocprofiler-compute/tools/config_management/.config_hashes.json
rename to projects/rocprofiler-compute/src/utils/.config_hashes.json
index 2ad6ac76f2..0b8335ffaf 100644
--- a/projects/rocprofiler-compute/tools/config_management/.config_hashes.json
+++ b/projects/rocprofiler-compute/src/utils/.config_hashes.json
@@ -139,4 +139,4 @@
       }
     }
   }
-}
\ No newline at end of file
+}
diff --git a/projects/rocprofiler-compute/tools/config_management/hash_checker.py b/projects/rocprofiler-compute/src/utils/hash_checker.py
similarity index 88%
rename from projects/rocprofiler-compute/tools/config_management/hash_checker.py
rename to projects/rocprofiler-compute/src/utils/hash_checker.py
index 14c5d17254..78e631f888 100644
--- a/projects/rocprofiler-compute/tools/config_management/hash_checker.py
+++ b/projects/rocprofiler-compute/src/utils/hash_checker.py
@@ -43,27 +43,16 @@ from pathlib import Path
 
 import yaml
 
-try:
-    from . import hash_manager  # type: ignore
-except Exception:
-    import importlib.util
+PROJECT_ROOT = Path(__file__).resolve().parents[2]  # rocprofiler-compute/
+if str(PROJECT_ROOT) not in sys.path:
+    sys.path.insert(0, str(PROJECT_ROOT))
 
-    _HERE = Path(__file__).resolve().parent
-    _SPEC = importlib.util.spec_from_file_location(
-        "hash_manager", str(_HERE / "hash_manager.py")
-    )
-    hash_manager = importlib.util.module_from_spec(_SPEC)  # type: ignore[assignment]
-    assert _SPEC and _SPEC.loader is not None
-    _SPEC.loader.exec_module(hash_manager)  # type: ignore[attr-defined]
-# ---------------------------------------------------------------------------
+from tools.config_management import hash_manager  # noqa: E402
 
-# Subproject root: .../projects/rocprofiler-compute
-SUBROOT = Path(__file__).resolve().parents[2]
-
-CONFIGS_ROOT: Path = SUBROOT / "src" / "rocprof_compute_soc" / "analysis_configs"
-HASH_FILE: Path = SUBROOT / "tools" / "config_management" / ".config_hashes.json"
+CONFIGS_ROOT: Path = PROJECT_ROOT / "src" / "rocprof_compute_soc" / "analysis_configs"
+HASH_FILE: Path = PROJECT_ROOT / "src" / "utils" / ".config_hashes.json"
 TEMPLATE_FILE: Path = (
-    SUBROOT / "tools" / "config_management" / "analysis_config_template.yaml"
+    PROJECT_ROOT / "tools" / "config_management" / "gfx9_config_template.yaml"
 )
 
 
@@ -73,7 +62,7 @@ TEMPLATE_FILE: Path = (
 def _latest_arch(template_file: Path) -> str:
     if not template_file.is_file():
         return ""
-    with open(template_file, "r", encoding="utf-8") as f:
+    with open(template_file, encoding="utf-8") as f:
         data = yaml.safe_load(f) or {}
     return str(data.get("latest_arch") or "")
 
diff --git a/projects/rocprofiler-compute/tests/test_autogen_config.py b/projects/rocprofiler-compute/tests/test_autogen_config.py
index b30d83e1f1..81db7498c4 100644
--- a/projects/rocprofiler-compute/tests/test_autogen_config.py
+++ b/projects/rocprofiler-compute/tests/test_autogen_config.py
@@ -24,25 +24,91 @@
 ##############################################################################
 
 import hashlib
+import json
 from pathlib import Path
 
 import pytest
-import yaml
+
+PROJECT_ROOT = Path(__file__).resolve().parents[1]
+HASH_DB = PROJECT_ROOT / "src/utils/.config_hashes.json"
+ANALYSIS_CONFIGS = PROJECT_ROOT / "src/rocprof_compute_soc/analysis_configs"
 
 
-@pytest.mark.skip(
-    reason=(
-        "TODO: Skip this test until we use "
-        "tools/config_management/.config.hashes.json for testing"
+def md5(path: Path) -> str:
+    return hashlib.md5(path.read_bytes()).hexdigest()
+
+
+def test_config_hashes_match_files() -> None:
+    assert HASH_DB.exists(), f"Missing hash DB: {HASH_DB}"
+    assert ANALYSIS_CONFIGS.exists(), (
+        f"Missing analysis configs dir: {ANALYSIS_CONFIGS}"
     )
-)
-def test_modification_time():
-    # Ensure hash map consistency
-    hash_path = Path("tools/autogen_hash.yaml")
-    with open(hash_path) as f:
-        hash_map = yaml.safe_load(f)
-    for file, hash in hash_map.items():
-        file_hash = hashlib.sha256(Path(file).read_bytes()).hexdigest()
-        assert file_hash == hash, (
-            f"Hash mismatch for {file}: expected {hash}, got {file_hash}"
-        )
+
+    with HASH_DB.open() as f:
+        data = json.load(f)
+
+    assert "archs" in data, "Hash DB missing 'archs' key"
+    assert isinstance(data["archs"], dict)
+
+    failures = []
+
+    for arch, arch_data in data["archs"].items():
+        arch_dir = ANALYSIS_CONFIGS / arch
+        if not arch_dir.exists():
+            failures.append(f"Arch directory missing: {arch_dir}")
+            continue
+
+        # -------------------------
+        # Panel YAMLs
+        # -------------------------
+        files = arch_data.get("files", {})
+        if not isinstance(files, dict):
+            failures.append(f"'files' for {arch} is not a dict")
+            continue
+
+        for rel_path, expected_hash in files.items():
+            panel_path = arch_dir / rel_path
+            if not panel_path.exists():
+                failures.append(f"Missing panel file: {panel_path}")
+                continue
+
+            actual_hash = md5(panel_path)
+            if actual_hash != expected_hash:
+                failures.append(
+                    f"[{arch}] Panel hash mismatch: {panel_path}\n"
+                    f"  expected: {expected_hash}\n"
+                    f"  actual:   {actual_hash}"
+                )
+
+        # -------------------------
+        # Delta YAML (if any)
+        # -------------------------
+        delta_hash = arch_data.get("delta_hash")
+
+        if delta_hash is not None:
+            delta_dir = arch_dir / "config_delta"
+            if not delta_dir.exists():
+                failures.append(f"[{arch}] Missing config_delta directory")
+                continue
+
+            # Exactly one *_diff.yaml should exist
+            delta_files = list(delta_dir.glob("*_diff.yaml"))
+            if len(delta_files) != 1:
+                failures.append(
+                    f"[{arch}] Expected exactly one delta file, found "
+                    f"{len(delta_files)} in {delta_dir}"
+                )
+                continue
+
+            delta_path = delta_files[0]
+            actual_delta_hash = md5(delta_path)
+
+            if actual_delta_hash != delta_hash:
+                failures.append(
+                    f"[{arch}] Delta hash mismatch: {delta_path}\n"
+                    f"  expected: {delta_hash}\n"
+                    f"  actual:   {actual_delta_hash}"
+                )
+
+    if failures:
+        pytest.fail("Hash consistency failures:\n\n" + "\n".join(failures))
diff --git a/projects/rocprofiler-compute/tools/autogen_hash.yaml b/projects/rocprofiler-compute/tools/autogen_hash.yaml
deleted file mode 100644
index e25b0bb4f9..0000000000
--- a/projects/rocprofiler-compute/tools/autogen_hash.yaml
+++ /dev/null
@@ -1,2 +0,0 @@
-# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from tools/unified_config.yaml. Generated by tools/split_config.py
-{}
diff --git a/projects/rocprofiler-compute/tools/config_management/README.md b/projects/rocprofiler-compute/tools/config_management/README.md
index d19ecf5e95..f5cbca7247 100644
--- a/projects/rocprofiler-compute/tools/config_management/README.md
+++ b/projects/rocprofiler-compute/tools/config_management/README.md
@@ -1,500 +1,276 @@
-# Architecture Configuration Workflow
+# ROCProfiler-Compute Configuration Management
 
-This document explains the master workflow system for managing architecture-specific metric configurations.
+This directory contains the authoritative configuration-management system for ROCProfiler-Compute analysis configurations.
 
-## Overview
+It is designed to guarantee:
 
-The workflow system manages changes to architecture configurations located in `src/rocprof_compute_soc/analysis_configs/gfx<arch>/`. It handles:
+- **Structural correctness** across GPU architectures
+- **Deterministic deltas** relative to a single latest architecture
+- **Byte-level immutability** enforced via hashes
+- **Safe promotion** of a new latest architecture with rollback
+- **CI enforcement** of all invariants
 
-- **Metric changes** (additions, deletions, modifications)
-- **Metric description changes** (plain text + RST documentation)
-- **New architecture additions**
-- **Template updates**
-- **Config delta generation** for version control
-
-## Files Overview
-
-### Core Scripts
-
-1. **`master_config_workflow_script.py`** - Main orchestrator script
-2. **`hash_manager.py`** - Tracks file changes via MD5 hashes
-3. **`metric_description_manager.py`** - Syncs metric descriptions across files
-4. **`config_workflow.yaml`** - Configuration file
-5. **`parse_config_template.py`** - Parses base config template from latest arch
-6. **`generate_config_deltas.py`** - Generates config deltas between two archs
-7. **`apply_config_deltas.py`** - Applies config deltas to genearte new arch configs
-8. **`verify_against_config_template.py`** - Validates configs against template
-
-## Quick Start
-
-### Initial Setup (not needed following first commit)
-
-1. Create the hash database:
-```bash
-python hash_manager.py --compute-all src/rocprof_compute_soc/analysis_configs
-```
-
-2. Ensure `analysis_config_template.yaml` has metadata:
-```yaml
-latest_arch: gfx950
-panels:
-  - file: top_stats.yaml
-    panel_id: 0
-    ...
-```
-
-### Making Changes
-
-Simply run the master workflow after making any changes:
+All workflows are orchestrated by a single sequential driver script:
 
 ```bash
-python master_config_workflow_script.py
+tools/config_management/master_config_workflow_script.py
+```
+
+## Repository Layout
+
+```text
+rocprofiler-compute/
+├── src/rocprof_compute_soc/
+│   └── analysis_configs/
+│       ├── gfx908/
+│       │   ├── 0000_top_stats.yaml
+│       │   └── config_delta/
+│       │       └── <latest_arch>_diff.yaml
+│       ├── gfx90a/
+│       ├── gfx940/
+│       ├── gfx950/                      # latest_arch
+│       └── gfx9_config_template.yaml    # single source of truth
+│
+├── src/utils/
+│   ├── hash_checker.py
+│   └── .config_hashes.json
+│
+└── tools/config_management/
+    ├── master_config_workflow_script.py
+    ├── parse_config_template.py
+    ├── verify_against_config_template.py
+    ├── generate_config_deltas.py
+    ├── apply_config_deltas.py
+    ├── hash_manager.py
+    ├── TESTING.md
+    └── README.md
+```
+
+## Core Concepts
+### Latest Architecture
+
+- Exactly one architecture is considered *latest*
+- Defined in:
+```bash
+src/rocprof_compute_soc/analysis_configs/gfx9_config_template.yaml
+```
+
+### Panel YAMLs
+
+- Live under:
+```bash
+analysis_configs/<arch>/*.yaml
+```
+- Must conform strictly to the template schema
+- Are edited in-place using ruamel.yaml round-trip mode
+
+### Delta YAMLs
+
+- Represent differences from latest → older architecture
+- Live under:
+```bash
+analysis_configs/<older_arch>/config_delta/
+```
+- Exactly one delta file per arch
+- Always named:
+```bash
+<latest_arch>_diff.yaml
+```
+
+### Hash Database
+
+- Stored at:
+```bash
+src/utils/.config_hashes.json
+```
+- Records:
+  - md5 hashes of panel YAMLs per arch
+  - md5 hash of the delta YAML (or null for latest)
+- Machine-generated only
+- Enforced in CI and pytest
+
+## Architecture Diagram (End-to-End Flow)
+```text
+                   ┌──────────────────────────┐
+                   │  analysis_configs/       │
+                   │  gfx9_config_template    │
+                   └───────────┬──────────────┘
+                               │
+                               ▼
+                 ┌───────────────────────────────┐
+                 │ verify_against_config_template│
+                 │ (structural validation)       │
+                 └───────────┬───────────────────┘
+                             │
+         ┌───────────────────┴───────────────────┐
+         │                                       │
+         ▼                                       ▼
+┌────────────────────┐               ┌──────────────────────┐
+│ edit-existing mode │               │ promotion mode       │
+│ (local dev only)   │               │ (authoritative path) │
+└─────────┬──────────┘               └──────────┬───────────┘
+          │                                     │
+          ▼                                     ▼
+┌────────────────────┐               ┌─────────────────────────────┐
+│ generate / apply   │               │ parse_config_template.py    │
+│ deltas manually    │               │ (update latest_arch)        │
+└────────────────────┘               └──────────┬──────────────────┘
+                                                 │
+                                                 ▼
+                               ┌──────────────────────────────────┐
+                               │ generate_config_deltas.py        │
+                               │ latest → all older arches        │
+                               │ (<latest>_diff.yaml only)        │
+                               └──────────┬───────────────────────┘
+                                          │
+                                          ▼
+                               ┌──────────────────────────────────┐
+                               │ verify_against_config_template   │
+                               │ (post-promotion validation)      │
+                               └──────────┬───────────────────────┘
+                                          │
+                                          ▼
+                               ┌──────────────────────────────────┐
+                               │ hash_manager.py --compute-all    │
+                               │ (new steady state)               │
+                               └──────────┬───────────────────────┘
+                                          │
+                                          ▼
+                               ┌──────────────────────────────────┐
+                               │ hash_checker.py                  │
+                               │ (semantic consistency)           │
+                               └──────────────────────────────────┘
+```
+
+## Contributor Quick Start
+
+> [!NOTE]
+> **Required Python Dependency**
+> This configuration management system requires the `ruamel.yaml` Python package.
+> It is used to safely modify YAML files while preserving comments, ordering,
+> and formatting. The workflow scripts will not function correctly without it.
+>
+> Install it via:
+> ```bash
+> pip install ruamel.yaml
+> ```
+
+### 1. Validate the current state
+
+Before making **any** config changes:
+```bash
+python tools/config_management/master_config_workflow_script.py --validate-only
+```
+
+This must pass.
+
+### 2. Editing an existing architecture (most common)
+
+Edit panel YAMLs **directly** under:
+```bash
+src/rocprof_compute_soc/analysis_configs/<arch>/
+```
+
+Rules:
+
+- Preserve structure
+- Preserve ordering
+- Use multiline `>-` formatting for metric descriptions
+- Do **not** regenerate entire files
+
+After editing:
+```bash
+python tools/config_management/master_config_workflow_script.py --validate-only
+```
+
+### 3. Generating or applying deltas (advanced / optional)
+
+For local experimentation only:
+```bash
+python tools/config_management/master_config_workflow_script.py --edit-existing
+```
+
+This mode:
+
+- never updates the template
+- never updates hashes
+- always re-validates after application
+
+### 4. Promoting a new latest architecture (rare, gated)
+
+Promotion changes **global invariants** and must use the master script:
+```bash
+python tools/config_management/master_config_workflow_script.py --promote <latest_arch>
 ```
 
 The script will:
-- Detect what changed
-- Prompt you for confirmation
-- Apply changes
-- Validate results
-- Update all necessary files
 
-### Dry Run Mode
+1. Update `latest_arch` in the template
+2. Regenerate deltas for all older arches
+3. Remove stale delta files
+4. Re-validate everything
+5. Rebuild the hash database
+6. Verify semantic consistency
 
-To see what would happen without making changes:
+If anything fails:
 
+- all changes are rolled back
+- no partial state remains
+
+### 5. Hash checks (fast local / CI)
 ```bash
-python master_config_workflow_script.py --dry-run
+python tools/config_management/master_config_workflow_script.py --hash-only
 ```
 
-## Usage Scenarios
-
-### Scenario A: Add Metrics to Latest Arch (gfx950)
-
-**Method 1: Direct Edit**
-
-1. Edit `src/rocprof_compute_soc/analysis_configs/gfx950/0700_wavefront.yaml`
-2. Add your metric to the appropriate table
-3. Add description to `metrics_description` section
-4. Run: `python master_config_workflow_script.py`
-5. Answer prompts
-
-**Method 2: Using Delta**
-
-1. Create `src/rocprof_compute_soc/analysis_configs/gfx950/config_delta/gfx955_diff.yaml`:
-```yaml
-Addition:
-  - Panel Config:
-      id: 700
-      title: Wavefront
-    metric_tables:
-      - metric_table:
-          id: 701
-          title: Wavefront Launch Stats
-          metrics:
-            - New Metric:
-                avg: AVG(something)
-                unit: Units
-    metric_descriptions:
-      New Metric:
-        plain: Description text
-        rst: >- # Optional
-          Description with :ref:`RST markup <link>`
-
-Deletion:
-  []
-
-Modification:
-  []
-```
-
-2. Run: `python master_config_workflow_script.py`
-
-**What Happens:**
-- Changes applied to gfx950
-- Template updated
-- Deltas regenerated for all previous archs (gfx940, gfx941, etc.)
-- Metric descriptions synced to:
-  - `tools/per_arch_metric_definitions/gfx950_metrics_description.yaml`
-  - `docs/data/metrics_description.yaml`
-- All archs validated
-- Hashes updated
-
-### Scenario B: Modify Metrics in Older Arch (gfx940)
-
-**Method 1: Direct Edit**
-
-1. Edit `src/rocprof_compute_soc/analysis_configs/gfx940/0700_wavefront.yaml`
-2. Make your changes
-3. Run: `python master_config_workflow_script.py`
-
-**Method 2: Using Delta**
-
-1. Create `src/rocprof_compute_soc/analysis_configs/gfx940/config_delta/gfx950_diff.yaml`
-2. Run: `python master_config_workflow_script.py`
-
-**What Happens:**
-- Changes applied to gfx940 only
-- Validated against template (must still match structure)
-- Metric descriptions synced to `tools/per_arch_metric_definitions/gfx940_metrics_description.yaml`
-- Hashes updated for gfx940 only
-
-### Scenario C: Add New Architecture (gfx955)
-
-**Method 1: Create Directory with YAMLs**
-
-1. Create `src/rocprof_compute_soc/analysis_configs/gfx955/`
-2. Copy/create YAML files
-3. Run: `python master_config_workflow_script.py`
-4. Confirm this is the new latest arch
-
-**Method 2: Using Delta from Latest**
-
-1. Create delta showing differences from gfx950
-2. Place in `src/rocprof_compute_soc/analysis_configs/gfx955/config_delta/gfx955_diff.yaml`
-3. Run: `python master_config_workflow_script.py`
-4. Confirm this is the new latest arch
-
-**What Happens:**
-- gfx955 becomes new latest arch
-- Template updated with gfx955 as source
-- Deltas generated: gfx955 → gfx950, gfx955 → gfx940, etc.
-- All archs validated
-- Metric descriptions synced
-- Hashes updated
-
-### Scenario D: Update Metric Descriptions Only
-
-1. Edit description in config YAML:
-```yaml
-metrics_description:
-  Grid Size: "Updated description text"
-```
-
-2. Run: `python master_config_workflow_script.py`
-
-**What Happens:**
-- Same workflow as metric changes
-- Plain text stored in config YAMLs
-- RST version generated and stored in docs/tools files
-
-## Delta YAML Structure
-
-### Complete Example
-
-```yaml
-Addition:
-  - Panel Config:
-      id: 1100
-      title: Compute Units - Compute Pipeline
-    metric_tables:
-      - metric_table:
-          id: 1103
-          title: Arithmetic Operations
-          metrics:
-            - F8 OPs:
-                avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
-                min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
-                max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
-                unit: (OPs + $normUnit)
-    metric_descriptions:
-      F8 OPs:
-        plain: Number of 8-bit floating point operations
-        rst: |-
-          Number of 8-bit floating point operations per :ref:`normalization unit <normalization-units>`"
-
-Deletion:
-  - Panel Config:
-      id: 1100
-      title: Compute Units - Compute Pipeline
-    metric_tables:
-      - metric_table:
-          id: 1103
-          title: Arithmetic Operations
-          metrics:
-            - Old Metric:
-                avg: AVG(something)
-    metric_descriptions:
-      Old Metric:
-        plain: "Old description"
-
-Modification:
-  - Panel Config:
-      id: 1100
-      title: Compute Units - Compute Pipeline
-    metric_tables:
-      - metric_table:
-          id: 1103
-          title: Arithmetic Operations
-          metrics:
-            - Existing Metric:
-                avg: AVG(new_formula)  # Changed field only
-    metric_descriptions:
-      Existing Metric:
-        plain: Updated description
-        rst: >-
-          Updated description with **RST**"
-```
-
-### Rules for Deltas
-
-1. **Must have all three sections**: Addition, Deletion, Modification (can be empty lists)
-2. **Metric descriptions**:
-   - `plain` field is required
-   - `rst` field is optional (defaults to copy of plain)
-3. **Delta filename**: Must be `<target_arch>_diff.yaml`
-4. **Location**: `src/rocprof_compute_soc/analysis_configs/gfx<arch>/config_delta/`
-
-## Standalone Tool Usage
-
-### Hash Manager
-
+or:
 ```bash
-# Compute hashes for all archs
-python hash_manager.py --compute-all src/rocprof_compute_soc/analysis_configs
-
-# Detect changes
-python hash_manager.py --detect-changes src/rocprof_compute_soc/analysis_configs
-
-# Update hashes for specific arch
-python hash_manager.py --update gfx950 src/rocprof_compute_soc/analysis_configs
+python tools/config_management/master_config_workflow_script.py --ci
 ```
 
-### Metric Description Manager
+This runs semantic hash validation only.
 
+## Automated Testing
+### Pytest Hash Integrity Test
+
+Located at:
 ```bash
-# Sync descriptions for specific arch
-python metric_description_manager.py --sync-arch gfx950 src/rocprof_compute_soc/analysis_configs --latest-arch gfx950
-
-# Sync all archs
-python metric_description_manager.py --sync-all src/rocprof_compute_soc/analysis_configs --latest-arch gfx950
-
-# Validate descriptions
-python metric_description_manager.py --validate gfx950 src/rocprof_compute_soc/analysis_configs
+tests/test_autogen_config.py
 ```
 
-### Parse Config Template
+This test:
 
+- parses `.config_hashes.json`
+- verifies **byte-for-byte** integrity of:
+  - panel YAMLs
+  - delta YAMLs
+- fails on:
+  - missing files
+  - changed content
+  - stale hash DB
+Semantic correctness is enforced separately by `hash_checker.py`.
+
+## Contributor Rules (Strict)
+
+- Do **not** edit `.config_hashes.json` manually
+- Do **not** create multiple delta files per arch
+- Do **not** rename delta files arbitrarily
+- Do **not** regenerate full YAMLs unnecessarily
+- Use in-place edits (ruamel round-trip)
+- Use the master script for promotions
+- Expect CI to reject inconsistent states
+
+## Summary
+
+This system guarantees:
+
+- A **single source of truth** for latest architecture
+- Deterministic, reviewable deltas
+- Stable diffs for Git review
+- Hash-backed immutability
+- Safe, transactional promotions
+- CI-enforced correctness
+
+All correctness flows through:
 ```bash
-# Generate template with metadata
-python parse_config_template.py src/rocprof_compute_soc/analysis_configs/gfx950 \
-    tools/config_management/analysis_config_template.yaml \
-    --latest-arch gfx950
-```
-
-### Generate Delta
-
-```bash
-# Generate delta from current arch to previous arch
-python generate_config_deltas.py \
-    src/rocprof_compute_soc/analysis_configs/gfx950 \
-    src/rocprof_compute_soc/analysis_configs/gfx940
-```
-
-### Apply Delta
-
-```bash
-# Apply delta to base arch
-python apply_config_deltas.py \
-    src/rocprof_compute_soc/analysis_configs/gfx940 \
-    src/rocprof_compute_soc/analysis_configs/gfx940/config_delta/gfx950_diff.yaml \
-    output_dir
-```
-
-### Verify Against Template
-
-```bash
-# Validate all archs
-python verify_against_config_template.py \
-    src/rocprof_compute_soc/analysis_configs \
-    tools/config_management/analysis_config_template.yaml
-```
-
-## File Structure
-
-```
-.
-├── src/rocprof_compute_soc/analysis_configs/
-│   ├── gfx940/
-│   │   ├── 0700_wavefront.yaml           # Config with plain descriptions
-│   │   └── config_delta/
-│   │       └── gfx950_diff.yaml          # Delta to apply changes
-│   ├── gfx941/
-│   └── gfx950/                           # Latest arch
-│       ├── 0700_wavefront.yaml
-│       └── config_delta/
-│           └── gfx950_diff.yaml          # Optional delta for modifications
-│
-├── tools/
-│   ├── config_management/
-│   │   ├── .config_hashes.json           # Hash database (auto-generated)
-│   │   ├── analysis_config_template.yaml # Template with metadata
-│   │   ├── hash_manager.py
-│   │   ├── metric_description_manager.py
-│   │   ├── parse_config_template.py
-│   │   ├── generate_config_deltas.py
-│   │   ├── apply_config_deltas.py
-│   │   ├── verify_against_config_template.py
-│   │   ├── master_config_workflow_script.py
-│   │   └── config_workflow.yaml
-│   │
-│   └── per_arch_metric_definitions/
-│       ├── gfx940_metrics_description.yaml  # RST only
-│       ├── gfx941_metrics_description.yaml
-│       └── gfx950_metrics_description.yaml
-│
-├── docs/data/
-│   └── metrics_description.yaml          # RST only, latest arch only
-│
-└── .backups/                             # Auto-generated backups
-    └── 20250115_143022/                  # Timestamped backup
-```
-
-## Configuration
-
-Edit `config_workflow.yaml` to customize paths and behavior:
-
-```yaml
-paths:
-  template: tools/config_management/analysis_config_template.yaml
-  configs_root: src/rocprof_compute_soc/analysis_configs
-  backups: .backups
-  hashes: tools/config_management/.config_hashes.json
-  per_arch_metrics: tools/per_arch_metric_definitions
-  docs_metrics: docs/data/metrics_description.yaml
-
-validation:
-  strict_mode: true              # Fail on warnings
-  verify_after_changes: true     # Validate after operations
-
-behavior:
-  require_confirmation: true     # Prompt before changes
-```
-
-## Error Handling
-
-### Validation Failures
-
-If validation fails:
-1. All changes are automatically reverted
-2. Backup is restored
-3. Detailed error report is printed
-4. Fix the issue and run again
-
-### Hash Mismatches
-
-If hashes are out of sync:
-```bash
-# Recompute all hashes
-python hash_manager.py --compute-all src/rocprof_compute_soc/analysis_configs
-```
-
-### Description Validation Errors
-
-Common issues:
-- **Missing descriptions**: Warning only (won't fail)
-- **Invalid RST syntax**: Error (will fail and revert)
-- **Missing plain text**: Error (plain is required)
-
-## Best Practices
-
-1. **Always use master_config_workflow_script.py** - Don't run individual scripts manually unless debugging
-2. **Test with --dry-run first** - See what will happen before committing
-3. **Use deltas for complex changes** - Easier to review and version control
-4. **Keep descriptions updated** - Plain text in configs, RST in docs
-5. **One change at a time** - If multiple archs need updates, do them sequentially
-6. **Check validation output** - Review warnings even if they don't fail
-
-## Troubleshooting
-
-### "No changes detected"
-
-- Check that files were actually modified
-- Ensure you're in the correct directory
-- Verify hash database exists: `tools/config_management/.config_hashes.json`
-
-### "Validation failed"
-
-- Review the error output carefully
-- Check that new metrics match template structure
-- Ensure panel IDs are correct
-- Verify data source ordering
-
-### "Failed to sync metric descriptions"
-
-- Check RST syntax in descriptions
-- Ensure all metrics have descriptions
-- Verify section_panel_map includes your table ID
-
-### Changes not detected after manual edit
-
-```bash
-# Force recompute hashes
-python hash_manager.py --compute-all src/rocprof_compute_soc/analysis_configs
-
-# Then run workflow
-python master_config_workflow_script.py
-```
-
-## Development Notes
-
-### Adding New Architecture Support
-
-When adding a completely new architecture line:
-
-1. Ensure table IDs are in `metric_description_manager.py`'s `SECTION_PANEL_MAP`
-2. Follow existing naming conventions (gfxXXX)
-3. Create complete YAML set (don't start with partial configs)
-
-### Modifying the Workflow
-
-If you need to modify the workflow behavior:
-
-1. Edit `config_workflow.yaml` for path/behavior changes
-2. Edit `master_config_workflow_script.py` for workflow logic changes
-3. Test with `--dry-run` extensively
-4. Update this README
-
-
-# Pre-commit: Hash Consistency Check
-
-We ship a lightweight pre-commit hook that catches inconsistent hash updates across config YAMLs and deltas.
-
-## What it enforces (per arch)
-
-* Latest panels changed → latest delta must change (if there are older archs).
-* Latest delta changed → latest panels must change or a new arch must be added.
-* Older arch panels changed → that arch’s delta must change.
-* Older arch delta changed → either latest panels or that arch’s panels must have changed.
-
-## Setup
-
-Install and enable pre-commit:
-
-```bash
-pip install pre-commit
-pre-commit install
-```
-
-Our .pre-commit-config.yaml includes a local hook that runs the checker.
-
-```yaml
-- repo: local
-  hooks:
-    - id: hash-check
-      name: Hash consistency check
-      entry: bash -lc 'cd projects/rocprofiler-compute && python3 tools/config_management/hash_checker.py'
-      language: system
-      pass_filenames: false
-      stages: [pre-commit]
-```
-
-## Run manually
-
-```bash
-# from super-repo root
-pre-commit run --all-files
-
-# or directly in the subproject
-cd projects/rocprofiler-compute
-python3 tools/config_management/hash_checker.py
+master_config_workflow_script.py
 ```
diff --git a/projects/rocprofiler-compute/tools/config_management/apply_config_deltas.py b/projects/rocprofiler-compute/tools/config_management/apply_config_deltas.py
index f96fba1650..d9df6672a5 100644
--- a/projects/rocprofiler-compute/tools/config_management/apply_config_deltas.py
+++ b/projects/rocprofiler-compute/tools/config_management/apply_config_deltas.py
@@ -36,24 +36,14 @@ import sys
 from pathlib import Path
 from typing import Any, Optional, Union
 
-try:
-    from . import utils as cm_utils
-except Exception:
-    repo_root = Path(__file__).resolve().parents[1]
-    if str(repo_root) not in sys.path:
-        sys.path.insert(0, str(repo_root))
-    try:
-        import config_management.utils as cm_utils  # type: ignore
-    except Exception:
-        import utils as cm_utils  # type: ignore
+PROJECT_ROOT = Path(__file__).resolve().parents[1]
+if str(PROJECT_ROOT) not in sys.path:
+    sys.path.insert(0, str(PROJECT_ROOT))
 
-AUTOGEN_TEXT = (
-    "# AUTOGENERATED FILE. Only edit for testing purposes, not for development. "
-    "Generated by tools/config_management/apply_config_deltas.py\n"
-)
+from config_management import utils_ruamel as cm_utils  # noqa: E402
 
 
-def find_table_in_config(config: dict, table_id: Any) -> Optional[dict]:
+def find_table(config: dict, table_id: Any) -> Optional[dict]:
     """Find and return the table with given id, or None."""
     for item in config.get("Panel Config", {}).get("data source", []):
         table = item.get("metric_table")
@@ -72,8 +62,8 @@ def add_table(config: dict, metric_table: dict) -> None:
 
 def add_metrics(config: dict, table_id: Any, metrics: list[dict]) -> None:
     """Add metrics to existing table."""
-    table = find_table_in_config(config, table_id)
-    if not table:
+    table = find_table(config, table_id)
+    if table is None:
         print(f"WARNING: Table {table_id} not found for metric addition")
         return
 
@@ -90,7 +80,7 @@ def delete_table(config: dict, table_id: Any) -> None:
     for idx, item in enumerate(list(data_source)):
         table = item.get("metric_table")
         if isinstance(table, dict) and table.get("id") == table_id:
-            data_source.pop(idx)
+            del data_source[idx]
             print(f"Deleted table: {table_id}")
             return
     print(f"WARNING: Table {table_id} not found for deletion")
@@ -98,8 +88,8 @@ def delete_table(config: dict, table_id: Any) -> None:
 
 def delete_metrics(config: dict, table_id: Any, metrics: list[dict]) -> None:
     """Remove specific metrics from table."""
-    table = find_table_in_config(config, table_id)
-    if not table or "metric" not in table:
+    table = find_table(config, table_id)
+    if table is None or "metric" not in table:
         print(f"WARNING: Table {table_id} not found or has no metrics")
         return
 
@@ -112,8 +102,8 @@ def delete_metrics(config: dict, table_id: Any, metrics: list[dict]) -> None:
 
 def modify_metrics(config: dict, table_id: Any, metrics: list[dict]) -> None:
     """Modify specific fields in existing metrics."""
-    table = find_table_in_config(config, table_id)
-    if not table or "metric" not in table:
+    table = find_table(config, table_id)
+    if table is None or "metric" not in table:
         print(f"WARNING: Table {table_id} not found or has no metrics")
         return
 
@@ -129,19 +119,17 @@ def modify_metrics(config: dict, table_id: Any, metrics: list[dict]) -> None:
 
 def add_descriptions(config: dict, descriptions: dict) -> None:
     """Add metric descriptions to config."""
-    pc = config.setdefault("Panel Config", {})
-    pc.setdefault("metrics_description", {})
-    md = pc["metrics_description"]
+    md = config["Panel Config"].setdefault("metrics_description", {})
 
     for metric_name, desc_data in descriptions.items():
-        value = desc_data if isinstance(desc_data, dict) else desc_data
-        md[metric_name] = value
+        md[metric_name] = dict(desc_data) if isinstance(desc_data, dict) else desc_data
+
         print(f"Added description: {metric_name}")
 
 
 def delete_descriptions(config: dict, descriptions: dict) -> None:
     """Remove metric descriptions from config."""
-    md = config.get("Panel Config", {}).get("metrics_description", {})
+    md = config["Panel Config"].setdefault("metrics_description", {})
     for metric_name in descriptions.keys():
         if metric_name in md:
             del md[metric_name]
@@ -150,21 +138,25 @@ def delete_descriptions(config: dict, descriptions: dict) -> None:
 
 def modify_descriptions(config: dict, descriptions: dict) -> None:
     """Modify metric descriptions in config."""
-    pc = config.setdefault("Panel Config", {})
-    pc.setdefault("metrics_description", {})
-    md = pc["metrics_description"]
+    md = config["Panel Config"].setdefault("metrics_description", {})
 
     for metric_name, desc_data in descriptions.items():
-        value = desc_data if isinstance(desc_data, dict) else desc_data
-        md[metric_name] = value
+        if isinstance(desc_data, dict):
+            new_dict = {}
+            for k, v in desc_data.items():
+                new_dict[k] = v
+            md[metric_name] = new_dict
+        else:
+            md[metric_name] = desc_data
+
         print(f"Added description: {metric_name}")
 
 
 def apply_changes(config: dict, changes: list[dict], category: str) -> None:
     """Apply delta changes to configuration."""
     for change in changes:
-        for mt_wrapper in change.get("metric_tables", []):
-            mt = mt_wrapper.get("metric_table", mt_wrapper)
+        mt = change.get("metric_table")
+        if mt:
             table_id = mt.get("id")
 
             if category == "Addition":
@@ -199,7 +191,7 @@ def apply_delta(
     output_dir: Union[str, Path],
 ) -> None:
     """Apply delta YAML to all files in base directory."""
-    delta = cm_utils.load_yaml(delta_file)
+    delta = cm_utils.load_yaml(delta_file, round_trip=True)
     output_path = Path(output_dir)
     output_path.mkdir(parents=True, exist_ok=True)
 
@@ -214,7 +206,7 @@ def apply_delta(
 
     base_path = Path(base_dir)
     for yaml_file in base_path.glob("*.yaml"):
-        config = cm_utils.load_yaml(yaml_file)
+        config = cm_utils.load_yaml(yaml_file, round_trip=True)
         panel_id = config.get("Panel Config", {}).get("id")
 
         if panel_id in changes_by_panel:
@@ -226,7 +218,8 @@ def apply_delta(
                         config, changes_by_panel[panel_id][category], category
                     )
 
-            cm_utils.save_yaml(config, output_path / yaml_file.name, AUTOGEN_TEXT)
+            cm_utils.strip_existing_header(config)
+            cm_utils.save_yaml(config, output_path / yaml_file.name)
             print(f"Saved: {yaml_file.name}")
         else:
             shutil.copy(yaml_file, output_path / yaml_file.name)
diff --git a/projects/rocprofiler-compute/tools/config_management/config_workflow.yaml b/projects/rocprofiler-compute/tools/config_management/config_workflow.yaml
index b6649e540d..760a4ef370 100644
--- a/projects/rocprofiler-compute/tools/config_management/config_workflow.yaml
+++ b/projects/rocprofiler-compute/tools/config_management/config_workflow.yaml
@@ -11,7 +11,7 @@ paths:
   backups: .backups
 
   # Hash database file
-  hashes: tools/config_management/.config_hashes.json
+  hashes: src/utils/.config_hashes.json
 
   # Per-arch metric definitions output
   per_arch_metrics: tools/per_arch_metric_definitions
diff --git a/projects/rocprofiler-compute/tools/config_management/generate_config_deltas.py b/projects/rocprofiler-compute/tools/config_management/generate_config_deltas.py
index 7fcaccc044..112aa9bbd5 100644
--- a/projects/rocprofiler-compute/tools/config_management/generate_config_deltas.py
+++ b/projects/rocprofiler-compute/tools/config_management/generate_config_deltas.py
@@ -34,326 +34,265 @@ from __future__ import annotations
 
 import sys
 from pathlib import Path
+from typing import Any, Optional
 
-try:
-    from . import utils as cm_utils
-except Exception:
-    repo_root = Path(__file__).resolve().parents[1]
-    if str(repo_root) not in sys.path:
-        sys.path.insert(0, str(repo_root))
-    try:
-        import config_management.utils as cm_utils  # type: ignore
-    except Exception:
-        import utils as cm_utils  # type: ignore
+PROJECT_ROOT = Path(__file__).resolve().parents[1]
+if str(PROJECT_ROOT) not in sys.path:
+    sys.path.insert(0, str(PROJECT_ROOT))
 
-AUTOGEN_TEXT = (
-    "# AUTOGENERATED FILE. Only edit for testing purposes, not for development. "
-    "Generated by tools/config_management/generate_config_deltas.py\n"
-)
+from config_management import utils_ruamel as cm_utils  # noqa: E402
+from ruamel.yaml.comments import CommentedMap  # noqa: E402
 
 
-def get_metric_tables(data: dict) -> list[dict]:
-    """Extract all metric tables from data source."""
-    tables: list[dict] = []
-    for item in data.get("Panel Config", {}).get("data source", []):
-        mt = item.get("metric_table")
-        if isinstance(mt, dict):
-            tables.append(mt)
-    return tables
+def load_yaml_roundtrip(path: Path) -> Any:
+    return cm_utils.load_yaml(path, round_trip=True)
 
 
-def get_metric_descriptions(data: dict) -> dict:
-    """Extract metric descriptions from panel config."""
-    return data.get("Panel Config", {}).get("metrics_description", {}) or {}
+def diff_metric_fields(base_fields, new_fields) -> Optional[CommentedMap]:
+    """Return fields of new_fields that are new or changed vs base_fields, or None."""
+    out = CommentedMap()
+    for key in new_fields:
+        if key not in base_fields or base_fields[key] != new_fields[key]:
+            # Preserve the original value with comments
+            out[key] = new_fields[key]
+
+    return out if out else None
 
 
-def compare_metrics(
-    prev_metrics: dict, curr_metrics: dict
+def descriptions_equal(base_desc: Any, new_desc: Any) -> bool:
+    """Check if two descriptions are equal by comparing their string representation."""
+    return str(base_desc) == str(new_desc)
+
+
+def diff_metric_table(
+    base_table, new_table
 ) -> tuple[list[dict], list[dict], list[dict]]:
-    """Compare metrics and return (additions, deletions, modifications)."""
-    prev_keys = set(prev_metrics.keys())
-    curr_keys = set(curr_metrics.keys())
-
-    additions = [{name: curr_metrics[name]} for name in sorted(curr_keys - prev_keys)]
-    deletions = [{name: prev_metrics[name]} for name in sorted(prev_keys - curr_keys)]
-
-    modifications: list[dict] = []
-    for name in sorted(prev_keys & curr_keys):
-        if prev_metrics[name] != curr_metrics[name]:
-            all_fields = set(prev_metrics[name].keys()) | set(curr_metrics[name].keys())
-            modified_fields = {
-                field: curr_metrics[name].get(field)
-                for field in all_fields
-                if prev_metrics[name].get(field) != curr_metrics[name].get(field)
-            }
-            if modified_fields:
-                modifications.append({name: modified_fields})
-
-    return additions, deletions, modifications
-
-
-def compare_descriptions(
-    prev_descriptions: dict, curr_descriptions: dict
-) -> tuple[dict, dict, dict]:
     """
-    Compare metric descriptions and return (additions, deletions, modifications).
-    Values are dicts with 'plain' and 'rst'.
+    Returns (additions, modifications, deletions) tuple.
     """
-    prev_keys = set(prev_descriptions.keys())
-    curr_keys = set(curr_descriptions.keys())
+    addition_metrics: list[dict] = []
+    modification_metrics: list[dict] = []
+    deletion_metrics: list[dict] = []
 
-    additions: dict = {}
-    deletions: dict = {}
-    modifications: dict = {}
+    base_metrics = base_table.get("metric", {})
+    new_metrics = new_table.get("metric", {})
 
-    for name in sorted(curr_keys - prev_keys):
-        desc = curr_descriptions[name]
-        additions[name] = (
-            desc if isinstance(desc, dict) else {"plain": desc, "rst": desc}
-        )
+    # Metrics deleted
+    for metric in base_metrics:
+        if metric not in new_metrics:
+            deletion_metrics.append({metric: None})
 
-    for name in sorted(prev_keys - curr_keys):
-        desc = prev_descriptions[name]
-        deletions[name] = (
-            desc if isinstance(desc, dict) else {"plain": desc, "rst": desc}
-        )
+    # Metrics added or modified
+    for metric in new_metrics:
+        if metric not in base_metrics:
+            # Entire metric is new - preserve original with comments
+            addition_metrics.append({metric: new_metrics[metric]})
+        else:
+            # Field-level diff
+            changes = diff_metric_fields(base_metrics[metric], new_metrics[metric])
+            if changes:
+                modification_metrics.append({metric: changes})
 
-    for name in sorted(prev_keys & curr_keys):
-        prev_desc = prev_descriptions[name]
-        curr_desc = curr_descriptions[name]
-
-        prev_plain = (
-            prev_desc if isinstance(prev_desc, str) else prev_desc.get("plain", "")
-        )
-        curr_plain = (
-            curr_desc if isinstance(curr_desc, str) else curr_desc.get("plain", "")
-        )
-
-        prev_rst = (
-            prev_desc
-            if isinstance(prev_desc, str)
-            else prev_desc.get("rst", prev_plain)
-        )
-        curr_rst = (
-            curr_desc
-            if isinstance(curr_desc, str)
-            else curr_desc.get("rst", curr_plain)
-        )
-
-        if prev_plain != curr_plain or prev_rst != curr_rst:
-            modifications[name] = {"plain": curr_plain, "rst": curr_rst}
-
-    return additions, deletions, modifications
+    return addition_metrics, modification_metrics, deletion_metrics
 
 
-def compare_tables(
-    prev_tables: list[dict], curr_tables: list[dict]
-) -> tuple[list[dict], list[dict], list[dict]]:
-    """Compare tables and return (additions, deletions, modifications)."""
-    prev_dict = {t["id"]: t for t in prev_tables}
-    curr_dict = {t["id"]: t for t in curr_tables}
+def diff_descriptions(
+    base_md, new_md
+) -> tuple[Optional[CommentedMap], Optional[CommentedMap], Optional[CommentedMap]]:
+    """
+    Returns (additions, modifications, deletions) tuple.
+    """
+    additions = CommentedMap()
+    modifications = CommentedMap()
+    deletions = CommentedMap()
 
-    prev_ids = set(prev_dict.keys())
-    curr_ids = set(curr_dict.keys())
+    # Deletions
+    for key in base_md:
+        if key not in new_md:
+            deletions[key] = None
 
-    additions: list[dict] = []
-    deletions: list[dict] = []
-    modifications: list[dict] = []
+    # Additions and modifications
+    for key in new_md:
+        if key not in base_md:
+            # New description - preserve original node
+            additions[key] = new_md[key]
+        else:
+            # Check if modified
+            if not descriptions_equal(base_md[key], new_md[key]):
+                # Preserve original node to maintain style
+                modifications[key] = new_md[key]
 
-    additions.extend(curr_dict[tid] for tid in sorted(curr_ids - prev_ids))
-    deletions.extend(prev_dict[tid] for tid in sorted(prev_ids - curr_ids))
+    return (
+        additions if additions else None,
+        modifications if modifications else None,
+        deletions if deletions else None,
+    )
 
-    for tid in sorted(prev_ids & curr_ids):
-        prev_metrics = prev_dict[tid].get("metric", {}) or {}
-        curr_metrics = curr_dict[tid].get("metric", {}) or {}
 
-        metric_adds, metric_dels, metric_mods = compare_metrics(
-            prev_metrics, curr_metrics
-        )
+def extract_metric_tables(data_sources) -> list[tuple[Any, Any]]:
+    out: list[tuple[Any, Any]] = []
+    for ds in data_sources:
+        if "metric_table" in ds:
+            mt = ds["metric_table"]
+            table_id = mt.get("id")
+            if table_id is not None:
+                out.append((table_id, mt))
+    return out
 
-        if metric_adds:
-            additions.append({
-                "id": tid,
-                "title": curr_dict[tid].get("title"),
-                "metrics": metric_adds,
-            })
-        if metric_dels:
-            deletions.append({
-                "id": tid,
-                "title": prev_dict[tid].get("title"),
-                "metrics": metric_dels,
-            })
-        if metric_mods:
-            modifications.append({
-                "id": tid,
-                "title": curr_dict[tid].get("title"),
-                "metrics": metric_mods,
+
+def diff_panel(base_config, new_config) -> Optional[dict[str, list[Any]]]:
+    """
+    Produce delta for a single panel.
+    Returns dicts under keys:
+       'Addition', 'Deletion', 'Modification'
+    or None if no diffs.
+    """
+    out = {"Addition": [], "Deletion": [], "Modification": []}
+    panel_id = base_config["Panel Config"]["id"]
+
+    # Table-level diffs
+    base_tables = extract_metric_tables(
+        base_config["Panel Config"].get("data source", [])
+    )
+    new_tables = extract_metric_tables(
+        new_config["Panel Config"].get("data source", [])
+    )
+
+    # Indexing by table ID to preserve order
+    base_by_id = {tid: table for (tid, table) in base_tables}
+    new_by_id = {tid: table for (tid, table) in new_tables}
+
+    # Table deletions (singular "metric_table" key, as read by apply_changes)
+    for tid in base_by_id:
+        if tid not in new_by_id:
+            out["Deletion"].append({
+                "Panel Config": {"id": panel_id},
+                "metric_table": {"id": tid},
             })
 
-    return additions, deletions, modifications
-
-
-def format_metric_fields(metric_data: dict) -> list[str]:
-    """Format metric fields as YAML lines."""
-    lines: list[str] = []
-    for field_name, field_value in metric_data.items():
-        if isinstance(field_value, str) and (
-            "\n" in field_value or len(field_value) > 80
-        ):
-            lines.append(f"                {field_name}: |")
-            lines.extend(
-                f"                  {line}" for line in field_value.split("\n")
+    # Table additions + modifications
+    for tid in new_by_id:
+        if tid not in base_by_id:
+            # Entire table is added - preserve original node under "metric_table"
+            out["Addition"].append({
+                "Panel Config": {"id": panel_id},
+                "metric_table": new_by_id[tid],
+            })
+        else:
+            # Check metric-level diffs
+            additions, modifications, deletions = diff_metric_table(
+                base_by_id[tid], new_by_id[tid]
             )
-        else:
-            lines.append(f"                {field_name}: {field_value}")
-    return lines
+
+            if deletions:
+                out["Deletion"].append({
+                    "Panel Config": {"id": panel_id},
+                    "metric_table": {"id": tid, "metrics": deletions},
+                })
+
+            if additions:
+                out["Addition"].append({
+                    "Panel Config": {"id": panel_id},
+                    "metric_table": {"id": tid, "metrics": additions},
+                })
+
+            if modifications:
+                out["Modification"].append({
+                    "Panel Config": {"id": panel_id},
+                    "metric_table": {"id": tid, "metrics": modifications},
+                })
+
+    # Description diffs
+    base_md = base_config["Panel Config"].get("metrics_description", {})
+    new_md = new_config["Panel Config"].get("metrics_description", {})
+    desc_additions, desc_modifications, desc_deletions = diff_descriptions(
+        base_md, new_md
+    )
+
+    if desc_deletions:
+        out["Deletion"].append({
+            "Panel Config": {"id": panel_id},
+            "metric_descriptions": desc_deletions,
+        })
+
+    if desc_additions:
+        out["Addition"].append({
+            "Panel Config": {"id": panel_id},
+            "metric_descriptions": desc_additions,
+        })
+
+    if desc_modifications:
+        out["Modification"].append({
+            "Panel Config": {"id": panel_id},
+            "metric_descriptions": desc_modifications,
+        })
+
+    # Clean empties
+    if not out["Addition"]:
+        del out["Addition"]
+    if not out["Deletion"]:
+        del out["Deletion"]
+    if not out["Modification"]:
+        del out["Modification"]
+
+    return out if out else None
 
 
-def format_description_fields(desc_data: dict) -> list[str]:
-    """Format description fields as YAML lines."""
-    lines: list[str] = []
-    for field_name, field_value in desc_data.items():
-        if isinstance(field_value, str) and (
-            "\n" in field_value or len(field_value) > 80
-        ):
-            lines.append(f"          {field_name}: |")
-            lines.extend(f"            {line}" for line in field_value.split("\n"))
-        else:
-            lines.append(f"          {field_name}: {field_value}")
-    return lines
+def generate_arch_delta(base_dir: Path, new_dir: Path) -> CommentedMap:
+    """
+    Compare all YAML files panel-by-panel.
+    """
+    out = CommentedMap()
+    out["Addition"] = []
+    out["Deletion"] = []
+    out["Modification"] = []
 
-
-def format_output(combined_diff: dict) -> str:
-    """Format the diff dictionary into a YAML string."""
-    lines: list[str] = []
-    for category in ("Addition", "Deletion", "Modification"):
-        lines.append(f"{category}:")
-        if not combined_diff.get(category):
-            lines.append("  []")
-            lines.append("")
+    base_files = sorted(base_dir.glob("*.yaml"))
+    for base_file in base_files:
+        new_file = new_dir / base_file.name
+        if not new_file.exists():
             continue
 
-        for panel_item in combined_diff[category]:
-            pc = panel_item["panel_config"]
-            lines.extend([
-                "  - Panel Config:",
-                f"      id: {pc['id']}",
-                f"      title: {pc['title']}",
-            ])
+        base_config = load_yaml_roundtrip(base_file)
+        new_config = load_yaml_roundtrip(new_file)
 
-            if panel_item.get("metric_tables"):
-                lines.append("    metric_tables:")
-                for mt in panel_item["metric_tables"]:
-                    lines.extend([
-                        "      - metric_table:",
-                        f"          id: {mt['id']}",
-                        f"          title: {mt['title']}",
-                        "          metrics:",
-                    ])
-                    metrics_to_format = mt.get("metrics") or [
-                        {name: data} for name, data in (mt.get("metric") or {}).items()
-                    ]
-                    for metric in metrics_to_format:
-                        for metric_name, metric_data in metric.items():
-                            lines.append(f"            - {metric_name}:")
-                            lines.extend(format_metric_fields(metric_data))
+        diff = diff_panel(base_config, new_config)
+        if not diff:
+            continue
 
-            if panel_item.get("metric_descriptions"):
-                lines.append("    metric_descriptions:")
-                for metric_name, desc_data in panel_item["metric_descriptions"].items():
-                    lines.append(f"      {metric_name}:")
-                    lines.extend(format_description_fields(desc_data))
+        if "Addition" in diff:
+            out["Addition"].extend(diff["Addition"])
+        if "Deletion" in diff:
+            out["Deletion"].extend(diff["Deletion"])
+        if "Modification" in diff:
+            out["Modification"].extend(diff["Modification"])
 
-        lines.append("")
-    return "\n".join(lines)
+    # Strip empty categories
+    if not out["Addition"]:
+        del out["Addition"]
+    if not out["Deletion"]:
+        del out["Deletion"]
+    if not out["Modification"]:
+        del out["Modification"]
+
+    return out
 
 
 def main() -> None:
-    if len(sys.argv) != 3:
-        print("Usage: python generate_config_deltas.py <curr_arch_dir> <prev_arch_dir>")
-        sys.exit(1)
-
-    curr_arch_dir = Path(sys.argv[1])
-    prev_arch_dir = Path(sys.argv[2])
-
-    if not curr_arch_dir.is_dir() or not prev_arch_dir.is_dir():
-        print("Error: Both arguments must be directories")
-        sys.exit(1)
-
-    curr_files = {f.name for f in curr_arch_dir.glob("*.yaml")}
-    prev_files = {f.name for f in prev_arch_dir.glob("*.yaml")}
-    common_files = curr_files & prev_files
-
-    if not common_files:
-        print("Error: No common YAML files found")
-        sys.exit(1)
-
-    print(f"Comparing {len(common_files)} files...")
-
-    combined_diff = {"Addition": [], "Deletion": [], "Modification": []}
-
-    for filename in sorted(common_files):
-        curr_data = cm_utils.load_yaml(curr_arch_dir / filename)
-        prev_data = cm_utils.load_yaml(prev_arch_dir / filename)
-
-        curr_pc = curr_data.get("Panel Config", {}) or {}
-        prev_pc = prev_data.get("Panel Config", {}) or {}
-
-        curr_tables = get_metric_tables(curr_data)
-        prev_tables = get_metric_tables(prev_data)
-
-        curr_descriptions = get_metric_descriptions(curr_data)
-        prev_descriptions = get_metric_descriptions(prev_data)
-
-        table_adds, table_dels, table_mods = compare_tables(prev_tables, curr_tables)
-        desc_adds, desc_dels, desc_mods = compare_descriptions(
-            prev_descriptions, curr_descriptions
+    if len(sys.argv) != 4:
+        print(
+            "Usage: python generate_config_deltas.py <base_arch_dir> <new_arch_dir> <output_delta_yaml>"  # noqa: E501
         )
+        sys.exit(1)
 
-        if table_adds or desc_adds:
-            entry = {
-                "panel_config": {"id": curr_pc.get("id"), "title": curr_pc.get("title")}
-            }
-            if table_adds:
-                entry["metric_tables"] = table_adds
-            if desc_adds:
-                entry["metric_descriptions"] = desc_adds
-            combined_diff["Addition"].append(entry)
+    base_dir = Path(sys.argv[1])
+    new_dir = Path(sys.argv[2])
+    out_file = Path(sys.argv[3])
 
-        if table_dels or desc_dels:
-            entry = {
-                "panel_config": {"id": prev_pc.get("id"), "title": prev_pc.get("title")}
-            }
-            if table_dels:
-                entry["metric_tables"] = table_dels
-            if desc_dels:
-                entry["metric_descriptions"] = desc_dels
-            combined_diff["Deletion"].append(entry)
+    delta = generate_arch_delta(base_dir, new_dir)
 
-        if table_mods or desc_mods:
-            entry = {
-                "panel_config": {"id": curr_pc.get("id"), "title": curr_pc.get("title")}
-            }
-            if table_mods:
-                entry["metric_tables"] = table_mods
-            if desc_mods:
-                entry["metric_descriptions"] = desc_mods
-            combined_diff["Modification"].append(entry)
-
-    output = AUTOGEN_TEXT + format_output(combined_diff)
-
-    print("\n" + "=" * 80)
-    print("COMBINED DIFF OUTPUT:")
-    print("=" * 80)
-    print(output)
-
-    output_dir = prev_arch_dir / "config_delta"
-    output_dir.mkdir(exist_ok=True)
-    output_file = output_dir / f"{curr_arch_dir.name}_diff.yaml"
-    with open(output_file, "w") as f:
-        f.write(output)
-
-    print(f"\nDiff written to: {output_file}")
+    cm_utils.save_yaml(delta, out_file)
+    print(f"Delta generated at: {out_file}")
 
 
 if __name__ == "__main__":
diff --git a/projects/rocprofiler-compute/tools/config_management/hash_manager.py b/projects/rocprofiler-compute/tools/config_management/hash_manager.py
index 5c93534986..4823b81e08 100644
--- a/projects/rocprofiler-compute/tools/config_management/hash_manager.py
+++ b/projects/rocprofiler-compute/tools/config_management/hash_manager.py
@@ -43,7 +43,7 @@ import sys
 from pathlib import Path
 from typing import Optional
 
-DEFAULT_HASH_DB = "tools/config_management/.config_hashes.json"
+DEFAULT_HASH_DB = "src/utils/.config_hashes.json"
 
 
 def compute_file_hash(filepath: Path) -> str:
diff --git a/projects/rocprofiler-compute/tools/config_management/master_config_workflow_script.py b/projects/rocprofiler-compute/tools/config_management/master_config_workflow_script.py
index 50d61b7b48..b02ab84dc4 100644
--- a/projects/rocprofiler-compute/tools/config_management/master_config_workflow_script.py
+++ b/projects/rocprofiler-compute/tools/config_management/master_config_workflow_script.py
@@ -24,991 +24,262 @@
 
 ##############################################################################
 
-"""
-Master workflow script for managing architecture configurations.
-- Detects changes
-- Handles direct edits and delta files
-- Supports promoting a NEW arch from:
-    (A) direct edits to latest, or
-    (B) a delta YAML targeting latest
-- Validates, syncs metric descriptions, and updates hashes
-
-"""
-
-from __future__ import annotations
-
 import argparse
 import shutil
 import subprocess
 import sys
-from datetime import datetime
+import time
 from pathlib import Path
-from typing import Optional
 
-try:
-    from . import hash_manager, metric_description_manager
-except Exception:
-    repo_root = Path(__file__).resolve().parents[1]  # repo root
-    if str(repo_root) not in sys.path:
-        sys.path.insert(0, str(repo_root))
-    import config_management.hash_manager as hash_manager  # type: ignore
-    import config_management.metric_description_manager as metric_description_manager  # type: ignore
+SCRIPT_DIR = Path(__file__).resolve().parent
+# SCRIPT_DIR -> .../rocprofiler-compute/tools/config_management
 
-import yaml
+REPO_ROOT = SCRIPT_DIR.parents[1]
+# REPO_ROOT -> .../rocprofiler-compute (two levels above SCRIPT_DIR)
 
-# =============================================================================
-# CONFIG
-# =============================================================================
+TOOLS_DIR = SCRIPT_DIR  # base directory for the helper script paths below
 
-CONFIG_FILE = "config_workflow.yaml"
+SOC_ROOT = REPO_ROOT / "src" / "rocprof_compute_soc"
+ANALYSIS_CONFIGS = SOC_ROOT / "analysis_configs"
 
-DEFAULT_CONFIG: dict = {
-    "paths": {
-        "template": "tools/config_management/gfx9_config_template.yaml",
-        "configs_root": "src/rocprof_compute_soc/analysis_configs",
-        "backups": ".backups",
-        "hashes": "tools/config_management/.config_hashes.json",
-        "per_arch_metrics": "tools/per_arch_metric_definitions",
-        "docs_metrics": "docs/data/metrics_description.yaml",
-    },
-    "validation": {"strict_mode": True, "verify_after_changes": True},
-    "behavior": {"require_confirmation": True},
-}
+TEMPLATE_FILE = ANALYSIS_CONFIGS / "gfx9_config_template.yaml"
+HASH_JSON = REPO_ROOT / "src" / "utils" / ".config_hashes.json"
+BACKUP_DIR = SCRIPT_DIR / "backups"  # parent of the dirs created by backup()
+
+PYTHON = sys.executable  # interpreter running this script
+
+VERIFY_SCRIPT = TOOLS_DIR / "verify_against_config_template.py"
+PARSE_TEMPLATE_SCRIPT = TOOLS_DIR / "parse_config_template.py"
+GENERATE_DELTAS_SCRIPT = TOOLS_DIR / "generate_config_deltas.py"
+APPLY_DELTAS_SCRIPT = TOOLS_DIR / "apply_config_deltas.py"
+HASH_CHECKER_SCRIPT = REPO_ROOT / "src" / "utils" / "hash_checker.py"  # not in tools/
+HASH_MANAGER_SCRIPT = TOOLS_DIR / "hash_manager.py"
 
 
-# =============================================================================
-# UTILITIES
-# =============================================================================
+def run(cmd):  # echo cmd, execute it with cwd=REPO_ROOT, return its exit code
+    print("\n$", " ".join(map(str, cmd)))
+    return subprocess.run(cmd, cwd=str(REPO_ROOT)).returncode
 
 
-def load_config() -> dict:
-    """Load config from CONFIG_FILE with a shallow merge onto DEFAULT_CONFIG."""
-    p = Path(CONFIG_FILE)
-    if not p.exists():
-        return DEFAULT_CONFIG
-    with open(p) as f:
-        user = yaml.safe_load(f) or {}
-    merged = DEFAULT_CONFIG.copy()
-    for k, v in user.items():
-        if isinstance(v, dict) and isinstance(merged.get(k), dict):
-            merged[k] = {**merged[k], **v}
+def fatal(msg):  # print a FATAL line, then terminate with exit status 1
+    print(f"\nFATAL: {msg}")
+    sys.exit(1)
+
+
+def confirm(prompt):  # True only for an explicit "y"/"yes"; anything else is no
+    reply = input(f"{prompt} [y/N]: ").strip().lower()
+    return reply in ("y", "yes")
+
+
+def backup(paths):  # copy each existing path into a new backup dir; return that dir
+    BACKUP_DIR.mkdir(exist_ok=True)
+    backup_path = BACKUP_DIR / f"backup_{time.time_ns()}"  # ns stamp, not seconds
+    backup_path.mkdir()  # no exist_ok: an existing backup is never reused
+
+    for p in paths:
+        if not p.exists():
+            continue
+        dest = backup_path / p.name
+        if p.is_dir():
+            shutil.copytree(p, dest)
         else:
-            merged[k] = v
-    return merged
+            shutil.copy2(p, dest)
 
-
-def create_backup(source_paths: list[str], backup_dir: str) -> Path:
-    """Create a timestamped backup of the provided paths."""
-    ts = datetime.now().strftime("%Y%m%d_%H%M%S_%f")  # add microseconds
-    base = Path(backup_dir)
-    base.mkdir(parents=True, exist_ok=True)
-    backup_path = base / ts
-
-    # Fallback suffix if somehow collides
-    i = 1
-    while backup_path.exists():
-        backup_path = base / f"{ts}_{i}"
-        i += 1
-
-    print(f"Creating backup: {backup_path}")
-    for s in source_paths:
-        sp = Path(s)
-        dst = backup_path / sp.name
-        if sp.is_dir():
-            shutil.copytree(sp, dst)
-        elif sp.is_file():
-            dst.parent.mkdir(parents=True, exist_ok=True)
-            shutil.copy2(sp, dst)
+    print(f"\nBackup created at {backup_path}")
     return backup_path
 
 
-def restore_backup(backup_path: Path, target_paths: list[str]) -> None:
-    """Restore files/dirs from a given backup path."""
-    print(f"Restoring from backup: {backup_path}")
-    for t in target_paths:
-        tp = Path(t)
-        bp = backup_path / tp.name
-        if not bp.exists():
+def restore(backup_path, paths):  # put back each path that has a copy in backup_path
+    print("\nRestoring from backup...")
+    for p in paths:
+        src = backup_path / p.name  # backup() stored each entry under its base name
+        if not src.exists():
             continue
-        if tp.is_dir():
-            shutil.rmtree(tp, ignore_errors=True)
-        elif tp.exists():
-            tp.unlink()
-        if bp.is_dir():
-            shutil.copytree(bp, tp)
+        if p.exists():  # clear the current target first; copytree needs it absent
+            if p.is_dir():
+                shutil.rmtree(p)
+            else:
+                p.unlink()
+        if src.is_dir():
+            shutil.copytree(src, p)
         else:
-            shutil.copy2(bp, tp)
-    print("Backup restored")
+            shutil.copy2(src, p)
+    print("Restore complete.")
 
 
-def cleanup_old_backups(backup_dir: str) -> None:
-    """Keep latest backup, remove older ones."""
-    b = Path(backup_dir)
-    if not b.exists():
-        return
-    dirs = sorted([d for d in b.iterdir() if d.is_dir()])
-    for old in dirs[:-1]:
-        shutil.rmtree(old, ignore_errors=True)
-        print(f"Removed old backup: {old.name}")
-
-
-def prompt_yes_no(question: str, default: Optional[bool] = None) -> bool:
-    """Ask a yes/no question in the terminal."""
-    if default is None:
-        prompt = f"{question} (y/n): "
-    elif default:
-        prompt = f"{question} [Y/n]: "
-    else:
-        prompt = f"{question} [y/N]: "
-    while True:
-        ans = input(prompt).strip().lower()
-        if not ans and default is not None:
-            return default
-        if ans in ("y", "yes"):
-            return True
-        if ans in ("n", "no"):
-            return False
-        print("Please answer 'y' or 'n'.")
-
-
-def run_script(
-    script_name: str, args: list[str], capture_output: bool = True
-) -> subprocess.CompletedProcess:
-    """Run a Python helper script and return CompletedProcess."""
-    return subprocess.run(
-        [sys.executable, script_name] + args, capture_output=capture_output, text=True
-    )
-
-
-def get_all_archs(configs_dir: str) -> list[str]:
-    """Return sorted list of gfx* directories."""
-    root = Path(configs_dir)
-    return sorted([
-        d.name for d in root.iterdir() if d.is_dir() and d.name.startswith("gfx")
-    ])
-
-
-def get_latest_arch(template_file: str) -> Optional[str]:
-    """Read 'latest_arch' from template YAML."""
-    p = Path(template_file)
-    if not p.is_file():
-        return None
-    with open(p) as f:
-        data = yaml.safe_load(f) or {}
-    return data.get("latest_arch")
-
-
-def validate_delta_structure(delta_file: str) -> tuple[bool, str]:
-    """Ensure delta YAML contains Addition/Deletion/Modification keys."""
-    with open(delta_file) as f:
-        data = yaml.safe_load(f) or {}
-    required = {"Addition", "Deletion", "Modification"}
-    if not isinstance(data, dict) or not required.issubset(data.keys()):
-        return False, "Delta must have Addition, Deletion, Modification keys"
-    return True, ""
-
-
-# =============================================================================
-# VALIDATION / SYNC
-# =============================================================================
-
-
-def validate_all_archs(config: dict) -> tuple[bool, str]:
-    """Validate all archs against the template."""
-    print("Validating all architectures against template...")
-    res = run_script(
-        "tools/config_management/verify_against_config_template.py",
-        [config["paths"]["configs_root"], config["paths"]["template"]],
-        capture_output=True,
-    )
-    if res.stdout:
-        print(res.stdout)
-    if res.returncode != 0:
-        if res.stderr:
-            print(res.stderr)
-        return False, "Validation failed"
-    return True, "Validation passed"
-
-
-def validate_arch_against_template(arch_name: str, config: dict) -> tuple[bool, str]:
-    """Validate one arch (best-effort: rely on script output mentioning arch)."""
-    print(f"Validating {arch_name} against template...")
-    res = run_script(
-        "tools/config_management/verify_against_config_template.py",
-        [config["paths"]["configs_root"], config["paths"]["template"]],
-        capture_output=True,
-    )
-    if res.returncode != 0 and arch_name in (res.stdout or ""):
-        print(res.stdout)
-        return False, f"Validation failed for {arch_name}"
-    return True, f"Validation passed for {arch_name}"
-
-
-# =============================================================================
-# CHANGE DETECTION
-# =============================================================================
-
-
-def detect_changes(config: dict) -> dict:
-    print("Detecting changes...")
-    return hash_manager.detect_changes(
-        config["paths"]["configs_root"], config["paths"]["hashes"]
-    )
-
-
-def display_change_summary(changes: dict) -> bool:
-    print("\n" + "=" * 80)
-    print("CHANGE SUMMARY")
-    print("=" * 80)
-
-    has_changes = any([
-        changes.get("new_archs"),
-        changes.get("modified_archs"),
-        changes.get("delta_files"),
-        changes.get("deleted_archs"),
-    ])
-
-    if changes.get("new_archs"):
-        print("\nNew Architecture Directories:")
-        for a in changes["new_archs"]:
-            print(f"   • {a}")
-
-    if changes.get("modified_archs"):
-        print("\nModified Architectures:")
-        for a, files in changes["modified_archs"].items():
-            print(f"   • {a}:")
-            for f in files[:5]:
-                print(f"      - {f}")
-            extra = len(files) - 5
-            if extra > 0:
-                print(f"      ... and {extra} more files")
-
-    if changes.get("delta_files"):
-        print("\nDelta Files Detected:")
-        for a, d in changes["delta_files"].items():
-            print(f"   • {a}: {Path(d).name}")
-
-    if changes.get("deleted_archs"):
-        print("\nDeleted Architectures:")
-        for a in changes["deleted_archs"]:
-            print(f"   • {a}")
-
-    if not has_changes:
-        print("\nNo changes detected")
-
-    print("=" * 80 + "\n")
-    return has_changes
-
-
-# =============================================================================
-# CORE WORKFLOW OPS
-# =============================================================================
-
-
-def promote_to_latest(
-    new_arch: str, config: dict, reuse_backup: Optional[Path] = None
-) -> bool:
-    """
-    Original 'promote' that assumes new_arch dir already exists & populated.
-    (Kept for backward compatibility.)
-    """
-    print(f"\nPROMOTING {new_arch} TO LATEST ARCHITECTURE...")
-    backup_paths = [config["paths"]["configs_root"], config["paths"]["template"]]
-    backup_path = reuse_backup or create_backup(
-        backup_paths, config["paths"]["backups"]
-    )
-
-    try:
-        root = Path(config["paths"]["configs_root"])
-        new_dir = root / new_arch
-        if not new_dir.is_dir():
-            raise Exception(f"New arch directory not found: {new_dir}")
-
-        all_archs = get_all_archs(config["paths"]["configs_root"])
-        prev_archs = [a for a in all_archs if a != new_arch]
-
-        print(f"\n1. Updating template with new latest arch: {new_arch}")
-        res = run_script(
-            "tools/config_management/parse_config_template.py",
-            [str(new_dir), config["paths"]["template"], "--latest-arch", new_arch],
-            capture_output=True,
-        )
-        if res.returncode != 0:
-            raise Exception(f"Failed to update template: {res.stderr}")
-
-        print(f"\n2. Generating deltas for {len(prev_archs)} previous architectures")
-        for p in prev_archs:
-            prev_dir = root / p
-            gen = run_script(
-                "tools/config_management/generate_config_deltas.py",
-                [str(new_dir), str(prev_dir)],
-                capture_output=True,
-            )
-            if gen.returncode != 0:
-                raise Exception(f"Failed to generate delta for {p}: {gen.stderr}")
-
-        print("\n\tUpdating hashes for previous architectures (delta files)")
-        for p in prev_archs:
-            hash_manager.update_hashes(
-                p, config["paths"]["configs_root"], config["paths"]["hashes"]
-            )
-
-        print("\n3. Validating all architectures")
-        ok, msg = validate_all_archs(config)
-        if not ok:
-            raise Exception(msg)
-
-        print("\n4. Syncing metric descriptions")
-        ok = metric_description_manager.sync_arch(
-            new_arch,
-            config["paths"]["configs_root"],
-            config["paths"]["per_arch_metrics"],
-            config["paths"]["docs_metrics"],
-            is_latest=True,
-        )
-        if not ok:
-            raise Exception("Failed to sync metric descriptions")
-
-        print("\n5. Updating hashes")
-        hash_manager.update_hashes(
-            new_arch, config["paths"]["configs_root"], config["paths"]["hashes"]
-        )
-
-        print(f"\nSuccessfully promoted {new_arch} to latest architecture!")
-        return True
-
-    except Exception as e:
-        print(f"\nERROR: {e}\nRestoring from backup...")
-        restore_backup(backup_path, backup_paths)
-        return False
-
-
-def update_latest_arch_from_delta(
-    delta_file: str, arch_name: str, config: dict
-) -> bool:
-    """Apply a delta in-place to the latest arch (legacy flow)."""
-    print(f"\nUPDATING LATEST ARCH {arch_name} FROM DELTA...")
-    backup_paths = [config["paths"]["configs_root"], config["paths"]["template"]]
-    backup_path = create_backup(backup_paths, config["paths"]["backups"])
-
-    try:
-        root = Path(config["paths"]["configs_root"])
-        arch_dir = root / arch_name
-        tmp = root / f"{arch_name}_tmp"
-
-        print(f"\n1. Applying delta to {arch_name}")
-        res = run_script(
-            "tools/config_management/apply_config_deltas.py",
-            [str(arch_dir), delta_file, str(tmp)],
-            capture_output=True,
-        )
-        if res.returncode != 0:
-            raise Exception(f"Failed to apply delta: {res.stderr}")
-
-        shutil.rmtree(arch_dir)
-        shutil.move(str(tmp), str(arch_dir))
-
-        print("\n2. Updating template")
-        res = run_script(
-            "tools/config_management/parse_config_template.py",
-            [str(arch_dir), config["paths"]["template"], "--latest-arch", arch_name],
-            capture_output=True,
-        )
-        if res.returncode != 0:
-            raise Exception(f"Failed to update template: {res.stderr}")
-
-        print("\n3. Regenerating deltas for previous architectures")
-        all_archs = get_all_archs(config["paths"]["configs_root"])
-        for prev in [a for a in all_archs if a != arch_name]:
-            prev_dir = root / prev
-            gen = run_script(
-                "tools/config_management/generate_config_deltas.py",
-                [str(arch_dir), str(prev_dir)],
-                capture_output=True,
-            )
-            if gen.returncode != 0:
-                raise Exception(f"Failed to generate delta for {prev}")
-
-        for prev in [a for a in all_archs if a != arch_name]:
-            hash_manager.update_hashes(
-                prev, config["paths"]["configs_root"], config["paths"]["hashes"]
-            )
-
-        print("\n4. Validating all architectures")
-        ok, msg = validate_all_archs(config)
-        if not ok:
-            raise Exception(msg)
-
-        print("\n5. Syncing metric descriptions")
-        ok = metric_description_manager.sync_arch(
-            arch_name,
-            config["paths"]["configs_root"],
-            config["paths"]["per_arch_metrics"],
-            config["paths"]["docs_metrics"],
-            is_latest=True,
-        )
-        if not ok:
-            raise Exception("Failed to sync metric descriptions")
-
-        print("\n6. Updating hashes")
-        hash_manager.update_hashes(
-            arch_name, config["paths"]["configs_root"], config["paths"]["hashes"]
-        )
-
-        print(f"\nSuccessfully updated latest arch {arch_name}!")
-        return True
-
-    except Exception as e:
-        print(f"\nERROR: {e}\nRestoring from backup...")
-        restore_backup(backup_path, backup_paths)
-        return False
-
-
-def update_older_arch_from_delta(delta_file: str, arch_name: str, config: dict) -> bool:
-    """Apply a delta in-place to an older arch (legacy flow)."""
-    print(f"\nUPDATING OLDER ARCH {arch_name} FROM DELTA...")
-    root = Path(config["paths"]["configs_root"])
-    arch_dir = root / arch_name
-    backup_path = create_backup([str(arch_dir)], config["paths"]["backups"])
-
-    try:
-        tmp = root / f"{arch_name}_tmp"
-
-        print(f"\n1. Applying delta to {arch_name}")
-        res = run_script(
-            "tools/config_management/apply_config_deltas.py",
-            [str(arch_dir), delta_file, str(tmp)],
-            capture_output=True,
-        )
-        if res.returncode != 0:
-            raise Exception(f"Failed to apply delta: {res.stderr}")
-
-        shutil.rmtree(arch_dir)
-        shutil.move(str(tmp), str(arch_dir))
-
-        print("\n2. Validating against template")
-        ok, msg = validate_arch_against_template(arch_name, config)
-        if not ok:
-            raise Exception(msg)
-
-        print("\n3. Syncing metric descriptions")
-        ok = metric_description_manager.sync_arch(
-            arch_name,
-            config["paths"]["configs_root"],
-            config["paths"]["per_arch_metrics"],
-            config["paths"]["docs_metrics"],
-            is_latest=False,
-        )
-        if not ok:
-            raise Exception("Failed to sync metric descriptions")
-
-        print("\n4. Updating hashes")
-        hash_manager.update_hashes(
-            arch_name, config["paths"]["configs_root"], config["paths"]["hashes"]
-        )
-
-        print(f"\nSuccessfully updated older arch {arch_name}!")
-        return True
-
-    except Exception as e:
-        print(f"\nERROR: {e}\nRestoring from backup...")
-        restore_backup(backup_path, [str(arch_dir)])
-        return False
-
-
-def update_latest_arch_from_edits(arch_name: str, config: dict) -> bool:
-    """Re-derive template/deltas from direct edits to latest (legacy in-place)."""
-    print(f"\nUPDATING LATEST ARCH {arch_name} FROM DIRECT EDITS...")
-    backup_paths = [config["paths"]["configs_root"], config["paths"]["template"]]
-    backup_path = create_backup(backup_paths, config["paths"]["backups"])
-
-    try:
-        root = Path(config["paths"]["configs_root"])
-        arch_dir = root / arch_name
-
-        print("\n1. Updating template")
-        res = run_script(
-            "tools/config_management/parse_config_template.py",
-            [str(arch_dir), config["paths"]["template"], "--latest-arch", arch_name],
-            capture_output=True,
-        )
-        if res.returncode != 0:
-            raise Exception(f"Failed to update template: {res.stderr}")
-
-        print("\n2. Regenerating deltas for previous architectures")
-        for prev in [
-            a for a in get_all_archs(config["paths"]["configs_root"]) if a != arch_name
-        ]:
-            prev_dir = root / prev
-            gen = run_script(
-                "tools/config_management/generate_config_deltas.py",
-                [str(arch_dir), str(prev_dir)],
-                capture_output=True,
-            )
-            if gen.returncode != 0:
-                raise Exception(f"Failed to generate delta for {prev}")
-
-        for prev in [
-            a for a in get_all_archs(config["paths"]["configs_root"]) if a != arch_name
-        ]:
-            hash_manager.update_hashes(
-                prev, config["paths"]["configs_root"], config["paths"]["hashes"]
-            )
-
-        print("\n3. Validating all architectures")
-        ok, msg = validate_all_archs(config)
-        if not ok:
-            raise Exception(msg)
-
-        print("\n4. Syncing metric descriptions")
-        ok = metric_description_manager.sync_arch(
-            arch_name,
-            config["paths"]["configs_root"],
-            config["paths"]["per_arch_metrics"],
-            config["paths"]["docs_metrics"],
-            is_latest=True,
-        )
-        if not ok:
-            raise Exception("Failed to sync metric descriptions")
-
-        print("\n5. Updating hashes")
-        hash_manager.update_hashes(
-            arch_name, config["paths"]["configs_root"], config["paths"]["hashes"]
-        )
-
-        print(f"\nSuccessfully updated latest arch {arch_name}!")
-        return True
-
-    except Exception as e:
-        print(f"\nERROR: {e}\nRestoring from backup...")
-        restore_backup(backup_path, backup_paths)
-        return False
-
-
-def update_older_arch_from_edits(arch_name: str, config: dict) -> bool:
-    """Re-validate/sync/hash older arch after direct edits (legacy in-place)."""
-    print(f"\nUPDATING OLDER ARCH {arch_name} FROM DIRECT EDITS...")
-    root = Path(config["paths"]["configs_root"])
-    arch_dir = root / arch_name
-    backup_path = create_backup([str(arch_dir)], config["paths"]["backups"])
-
-    try:
-        print("\n1. Validating against template")
-        ok, msg = validate_arch_against_template(arch_name, config)
-        if not ok:
-            raise Exception(msg)
-
-        print("\n2. Syncing metric descriptions")
-        ok = metric_description_manager.sync_arch(
-            arch_name,
-            config["paths"]["configs_root"],
-            config["paths"]["per_arch_metrics"],
-            config["paths"]["docs_metrics"],
-            is_latest=False,
-        )
-        if not ok:
-            raise Exception("Failed to sync metric descriptions")
-
-        print("\n3. Updating hashes")
-        hash_manager.update_hashes(
-            arch_name, config["paths"]["configs_root"], config["paths"]["hashes"]
-        )
-
-        print(f"\nSuccessfully updated older arch {arch_name}!")
-        return True
-
-    except Exception as e:
-        print(f"\nERROR: {e}\nRestoring from backup...")
-        restore_backup(backup_path, [str(arch_dir)])
-        return False
-
-
-# =============================================================================
-# NEW: PROMOTE NEW ARCH FROM (A) EDITS or (B) DELTA
-# =============================================================================
-
-
-def _git_restore_pristine(path: Path) -> None:
-    """
-    Best-effort restore of a directory to HEAD using Git.
-    No-op if not in a Git repo. Raises on checkout failure when in a repo.
-    """
-    chk = subprocess.run(
-        ["git", "rev-parse", "--is-inside-work-tree"], capture_output=True, text=True
-    )
-    if chk.returncode != 0 or chk.stdout.strip() != "true":
-        return
-    res = subprocess.run(
-        ["git", "checkout", "--", str(path)], capture_output=True, text=True
-    )
-    if res.returncode != 0:
-        raise Exception(f"Failed to restore pristine state from Git for {path}")
-
-
-def promote_new_arch_from_latest_edits(
-    latest_arch: str, new_arch: str, config: dict
-) -> bool:
-    """
-    Flow (A): Direct edits were made to the current latest arch.
-    1) Snapshot edited latest to temp
-    2) Restore pristine latest (via Git)
-    3) Copy pristine latest → new arch
-    4) Generate delta (edited_tmp vs pristine_latest) → write under latest/config_delta/
-    5) Apply delta to new arch
-    6) Update template latest=new_arch, regen deltas, validate, sync, hash
-    """
-    print(f"\nPROMOTING {new_arch} FROM EDITS IN {latest_arch}...")
-    root = Path(config["paths"]["configs_root"])
-    latest_dir = root / latest_arch
-    new_dir = root / new_arch
-    edited_tmp = root / f"_{latest_arch}_edited_tmp"
-    new_tmp = root / f"_{new_arch}_tmp"
-
-    backup_paths = [config["paths"]["configs_root"], config["paths"]["template"]]
-    backup_path = create_backup(backup_paths, config["paths"]["backups"])
-
-    try:
-        # 1) Snapshot edited latest
-        if edited_tmp.exists():
-            shutil.rmtree(edited_tmp)
-        shutil.copytree(latest_dir, edited_tmp)
-
-        # 2) Restore pristine latest
-        _git_restore_pristine(latest_dir)
-
-        # 3) Copy pristine latest → new arch
-        if new_dir.exists():
-            raise Exception(f"Target new arch directory already exists: {new_dir}")
-        shutil.copytree(latest_dir, new_dir)
-
-        # 4) Generate delta: edited (curr) vs pristine latest (prev)
-        print("\nGenerating delta (edited latest → pristine latest)")
-        gen = run_script(
-            "tools/config_management/generate_config_deltas.py",
-            [str(edited_tmp), str(latest_dir)],
-            capture_output=True,
-        )
-        if gen.returncode != 0:
-            raise Exception(f"Failed to generate delta: {gen.stderr}")
-
-        delta_dir = latest_dir / "config_delta"
-        # Prefer the file named for edited_tmp; otherwise take the latest *_diff.yaml
-        candidates = sorted(delta_dir.glob(f"{edited_tmp.name}_diff.yaml")) or sorted(
-            delta_dir.glob("*_diff.yaml")
-        )
-        if not candidates:
-            raise Exception("Delta file not found after generation.")
-        delta_file = candidates[-1]
-
-        # 5) Apply delta onto new arch
-        if new_tmp.exists():
-            shutil.rmtree(new_tmp)
-        print(f"\nApplying delta to {new_arch}: {delta_file.name}")
-        app = run_script(
-            "tools/config_management/apply_config_deltas.py",
-            [str(new_dir), str(delta_file), str(new_tmp)],
-            capture_output=True,
-        )
-        if app.returncode != 0:
-            raise Exception(f"Failed to apply delta: {app.stderr}")
-        shutil.rmtree(new_dir)
-        shutil.move(str(new_tmp), str(new_dir))
-
-        # 6) Promote to latest, regen deltas, validate, sync, hash
-        return promote_to_latest(new_arch, config, reuse_backup=backup_path)
-
-    except Exception as e:
-        print(f"\nERROR: {e}\nRestoring from backup...")
-        restore_backup(backup_path, backup_paths)
-        return False
-    finally:
-        if edited_tmp.exists():
-            shutil.rmtree(edited_tmp, ignore_errors=True)
-        if new_tmp.exists():
-            shutil.rmtree(new_tmp, ignore_errors=True)
-
-
-def promote_new_arch_from_delta(
-    latest_arch: str, new_arch: str, delta_file: str, config: dict
-) -> bool:
-    """
-    Flow (B): Developer added a delta YAML targeting the latest arch.
-    1) Copy pristine latest → new arch
-    2) Apply the provided delta to new arch
-    3) Promote to latest, regen deltas, validate, sync, hash
-    """
-    print(f"\nPROMOTING {new_arch} FROM DELTA ON {latest_arch}...")
-    root = Path(config["paths"]["configs_root"])
-    latest_dir = root / latest_arch
-    new_dir = root / new_arch
-    new_tmp = root / f"_{new_arch}_tmp"
-
-    backup_paths = [config["paths"]["configs_root"], config["paths"]["template"]]
-    backup_path = create_backup(backup_paths, config["paths"]["backups"])
-
-    try:
-        if not Path(delta_file).is_file():
-            raise Exception(f"Delta file does not exist: {delta_file}")
-        if not latest_dir.is_dir():
-            raise Exception(f"Latest arch not found: {latest_dir}")
-        if new_dir.exists():
-            raise Exception(f"Target new arch directory already exists: {new_dir}")
-
-        # Start from pristine latest
-        _git_restore_pristine(latest_dir)
-
-        # 1) Copy pristine latest → new arch
-        shutil.copytree(latest_dir, new_dir)
-
-        # 2) Apply delta onto the new arch
-        if new_tmp.exists():
-            shutil.rmtree(new_tmp)
-        print(f"\nApplying delta to {new_arch}: {Path(delta_file).name}")
-        app = run_script(
-            "tools/config_management/apply_config_deltas.py",
-            [str(new_dir), str(delta_file), str(new_tmp)],
-            capture_output=True,
-        )
-        if app.returncode != 0:
-            raise Exception(f"Failed to apply delta: {app.stderr}")
-        shutil.rmtree(new_dir)
-        shutil.move(str(new_tmp), str(new_dir))
-
-        # 3) Promote to latest, regen deltas, validate, sync, hash
-        return promote_to_latest(new_arch, config, reuse_backup=backup_path)
-
-    except Exception as e:
-        print(f"\nERROR: {e}\nRestoring from backup...")
-        restore_backup(backup_path, backup_paths)
-        return False
-    finally:
-        if new_tmp.exists():
-            shutil.rmtree(new_tmp, ignore_errors=True)
-
-
-# =============================================================================
-# USER-FACING SCENARIO HANDLERS
-# =============================================================================
-
-
-def handle_new_arch(arch_name: str, config: dict, dry_run: bool = False) -> bool:
-    print(f"\n{'=' * 80}\nNEW ARCHITECTURE DETECTED: {arch_name}\n{'=' * 80}")
-    if not prompt_yes_no(f"Is {arch_name} the new latest architecture?"):
-        print(
-            "ERROR: New arch detected but not marked as latest.\n   "
-            "Only the latest arch should be added as a new directory."
-        )
-        return False
-    if dry_run:
-        print(f"[DRY RUN] Would promote {arch_name} to latest")
-        return True
-    return promote_to_latest(arch_name, config)
-
-
-def handle_delta_file(
-    delta_file: str, arch_name: str, config: dict, dry_run: bool = False
-) -> bool:
-    print(
-        f"\n{'=' * 80}\nDELTA FILE DETECTED: {Path(delta_file).name}\n   "
-        f"Target architecture: {arch_name}\n{'=' * 80}"
-    )
-
-    valid, err = validate_delta_structure(delta_file)
-    if not valid:
-        print(f"ERROR: Invalid delta structure - {err}")
-        return False
-
-    latest = (
-        get_latest_arch(config["paths"]["template"])
-        or (get_all_archs(config["paths"]["configs_root"]) or [None])[-1]
-    )
-
-    if arch_name == latest:
-        print(f"\nDelta targets the current latest arch: {latest}")
-        print("Choose how to apply this delta:")
-        print("  1. Update the existing latest arch in-place")
-        print(
-            "  2. Create a NEW architecture from latest and apply "
-            "the delta there (promote to latest)"
-        )
-
-        while True:
-            choice = input("Enter choice (1 or 2): ").strip()
-            if choice == "1":
-                if dry_run:
-                    print(f"[DRY RUN] Would update latest arch {latest} from delta")
-                    return True
-                return update_latest_arch_from_delta(delta_file, latest, config)
-            if choice == "2":
-                new_arch_name = input(
-                    "Enter new architecture name (e.g., gfx955): "
-                ).strip()
-                if not new_arch_name:
-                    print("New architecture name cannot be empty.")
-                    continue
-                if not prompt_yes_no(
-                    f"Promote {new_arch_name} to new latest architecture?"
-                ):
-                    print("Operation cancelled.")
-                    return False
-                if dry_run:
-                    print(
-                        "[DRY RUN] Would create "
-                        f"{new_arch_name} from {latest} and apply delta"
-                    )
-                    return True
-                return promote_new_arch_from_delta(
-                    latest, new_arch_name, delta_file, config
-                )
-            print("Invalid choice. Please enter 1 or 2.")
-    else:
-        if not prompt_yes_no(f"Apply delta to older arch ({arch_name}) in-place?"):
-            return False
-        if dry_run:
-            print(f"[DRY RUN] Would update older arch {arch_name} from delta")
-            return True
-        return update_older_arch_from_delta(delta_file, arch_name, config)
-
-
-def handle_direct_edits(
-    arch_name: str, modified_files: list[str], config: dict, dry_run: bool = False
-) -> bool:
-    print(f"\n{'=' * 80}\nDIRECT EDITS DETECTED: {arch_name}\n{'=' * 80}")
-    print("Modified files:")
-    for f in modified_files:
-        print(f"   • {f}")
-
-    latest = (
-        get_latest_arch(config["paths"]["template"])
-        or (get_all_archs(config["paths"]["configs_root"]) or [None])[-1]
-    )
-
-    if arch_name == latest:
-        print(f"\nThis is the current latest architecture ({latest}).")
-        print("Are you:")
-        print("  1. Updating the existing latest arch")
-        print("  2. Creating a new architecture (this will become the new latest)")
-
-        while True:
-            choice = input("Enter choice (1 or 2): ").strip()
-            if choice == "1":
-                if dry_run:
-                    print(
-                        f"[DRY RUN] Would update latest arch {latest} from direct edits"
-                    )
-                    return True
-                return update_latest_arch_from_edits(arch_name, config)
-            if choice == "2":
-                new_arch_name = (
-                    input(
-                        "Enter new architecture name "
-                        f"(currently detected as {arch_name}): "
-                    ).strip()
-                    or arch_name
-                )
-                if not prompt_yes_no(
-                    f"Promote {new_arch_name} to new latest architecture?"
-                ):
-                    print("Operation cancelled.")
-                    return False
-                if dry_run:
-                    print(
-                        "[DRY RUN] Would promote "
-                        f"{new_arch_name} from edits in {arch_name}"
-                    )
-                    return True
-                return promote_new_arch_from_latest_edits(
-                    arch_name, new_arch_name, config
-                )
-            print("Invalid choice. Please enter 1 or 2.")
-    else:
-        if not prompt_yes_no(
-            f"These are edits to older arch ({arch_name}). Continue (in-place)?"
-        ):
-            return False
-        if dry_run:
-            print(f"[DRY RUN] Would update older arch {arch_name} from direct edits")
-            return True
-        return update_older_arch_from_edits(arch_name, config)
-
-
-# =============================================================================
-# MAIN
-# =============================================================================
-
-
-def main() -> int:
-    parser = argparse.ArgumentParser(
-        description="Master workflow for managing architecture configurations"
-    )
-    parser.add_argument(
-        "--dry-run",
-        action="store_true",
-        help="Show what would be done without making changes",
-    )
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--ci", action="store_true")
+    parser.add_argument("--hash-only", action="store_true")
+    parser.add_argument("--validate-only", action="store_true")
+    parser.add_argument("--edit-existing", action="store_true")
+    parser.add_argument("--promote", metavar="ARCH")
     args = parser.parse_args()
 
-    print("=" * 80)
-    print("ARCHITECTURE CONFIG WORKFLOW")
-    print("=" * 80)
+    # --------------------------------------------------------
+    # CI / HASH-ONLY PATH (non-mutating)
+    # --------------------------------------------------------
+    if args.ci or args.hash_only:
+        if not HASH_CHECKER_SCRIPT.exists():
+            fatal("hash_checker.py not found")
+        sys.exit(run([PYTHON, HASH_CHECKER_SCRIPT]))
 
-    config = load_config()
+    # --------------------------------------------------------
+    # HARD PREFLIGHT (STRUCTURAL VALIDATION) for all non-hash paths
+    # --------------------------------------------------------
+    if not VERIFY_SCRIPT.exists():
+        fatal("verify_against_config_template.py not found")
 
-    if args.dry_run:
-        print("\nDRY RUN MODE - No changes will be made\n")
+    rc = run([PYTHON, VERIFY_SCRIPT, ANALYSIS_CONFIGS, TEMPLATE_FILE])
+    if rc != 0:
+        fatal("Template / architecture verification failed")
 
-    changes = detect_changes(config)
-    has_changes = display_change_summary(changes)
-    if not has_changes:
-        return 0
+    # --------------------------------------------------------
+    # VALIDATE-ONLY
+    # --------------------------------------------------------
+    if args.validate_only:
+        print("\nValidation successful.")
+        sys.exit(0)
 
-    latest_arch = (
-        get_latest_arch(config["paths"]["template"])
-        or (get_all_archs(config["paths"]["configs_root"]) or [None])[-1]
-    )
-    latest_has_edits = latest_arch in (changes.get("modified_archs") or {})
+    # --------------------------------------------------------
+    # EDIT EXISTING ARCHITECTURE (helpers only; no template/hash updates)
+    # --------------------------------------------------------
+    if args.edit_existing:
+        print("\nEdit existing architecture mode.")
 
-    # New arch directories that appeared on disk
-    for new_arch in changes.get("new_archs", []):
-        if not handle_new_arch(new_arch, config, args.dry_run):
-            return 1
+        choice = input(
+            "\nChoose:\n  1) Generate delta\n  2) Apply delta\n  3) Exit\nSelect: "
+        ).strip()
 
-    # If latest was directly edited, prioritize resolving that path
-    # (user will choose in-place vs new arch)
-    if latest_has_edits:
-        if not handle_direct_edits(
-            latest_arch, changes["modified_archs"][latest_arch], config, args.dry_run
+        if choice == "1":
+            base = input("Base arch dir (absolute or relative to repo root): ").strip()
+            new = input("New  arch dir: ").strip()
+            out = input("Output delta yaml: ").strip()
+            sys.exit(run([PYTHON, GENERATE_DELTAS_SCRIPT, base, new, out]))
+
+        if choice == "2":
+            base = input("Base arch dir: ").strip()
+            delta = input("Delta yaml: ").strip()
+            out = input("Output dir: ").strip()
+
+            rc = run([PYTHON, APPLY_DELTAS_SCRIPT, base, delta, out])
+            if rc != 0:
+                sys.exit(rc)
+
+            # Re-verify after apply
+            rc = run([PYTHON, VERIFY_SCRIPT, ANALYSIS_CONFIGS, TEMPLATE_FILE])
+            sys.exit(rc)
+
+        sys.exit(0)
+
+    # --------------------------------------------------------
+    # PROMOTE NEW LATEST ARCHITECTURE (mutating, with rollback)
+    # --------------------------------------------------------
+    if args.promote:
+        new_latest = args.promote
+        new_arch_dir = ANALYSIS_CONFIGS / new_latest
+
+        if not new_arch_dir.is_dir():
+            fatal(f"Architecture directory not found: {new_arch_dir}")
+
+        if not confirm(
+            f"Promote {new_latest} to latest? "
+            "This will update template, regenerate deltas, and update hashes."
         ):
-            return 1
-        print("\nNote: Delta files for older archs will be regenerated automatically.")
-        print("Skipping delta file processing for older architectures.\n")
-    else:
-        # Process delta files
-        for arch, delta_file in changes.get("delta_files", {}).items():
-            if not handle_delta_file(delta_file, arch, config, args.dry_run):
-                return 1
+            sys.exit(0)
 
-    # Remaining direct edits (excluding latest if already processed)
-    for arch, files in (changes.get("modified_archs") or {}).items():
-        if arch == latest_arch and latest_has_edits:
-            continue
-        if arch in (changes.get("delta_files") or {}):
-            continue
-        if not handle_direct_edits(arch, files, config, args.dry_run):
-            return 1
+        # Back up the things we mutate
+        backup_path = backup([ANALYSIS_CONFIGS, TEMPLATE_FILE, HASH_JSON])
 
-    if not args.dry_run:
-        cleanup_old_backups(config["paths"]["backups"])
-        print("\n" + "=" * 80)
-        print("ALL OPERATIONS COMPLETED SUCCESSFULLY!")
-        print("=" * 80)
-    else:
-        print("\n" + "=" * 80)
-        print("DRY RUN COMPLETE")
-        print("=" * 80)
+        try:
+            # 1) Update template
+            if not PARSE_TEMPLATE_SCRIPT.exists():
+                raise RuntimeError("parse_config_template.py not found")
 
-    return 0
+            rc = run([
+                PYTHON,
+                PARSE_TEMPLATE_SCRIPT,
+                new_arch_dir,
+                TEMPLATE_FILE,
+                "--latest-arch",
+                new_latest,
+            ])
+            if rc != 0:
+                raise RuntimeError("Failed to update template")
+
+            # 2) Regenerate deltas for all other archs
+            for arch_dir in sorted(ANALYSIS_CONFIGS.iterdir()):
+                if not arch_dir.is_dir():
+                    continue
+                if arch_dir.name == new_latest:
+                    continue
+
+                delta_dir = arch_dir / "config_delta"
+                delta_dir.mkdir(exist_ok=True)
+                out_delta = delta_dir / f"{new_latest}_diff.yaml"
+
+                rc = run([
+                    PYTHON,
+                    GENERATE_DELTAS_SCRIPT,
+                    new_arch_dir,
+                    arch_dir,
+                    out_delta,
+                ])
+                if rc != 0:
+                    raise RuntimeError(f"Delta generation failed for {arch_dir.name}")
+
+                for f in delta_dir.glob("*_diff.yaml"):
+                    if f.name != f"{new_latest}_diff.yaml":
+                        f.unlink()
+
+            # 3) Re-verify everything against updated template
+            rc = run([PYTHON, VERIFY_SCRIPT, ANALYSIS_CONFIGS, TEMPLATE_FILE])
+            if rc != 0:
+                raise RuntimeError("Post-promotion verification failed")
+
+            # 4) Now update the hash DB to the new steady state.
+            #    Promotion touched many delta files, so compute-all is the safest.
+            if not HASH_MANAGER_SCRIPT.exists():
+                raise RuntimeError("hash_manager.py not found")
+
+            rc = run([
+                PYTHON,
+                HASH_MANAGER_SCRIPT,
+                "--compute-all",
+                ANALYSIS_CONFIGS,
+                HASH_JSON,
+            ])
+            if rc != 0:
+                raise RuntimeError("Hash DB update failed (--compute-all)")
+
+            # 5) run hash_checker
+            rc = run([PYTHON, HASH_CHECKER_SCRIPT])
+            if rc != 0:
+                raise RuntimeError(
+                    "Hash consistency check failed (after hash DB update)"
+                )
+
+            print(f"\nSUCCESS: {new_latest} promoted to latest.")
+            sys.exit(0)
+
+        except Exception as e:
+            print(f"\nERROR: {e}")
+            restore(backup_path, [ANALYSIS_CONFIGS, TEMPLATE_FILE, HASH_JSON])
+            sys.exit(1)
+
+    # --------------------------------------------------------
+    # NO INTENT PROVIDED
+    # --------------------------------------------------------
+    print(
+        "\nNo workflow selected.\n"
+        "Use one of:\n"
+        "  --validate-only\n"
+        "  --edit-existing\n"
+        "  --promote gfxXYZ\n"
+        "  --hash-only / --ci\n"
+    )
+    sys.exit(0)
 
 
 if __name__ == "__main__":
-    sys.exit(main())
+    main()
diff --git a/projects/rocprofiler-compute/tools/config_management/metric_description_manager.py b/projects/rocprofiler-compute/tools/config_management/metric_description_manager.py
index 3fa75d5122..1dd7c30190 100644
--- a/projects/rocprofiler-compute/tools/config_management/metric_description_manager.py
+++ b/projects/rocprofiler-compute/tools/config_management/metric_description_manager.py
@@ -43,21 +43,11 @@ from typing import Union
 
 import yaml
 
-try:
-    from . import utils as cm_utils
-except Exception:
-    repo_root = Path(__file__).resolve().parents[1]
-    if str(repo_root) not in sys.path:
-        sys.path.insert(0, str(repo_root))
-    try:
-        import config_management.utils as cm_utils  # type: ignore
-    except Exception:
-        import utils as cm_utils  # type: ignore
+PROJECT_ROOT = Path(__file__).resolve().parents[1]
+if str(PROJECT_ROOT) not in sys.path:
+    sys.path.insert(0, str(PROJECT_ROOT))
 
-AUTOGEN_TEXT = (
-    "# AUTOGENERATED FILE. Only edit for testing purposes, not for development. "
-    "Generated by tools/config_management/metric_description_manager.py\n"
-)
+from config_management import utils_ruamel as cm_utils  # noqa: E402
 
 # Section to panel ID mapping for organizing descriptions
 SECTION_PANEL_MAP: dict[str, int] = {
@@ -274,7 +264,7 @@ def update_per_arch_metrics_file(
                 entry["unit"] = desc_data["unit"]
             rst_descriptions[section][metric_name] = entry
 
-    cm_utils.save_yaml(rst_descriptions, output_path, AUTOGEN_TEXT)
+    cm_utils.save_yaml(rst_descriptions, output_path)
     print(f"Updated: {output_path}")
 
 
@@ -303,7 +293,7 @@ def update_docs_metrics_file(
 
     docs_path.parent.mkdir(parents=True, exist_ok=True)
 
-    cm_utils.save_yaml(existing, docs_path, AUTOGEN_TEXT)
+    cm_utils.save_yaml(existing, docs_path)
     return True
 
 
diff --git a/projects/rocprofiler-compute/tools/config_management/parse_config_template.py b/projects/rocprofiler-compute/tools/config_management/parse_config_template.py
index 15ca94699a..894a8fb968 100644
--- a/projects/rocprofiler-compute/tools/config_management/parse_config_template.py
+++ b/projects/rocprofiler-compute/tools/config_management/parse_config_template.py
@@ -1,8 +1,49 @@
 #!/usr/bin/env python3
+##############################################################################
+# MIT License
+#
+# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+##############################################################################
+
 """
-Parse panel configuration based on YAML files for an architecture.
-Usage:
-    python parse_config_template.py <dir_path> [output_file.yaml] [--latest-arch ARCH]
+parse_config_template.py
+
+Parse panel configuration based on YAML files for an architecture and, optionally,
+generate a lightweight template describing panel IDs, titles, aliases, and
+data-source ordering.
+
+Usage
+-----
+Generate a template from an architecture directory:
+
+    python tools/config_management/parse_config_template.py \
+        analysis_configs/gfx950 \
+        analysis_configs/config_template.yaml \
+        --latest-arch gfx950
+
+Inspect an architecture (no template written):
+
+    python tools/config_management/parse_config_template.py \
+        analysis_configs/gfx950
 """
 
 from __future__ import annotations
@@ -12,45 +53,93 @@ import sys
 from pathlib import Path
 from typing import Any, Optional
 
-try:
-    from . import utils as cm_utils
-except Exception:
-    repo_root = Path(__file__).resolve().parents[1]
-    if str(repo_root) not in sys.path:
-        sys.path.insert(0, str(repo_root))
-    try:
-        import config_management.utils as cm_utils  # type: ignore
-    except Exception:
-        import utils as cm_utils  # type: ignore
+PROJECT_ROOT = Path(__file__).resolve().parents[1]
+if str(PROJECT_ROOT) not in sys.path:
+    sys.path.insert(0, str(PROJECT_ROOT))
 
-AUTOGEN_TEXT = (
-    "# AUTOGENERATED FILE. Only edit for testing purposes, not for development. "
-    "Generated by tools/config_management/parse_config_template.py\n"
-)
+from config_management import utils_ruamel as cm_utils  # noqa: E402
 
 
-def parse_panel_config(yaml_file: Path) -> Optional[dict]:
-    """Parse a single YAML file and extract panel and data source info."""
+def normalize_panel_id(panel_id: Optional[int]) -> Optional[int]:
+    """Reduce a panel ID of 100 or more to ``id // 100``; pass others through."""
+    if panel_id is not None and panel_id >= 100:
+        return panel_id // 100
+    return panel_id
+
+
+def normalize_table_id(table_id: Optional[int]) -> Optional[int]:
+    """Return ``table_id % 100``, or None when no table ID is given."""
+    if table_id is not None:
+        return table_id % 100
+    return None
+
+
+def parse_panel_config(yaml_file: Path) -> Optional[dict[str, Any]]:
+    """
+    Parse a single panel YAML file and extract template-relevant info.
+
+    Returns a dict with:
+      - file: panel filename (without leading numeric prefix)
+      - panel_id: normalized panel id (id // 100 when >= 100)
+      - panel_title: Panel Config.title
+      - panel_alias: Panel Config.alias (optional)
+      - data_sources: ordered list of
+            {type: <key>, id: <normalized table id>, title: <title>}
+    or None if the file does not contain a valid Panel Config or fails basic checks.
+    """
     data = cm_utils.load_yaml(yaml_file)
     panel_config = data.get("Panel Config")
     if not isinstance(panel_config, dict):
+        print(f"WARNING: {yaml_file} has no valid 'Panel Config' mapping, skipping.")
+        return None
+
+    # Enforce presence of core panel-level keys
+    missing_keys: list[str] = []
+    for key in ("id", "title", "data source", "metrics_description"):
+        if key not in panel_config:
+            missing_keys.append(key)
+
+    if missing_keys:
+        missing_str = ", ".join(missing_keys)
+        print(
+            f"ERROR: {yaml_file} is missing required Panel Config keys: {missing_str}"
+        )
         return None
 
     filename = (
         yaml_file.name.split("_", 1)[1] if "_" in yaml_file.name else yaml_file.name
     )
 
-    panel_id = panel_config.get("id")
-    if panel_id and panel_id >= 100:
-        panel_id = panel_id // 100
+    raw_panel_id = panel_config.get("id")
+    if not isinstance(raw_panel_id, int):
+        print(
+            f"ERROR: {yaml_file} has non-integer or missing Panel Config.id "
+            f"({raw_panel_id!r})"
+        )
+        return None
 
-    data_sources = []
-    for ds in panel_config.get("data source", []):
+    panel_id = normalize_panel_id(raw_panel_id)
+
+    # Extract and normalize data sources
+    data_sources: list[dict[str, Any]] = []
+    ds_list = panel_config.get("data source", [])
+    if not isinstance(ds_list, list):
+        print(
+            f"ERROR: {yaml_file} has non-list 'data source' field "
+            f"({type(ds_list).__name__})"
+        )
+        return None
+
+    for ds in ds_list:
+        if not isinstance(ds, dict):
+            print(f"WARNING: {yaml_file} has non-dict data source entry: {ds!r}")
+            continue
         for key, value in ds.items():
             if isinstance(value, dict) and "id" in value and "title" in value:
+                norm_id = normalize_table_id(value["id"])
                 data_sources.append({
                     "type": key,
-                    "id": value["id"] % 100,
+                    "id": norm_id,
                     "title": value["title"],
                 })
 
@@ -63,15 +152,65 @@ def parse_panel_config(yaml_file: Path) -> Optional[dict]:
     }
 
 
+def build_template_from_directory(
+    directory: Path,
+    existing_panels_by_id: Optional[dict[int, dict]],
+) -> list[dict]:
+    """
+    Parse each ``*.yaml`` in ``directory``; return panels sorted by (id, file).
+
+    Files rejected by ``parse_panel_config`` are skipped and counted. A panel
+    without an alias inherits ``panel_alias`` from ``existing_panels_by_id``.
+    """
+    previous = existing_panels_by_id
+    collected: list[dict] = []
+    rejected = 0
+
+    for path in sorted(directory.glob("*.yaml")):
+        entry = parse_panel_config(path)
+        if entry is None:
+            rejected += 1
+            continue
+
+        pid = entry.get("panel_id")
+        if previous and pid is not None and pid in previous:
+            old = previous[pid]
+            # Preserve panel_alias unless explicitly set by panel YAML
+            if entry.get("panel_alias") is None and "panel_alias" in old:
+                entry["panel_alias"] = old["panel_alias"]
+        collected.append(entry)
+
+    # Deterministic ordering for stable templates
+    collected.sort(key=lambda item: (item["panel_id"], item["file"]))
+
+    if rejected:
+        print(
+            f"\nEncountered {rejected} panel file(s) with structural errors "
+            "while building template."
+        )
+
+    return collected
+
+
 def main() -> None:
     parser = argparse.ArgumentParser(
-        description="Parse panel configuration from YAML files"
+        description=(
+            "Parse panel YAML files for an architecture and optionally generate "
+            "a config_template-style YAML describing panel IDs and data sources."
+        )
+    )
+    parser.add_argument("directory", help="Directory containing panel YAML files")
+    parser.add_argument(
+        "output",
+        nargs="?",
+        help="Output YAML file (optional). If omitted, only a summary is printed.",
     )
-    parser.add_argument("directory", help="Directory containing YAML files")
-    parser.add_argument("output", nargs="?", help="Output YAML file (optional)")
     parser.add_argument(
         "--latest-arch",
-        help="Specify this architecture as latest (adds metadata to output)",
+        help=(
+            "Specify this architecture as latest (adds 'latest_arch' metadata "
+            "to the generated template). Only used when an output file is given."
+        ),
     )
     args = parser.parse_args()
 
@@ -80,33 +219,45 @@ def main() -> None:
         print(f"Error: '{args.directory}' is not a valid directory")
         sys.exit(1)
 
-    results = []
-    for yaml_file in sorted(directory.glob("*.yaml")):
-        parsed = parse_panel_config(yaml_file)
-        if parsed:
-            results.append(parsed)
+    existing_template = None
+    if args.output and Path(args.output).exists():
+        existing_template = cm_utils.load_yaml(Path(args.output))
 
-    if not results:
-        print("No valid panel configurations found.")
+    existing_panels_by_id = {}
+    if existing_template:
+        for p in existing_template.get("panels", []):
+            pid = p.get("panel_id")
+            if pid is not None:
+                existing_panels_by_id[pid] = p
+
+    panels = build_template_from_directory(
+        directory,
+        existing_panels_by_id=existing_panels_by_id if args.output else None,
+    )
+
+    if not panels:
+        print("No valid panel YAML files found; nothing to do.")
         sys.exit(1)
 
-    for panel in results:
-        print(f"\n{'=' * 80}")
-        print(f"File: {panel['file']}")
+    # Always show a human-readable summary.
+    print(f"Found {len(panels)} panel(s) in {directory}:")
+    for panel in panels:
+        print(f"\nFile: {panel['file']}")
         print(f"Panel ID: {panel['panel_id']}")
         print(f"Panel Title: {panel['panel_title']}")
-        if panel.get("panel_alias"):
+        if panel["panel_alias"]:
             print(f"Panel Alias: {panel['panel_alias']}")
         print(f"\nData Sources ({len(panel['data_sources'])}):")
         for ds in panel["data_sources"]:
             print(f"  - {ds['type']}: {ds['id']} - {ds['title']}")
 
+    # Optionally write a template YAML.
     if args.output:
-        output_data: Any = results
+        output_data: Any = {"panels": panels}
         if args.latest_arch:
-            output_data = {"latest_arch": args.latest_arch, "panels": results}
-        cm_utils.save_yaml(output_data, args.output, AUTOGEN_TEXT)
-        print(f"\nResults saved to: {args.output}")
+            output_data = {"latest_arch": args.latest_arch, "panels": panels}
+        cm_utils.save_yaml(output_data, Path(args.output))
+        print(f"\nTemplate saved to: {args.output}")
 
 
 if __name__ == "__main__":
diff --git a/projects/rocprofiler-compute/tools/config_management/utils.py b/projects/rocprofiler-compute/tools/config_management/utils.py
deleted file mode 100644
index d79d65a257..0000000000
--- a/projects/rocprofiler-compute/tools/config_management/utils.py
+++ /dev/null
@@ -1,52 +0,0 @@
-##############################################################################
-# MIT License
-#
-# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-
-##############################################################################
-
-from pathlib import Path
-from typing import Optional, Union
-
-import yaml
-
-
-def str_representer(dumper, data):
-    if "\n" in data:
-        return dumper.represent_scalar("tag:yaml.org,2002:str", data, style=">")
-    return dumper.represent_scalar("tag:yaml.org,2002:str", data)
-
-
-yaml.add_representer(str, str_representer)
-
-
-def load_yaml(filepath: Union[str, Path]) -> dict:
-    with open(filepath) as f:
-        return yaml.safe_load(f) or {}
-
-
-def save_yaml(
-    data: dict, filepath: Union[str, Path], header: Optional[str] = None
-) -> None:
-    with open(filepath, "w") as f:
-        if header:
-            f.write(header)
-        yaml.dump(data, f, sort_keys=False)
diff --git a/projects/rocprofiler-compute/tools/config_management/utils_ruamel.py b/projects/rocprofiler-compute/tools/config_management/utils_ruamel.py
new file mode 100644
index 0000000000..512ad03e4d
--- /dev/null
+++ b/projects/rocprofiler-compute/tools/config_management/utils_ruamel.py
@@ -0,0 +1,92 @@
+##############################################################################
+# MIT License
+#
+# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+##############################################################################
+
+from pathlib import Path
+from typing import Any, Union
+
+from ruamel.yaml import YAML
+from ruamel.yaml.comments import CommentedMap
+
+# --- Round-trip YAML (for writing) ---
+RT_YAML = YAML(typ="rt")
+RT_YAML.preserve_quotes = True
+RT_YAML.width = 4096  # prevent unwanted line wrapping
+RT_YAML.indent(mapping=2, sequence=2, offset=0)
+RT_YAML.explicit_start = False
+RT_YAML.explicit_end = False
+
+# --- Read-only YAML (safe loader) ---
+RO_YAML = YAML(typ="safe")
+RO_YAML.width = 4096
+
+
+def load_yaml(filepath: Union[str, Path], *, round_trip: bool = False) -> Any:
+    """
+    Load ``filepath`` with RT_YAML (``round_trip=True``) or RO_YAML (default).
+
+    A falsy result, such as an empty document, is replaced by an empty
+    CommentedMap.
+    Raises FileNotFoundError when ``filepath`` does not exist.
+    """
+    path = Path(filepath)
+    if not path.exists():
+        raise FileNotFoundError(f"YAML file not found: {path}")
+    with open(path, "r", encoding="utf-8") as f:
+        return (RT_YAML if round_trip else RO_YAML).load(f) or CommentedMap()
+
+
+def save_yaml(data: Any, filepath: Union[str, Path]) -> None:
+    """Dump ``data`` to ``filepath`` with RT_YAML, creating parent directories."""
+    target = Path(filepath)
+    target.parent.mkdir(parents=True, exist_ok=True)
+    with target.open("w", encoding="utf-8") as stream:
+        RT_YAML.dump(data, stream)
+
+
+def strip_existing_header(yaml_data) -> None:
+    """
+    Drop comment tokens mentioning "AUTOGENERATED" from ``yaml_data.ca.comment``.
+
+    Does nothing when ``yaml_data`` has no ``ca`` attribute, ``ca`` is falsy, or
+    ``ca.comment`` is missing or None. Blocks left without tokens become None and
+    the rewritten comment list is padded with None to at least two entries.
+    """
+    ca = getattr(yaml_data, "ca", None)
+    if not ca or getattr(ca, "comment", None) is None:
+        return
+
+    kept = []
+    # Filter each block; an emptied block collapses to None.
+    for block in ca.comment:
+        survivors = None
+        if block is not None:
+            survivors = [t for t in block if "AUTOGENERATED" not in t.value] or None
+        kept.append(survivors)
+
+    # Pad so the list always has at least two slots.
+    while len(kept) < 2:
+        kept.append(None)
+
+    ca.comment = kept
diff --git a/projects/rocprofiler-compute/tools/config_management/verify_against_config_template.py b/projects/rocprofiler-compute/tools/config_management/verify_against_config_template.py
index 32b9044edb..0930764c86 100644
--- a/projects/rocprofiler-compute/tools/config_management/verify_against_config_template.py
+++ b/projects/rocprofiler-compute/tools/config_management/verify_against_config_template.py
@@ -23,155 +23,318 @@
 # THE SOFTWARE.
 
 ##############################################################################
+
 """
-Validate panel YAML files against base template ordering.
-Checks that panel configs match expected structure, IDs, titles, and data source order.
+verify_against_config_template.py
+
+Validate per-architecture panel YAMLs against a shared config template.
+- Validate structure + ordering only.
+- Treat any deviation as an error.
+- Collect all errors and report at end.
+
+Template format (generated by parse_config_template.py):
+  latest_arch: gfx###   (optional)
+  panels:
+    - file: <filename without numeric prefix>
+      panel_id: <normalized panel id>
+      panel_title: <title>
+      panel_alias: <optional>
+      data_sources:
+        - type: metric_table|raw_csv_table|...
+          id: <normalized table id>
+          title: <title>
 
 Usage:
-    python verify_against_config_template.py <analysis_configs_dir> <template_yaml>
+  python verify_against_config_template.py <analysis_configs_dir> <template_yaml>
 """
 
 from __future__ import annotations
 
+import argparse
 import sys
+from dataclasses import dataclass
 from pathlib import Path
-from typing import Optional
+from typing import Any, Optional
 
-import yaml
+PROJECT_ROOT = Path(__file__).resolve().parents[1]
+if str(PROJECT_ROOT) not in sys.path:
+    sys.path.insert(0, str(PROJECT_ROOT))
+
+from config_management import utils_ruamel as cm_utils  # noqa: E402
+
+REQUIRED_PANEL_KEYS = ("id", "title", "data source", "metrics_description")
+OPTIONAL_PANEL_KEYS = ("alias",)  # accepted under 'Panel Config' but not required
+DEFAULT_ALLOWED_PANEL_KEYS = set(REQUIRED_PANEL_KEYS) | set(OPTIONAL_PANEL_KEYS)
 
 
 def normalize_panel_id(panel_id: int) -> int:
-    """Normalize panel ID by dividing by 100."""
-    return panel_id // 100 if panel_id and panel_id >= 100 else panel_id
+    return panel_id // 100 if panel_id >= 100 else panel_id
 
 
-def normalize_table_id(table_id: int) -> Optional[int]:
-    """Normalize table ID using modulo 100."""
-    return table_id % 100 if table_id else None
+def normalize_table_id(table_id: int) -> int:
+    return table_id % 100
 
 
-def load_template(template_file: Path) -> dict[int, dict]:
-    """Load template and create lookup by normalized panel ID."""
-    with open(template_file) as f:
-        data = yaml.safe_load(f) or {}
-
-    panels = data.get("panels", [])
-    lookup: dict[int, dict] = {}
-    for panel in panels:
-        pid = normalize_panel_id(panel["panel_id"])
-        lookup[pid] = {
-            "panel_title": panel["panel_title"],
-            "panel_alias": panel.get("panel_alias"),
-            "data_sources": [
-                {"type": ds["type"], "id": ds["id"], "title": ds["title"]}
-                for ds in panel.get("data_sources", [])
-            ],
-        }
-    return lookup
+@dataclass(frozen=True)
+class TemplateDataSource:
+    type: str  # data-source kind key from the template, e.g. metric_table
+    id: int  # table id after normalize_table_id (value modulo 100)
+    title: str  # expected table title
 
 
-def extract_panel_info(yaml_file: Path) -> Optional[dict]:
-    """Extract panel config info from YAML file."""
-    with open(yaml_file) as f:
-        data = yaml.safe_load(f) or {}
-    if "Panel Config" not in data:
-        return None
-
-    panel_config = data["Panel Config"]
-    data_sources = []
-    for ds in panel_config.get("data source", []):
-        for key, value in ds.items():
-            if isinstance(value, dict) and "id" in value and "title" in value:
-                data_sources.append({
-                    "type": key,
-                    "id": normalize_table_id(value["id"]),
-                    "title": value["title"],
-                })
-
-    return {
-        "panel_id": normalize_panel_id(panel_config.get("id")),
-        "panel_title": panel_config.get("title"),
-        "data_sources": data_sources,
-    }
+@dataclass(frozen=True)
+class TemplatePanel:
+    file: str  # template 'file' entry; empty string when absent
+    panel_id: int  # panel id after normalize_panel_id
+    panel_title: str  # expected 'Panel Config' title
+    panel_alias: Any  # raw template 'panel_alias' value; None when absent
+    data_sources: tuple[TemplateDataSource, ...]  # expected sources, template order
 
 
-def validate_panel(
-    yaml_file: Path, panel_info: dict, template: dict[int, dict], stats: dict
-) -> None:
-    """Validate a single panel YAML against template."""
-    panel_id = panel_info["panel_id"]
-    file_path = f"{yaml_file.parent.name}/{yaml_file.name}"
+def _as_str(v: Any) -> str:
+    return str(v) if v is not None else ""  # None maps to the empty string
 
-    if panel_id not in template:
-        print(f"WARNING [{file_path}]: Panel ID {panel_id} not found in template")
-        stats["warnings"] += 1
-        return
 
-    expected = template[panel_id]
+def load_template(
+    template_file: Path,
+) -> tuple[list[TemplatePanel], dict[int, TemplatePanel]]:
+    """Parse the template into ordered panels plus a lookup by normalized panel id.
+
+    Raises ValueError for a malformed template or a duplicate normalized panel id.
+    """
+    data = cm_utils.load_yaml(template_file) or {}
+    if not isinstance(data, dict):
+        raise ValueError("Template YAML must be a mapping with a 'panels' list")
+    panels_raw = data.get("panels", [])
+    if not isinstance(panels_raw, list):
+        raise ValueError("Template YAML must contain a top-level 'panels' list")
+
+    panels: list[TemplatePanel] = []
+    by_id: dict[int, TemplatePanel] = {}
+    for idx, p in enumerate(panels_raw):
+        if not isinstance(p, dict):
+            raise ValueError(f"Template panels[{idx}] must be a mapping")
+        if "panel_id" not in p or "panel_title" not in p:
+            raise ValueError(
+                f"Template panels[{idx}] missing 'panel_id' or 'panel_title'"
+            )
+        pid_raw = p.get("panel_id")
+        # bool is an int subclass; a YAML true/false must not pass as an id.
+        if isinstance(pid_raw, bool) or not isinstance(pid_raw, int):
+            raise ValueError(
+                f"Template panels[{idx}].panel_id must be int, got {pid_raw!r}"
+            )
+        pid = normalize_panel_id(pid_raw)
+
+        ds_list = p.get("data_sources", []) or []
+        if not isinstance(ds_list, list):
+            raise ValueError(f"Template panels[{idx}].data_sources must be list")
+        ds_out: list[TemplateDataSource] = []
+        for j, ds in enumerate(ds_list):
+            if not isinstance(ds, dict):
+                raise ValueError(
+                    f"Template panels[{idx}].data_sources[{j}] must be mapping"
+                )
+            for k in ("type", "id", "title"):
+                if k not in ds:
+                    raise ValueError(
+                        f"Template panels[{idx}].data_sources[{j}] missing '{k}'"
+                    )
+            ds_id = ds["id"]
+            if isinstance(ds_id, bool) or not isinstance(ds_id, int):
+                raise ValueError(
+                    f"Template panels[{idx}].data_sources[{j}].id must be int, "
+                    f"got {ds_id!r}"
+                )
+            ds_out.append(
+                TemplateDataSource(
+                    type=_as_str(ds["type"]),
+                    id=normalize_table_id(ds_id),
+                    title=_as_str(ds["title"]),
+                )
+            )
+
+        panel = TemplatePanel(
+            file=_as_str(p.get("file", "")),
+            panel_id=pid,
+            panel_title=_as_str(p.get("panel_title")),
+            panel_alias=p.get("panel_alias"),
+            data_sources=tuple(ds_out),
+        )
+        if pid in by_id:
+            raise ValueError(f"Duplicate panel_id {pid} in template")
+        panels.append(panel)
+        by_id[pid] = panel
+
+    return panels, by_id
+
+
+def extract_panel_info(
+    yaml_file: Path,
+) -> tuple[Optional[int], dict[str, Any], list[dict[str, Any]]]:
+    """Return (panel_id, panel_config, extracted_data_sources).
+
+    ``panel_id`` is None without a 'Panel Config' mapping or a non-bool int id;
+    data sources lacking such an id or a title are not extracted.
+    """
+    data = cm_utils.load_yaml(yaml_file) or {}
+    # A non-mapping document is reported like a missing 'Panel Config'.
+    panel_config = data.get("Panel Config") if isinstance(data, dict) else None
+    if not isinstance(panel_config, dict):
+        return None, {}, []
+
+    def _is_int(x: Any) -> bool:  # bool subclasses int but is not a usable id
+        return isinstance(x, int) and not isinstance(x, bool)
+
+    pid_raw = panel_config.get("id")
+    pid = normalize_panel_id(pid_raw) if _is_int(pid_raw) else None
+    ds_list = panel_config.get("data source", [])
+    entries = ds_list if isinstance(ds_list, list) else []
+    pairs = [kv for item in entries if isinstance(item, dict) for kv in item.items()]
+    ds_extracted: list[dict[str, Any]] = []
+    for ds_type, value in pairs:
+        if isinstance(value, dict) and "title" in value and _is_int(value.get("id")):
+            ds_extracted.append({
+                "type": str(ds_type),
+                "id": normalize_table_id(value["id"]),
+                "title": _as_str(value.get("title")),
+            })
+    return pid, panel_config, ds_extracted
+
+
+def validate_arch(
+    arch_dir: Path,
+    template_panels: list[TemplatePanel],
+    template_by_id: dict[int, TemplatePanel],
+    allowed_panel_keys: set[str],
+) -> list[str]:
+    """Validate one architecture directory. Returns list of errors."""
     errors: list[str] = []
-    warnings: list[str] = []
 
-    if panel_info["panel_title"] != expected["panel_title"]:
-        errors.append(
-            f"Panel title mismatch: expected '{expected['panel_title']}', "
-            f"got '{panel_info['panel_title']}'"
-        )
+    panel_files = sorted(arch_dir.glob("*.yaml"))
+    actual_by_id: dict[int, Path] = {}
+    actual_order: list[int] = []
 
-    if len(panel_info["data_sources"]) != len(expected["data_sources"]):
-        errors.append(
-            f"Data source count mismatch: expected {len(expected['data_sources'])}, "
-            f"got {len(panel_info['data_sources'])}"
-        )
+    for f in panel_files:
+        pid, panel_config, ds_actual = extract_panel_info(f)
+        rel = f"{arch_dir.name}/{f.name}"
 
-    for i, actual_ds in enumerate(panel_info["data_sources"]):
-        matching_idx = next(
-            (
-                j
-                for j, exp_ds in enumerate(expected["data_sources"])
-                if actual_ds["id"] == exp_ds["id"]
-                and actual_ds["title"] == exp_ds["title"]
-                and actual_ds["type"] == exp_ds["type"]
-            ),
-            None,
-        )
-        if matching_idx is None:
+        if pid is None:
+            errors.append(f"ERROR [{rel}]: Missing or non-integer Panel Config.id")
+            continue
+
+        # required keys
+        missing = [k for k in REQUIRED_PANEL_KEYS if k not in panel_config]
+        if missing:
             errors.append(
-                f"Data source {i + 1}: No matching entry in template for "
-                f"{actual_ds['type']} id={actual_ds['id']} title='{actual_ds['title']}'"
-            )
-        elif matching_idx != i:
-            warnings.append(
-                f"Data source {i + 1}: Order mismatch - appears at position {i + 1} "
-                f"but expected at position {matching_idx + 1}"
+                f"ERROR [{rel}]: Missing required Panel Config keys: "
+                f"{', '.join(missing)}"
             )
 
-    if errors:
-        print(f"ERROR [{file_path}]:")
-        for error in errors:
-            print(f"  - {error}")
-        stats["errors"] += len(errors)
-        stats["failed_files"] += 1
-    elif warnings:
-        print(f"WARNING [{file_path}]:")
-        for warning in warnings:
-            print(f"  - {warning}")
-        stats["warnings"] += len(warnings)
-        stats["passed_files"] += 1
-    else:
-        print(f"PASS [{file_path}]")
-        stats["passed_files"] += 1
+        # prohibited keys (unknown keys)
+        for k in panel_config.keys():
+            if k not in allowed_panel_keys:
+                errors.append(
+                    f"ERROR [{rel}]: Prohibited/unknown Panel Config key '{k}' "
+                    f"(allowed: {sorted(allowed_panel_keys)})"
+                )
+
+        # panel must exist in template
+        if pid not in template_by_id:
+            errors.append(f"ERROR [{rel}]: Panel ID {pid} not found in template")
+        else:
+            expected = template_by_id[pid]
+            actual_title = _as_str(panel_config.get("title"))
+            if actual_title != expected.panel_title:
+                errors.append(
+                    f"ERROR [{rel}]: Panel title mismatch for id {pid}: "
+                    f"expected '{expected.panel_title}', got '{actual_title}'"
+                )
+
+            # data sources must match count + order strictly
+            if len(ds_actual) != len(expected.data_sources):
+                errors.append(
+                    f"ERROR [{rel}]: Data source count mismatch for panel "
+                    f"{pid}: expected {len(expected.data_sources)}, "
+                    f"got {len(ds_actual)}"
+                )
+
+            for i, exp_ds in enumerate(expected.data_sources):
+                if i >= len(ds_actual):
+                    break
+                act = ds_actual[i]
+                if (
+                    act["type"] != exp_ds.type
+                    or act["id"] != exp_ds.id
+                    or act["title"] != exp_ds.title
+                ):
+                    errors.append(
+                        f"ERROR [{rel}]: Data source #{i + 1} mismatch "
+                        f"for panel {pid}: expected {exp_ds.type} id={exp_ds.id} "
+                        f"title='{exp_ds.title}', got {act['type']} "
+                        f"id={act['id']} title='{act['title']}'"
+                    )
+
+        # duplicates
+        if pid in actual_by_id:
+            errors.append(
+                f"ERROR [{rel}]: Duplicate panel id {pid} "
+                f"(also in {arch_dir.name}/{actual_by_id[pid].name})"
+            )
+        else:
+            actual_by_id[pid] = f
+            actual_order.append(pid)
+
+    # missing / extra panels
+    expected_ids = [p.panel_id for p in template_panels]
+    actual_ids = set(actual_by_id.keys())
+    expected_set = set(expected_ids)
+
+    for pid in expected_ids:
+        if pid not in actual_ids:
+            errors.append(
+                f"ERROR [{arch_dir.name}]: Missing panel id {pid} required by template"
+            )
+
+    for pid in sorted(actual_ids - expected_set):
+        errors.append(
+            f"ERROR [{arch_dir.name}/{actual_by_id[pid].name}]: "
+            f"Extra panel id {pid} not present in template"
+        )
+
+    # panel ordering (by sorted file name); extras are reported above, so skip them
+    expected_order = [pid for pid in expected_ids if pid in actual_ids]
+    known_order = [pid for pid in actual_order if pid in expected_set]
+    for i, (a, e) in enumerate(zip(known_order, expected_order)):
+        if a != e:
+            errors.append(
+                f"ERROR [{arch_dir.name}]: Panel file order mismatch at position "
+                f"{i + 1}: expected panel id {e}, got {a} "
+                "(files must follow template order)"
+            )
+            break
+
+    return errors
 
 
 def main() -> None:
-    if len(sys.argv) != 3:
-        print(
-            "Usage: python verify_against_config_template.py "
-            "<analysis_configs_dir> <template_yaml>"
-        )
-        sys.exit(1)
+    parser = argparse.ArgumentParser(
+        description="Validate per-arch panel YAMLs against a shared config template."
+    )
+    parser.add_argument(
+        "analysis_configs_dir", help="Directory containing architecture subdirs"
+    )
+    parser.add_argument("template_yaml", help="Template YAML (config_template.yaml)")
+    parser.add_argument(
+        "--allow-panel-key",
+        action="append",
+        default=[],
+        help="Allow an additional key under 'Panel Config' (repeatable)",
+    )
+    args = parser.parse_args()
 
-    configs_dir = Path(sys.argv[1])
-    template_file = Path(sys.argv[2])
+    configs_dir = Path(args.analysis_configs_dir)
+    template_file = Path(args.template_yaml)
 
     if not configs_dir.is_dir():
         print(f"Error: {configs_dir} is not a directory")
@@ -180,45 +343,40 @@ def main() -> None:
         print(f"Error: {template_file} is not a file")
         sys.exit(1)
 
+    template_panels, template_by_id = load_template(template_file)
+    allowed_panel_keys = set(DEFAULT_ALLOWED_PANEL_KEYS) | set(args.allow_panel_key)
     print(f"Loading template from {template_file}")
-    template = load_template(template_file)
-    print(f"Template loaded: {len(template)} panels\n")
+    print(f"Template loaded: {len(template_panels)} panels\n")
 
-    stats = {
-        "total_files": 0,
-        "passed_files": 0,
-        "failed_files": 0,
-        "errors": 0,
-        "warnings": 0,
-    }
+    all_errors: list[str] = []
+    total_arches = 0
 
     for arch_dir in sorted(configs_dir.iterdir()):
         if not arch_dir.is_dir():
             continue
+        total_arches += 1
         print(f"{'=' * 80}\nValidating architecture: {arch_dir.name}\n{'=' * 80}")
-        for yaml_file in sorted(arch_dir.glob("*.yaml")):
-            stats["total_files"] += 1
-            panel_info = extract_panel_info(yaml_file)
-            if panel_info:
-                validate_panel(yaml_file, panel_info, template, stats)
-            else:
-                print(f"ERROR [{arch_dir.name}/{yaml_file.name}]: Invalid panel config")
-                stats["errors"] += 1
-                stats["failed_files"] += 1
+        arch_errors = validate_arch(
+            arch_dir=arch_dir,
+            template_panels=template_panels,
+            template_by_id=template_by_id,
+            allowed_panel_keys=allowed_panel_keys,
+        )
+        if arch_errors:
+            for e in arch_errors:
+                print(e)
+            all_errors.extend(arch_errors)
+        else:
+            print(f"PASS [{arch_dir.name}]: All panel YAMLs match template")
         print()
 
     print(f"{'=' * 80}\nVALIDATION SUMMARY\n{'=' * 80}")
-    print(f"Total files checked: {stats['total_files']}")
-    print(f"Passed: {stats['passed_files']}")
-    print(f"Failed: {stats['failed_files']}")
-    print(f"Total errors: {stats['errors']}")
-    print(f"Total warnings: {stats['warnings']}")
+    print(f"Architectures checked: {total_arches}")
+    print(f"Total errors: {len(all_errors)}")
 
-    if stats["failed_files"] > 0:
+    if all_errors:
         print("\nValidation FAILED")
         sys.exit(1)
-    elif stats["warnings"] > 0:
-        print("\nValidation PASSED with warnings")
     else:
         print("\nValidation PASSED")
 
diff --git a/projects/rocprofiler-compute/tools/split_config.py b/projects/rocprofiler-compute/tools/split_config.py
deleted file mode 100644
index d1f0a55ca3..0000000000
--- a/projects/rocprofiler-compute/tools/split_config.py
+++ /dev/null
@@ -1,307 +0,0 @@
-##############################################################################
-# MIT License
-#
-# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-
-##############################################################################
-
-# NOTES
-#
-# Read tools/unified_config.yaml and split it into per gfx architecture per panel
-# config files. WARNING: This script will overwrite existing files under per gfx
-# architecture folders under src/rocprof_compute_soc/analysis_configs.
-#
-# Read tools/unified_config.yaml and split it into metric tables per documentation
-# section.
-# WARNING: This script will overwrite existing docs/data/metrics_description.yaml.
-
-import copy
-import hashlib
-import re
-from pathlib import Path
-
-import yaml
-
-# Get root directory of the project
-ROOT_DIR = Path(__file__).parent.parent
-SOURCE_DIR = ROOT_DIR / "tools"
-TARGET_DIR = ROOT_DIR / "src" / "rocprof_compute_soc" / "analysis_configs"
-SETS_TARGET_DIR = ROOT_DIR / "src" / "rocprof_compute_soc" / "profile_configs" / "sets"
-DOC_TARGET_DIR = ROOT_DIR / "docs" / "data"
-AUTOGEN_TEXT = (
-    "# AUTOGENERATED FILE. Only edit for testing purposes, not for development. "
-    "Generated from tools/unified_config.yaml. Generated by tools/split_config.py\n"
-)
-HASH_FILE = ROOT_DIR / "tools" / "autogen_hash.yaml"
-HASH_FILE_MAP = {}
-GFX_VERSIONS = ["gfx908", "gfx90a", "gfx940", "gfx941", "gfx942", "gfx950"]
-METRIC_ID_TO_NAME_MAP = {gfx_version: {} for gfx_version in GFX_VERSIONS}
-
-
-def str_representer(dumper, data):
-    if "\n" in data:
-        return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
-    return dumper.represent_scalar("tag:yaml.org,2002:str", data)
-
-
-yaml.add_representer(str, str_representer)
-
-
-def get_autogen_text(config_file="tools/unified_config.yaml"):
-    return (
-        f"# AUTOGENERATED FILE. Only edit for testing purposes, "
-        f"not for development. Generated from {config_file}. "
-        f"Generated by tools/split_config.py\n"
-    )
-
-
-def update_analysis_config():
-    global METRIC_ID_TO_NAME_MAP
-
-    # Read the unified config file
-    with open(SOURCE_DIR / "unified_config.yaml") as file:
-        unified_config = yaml.safe_load(file)
-
-    # Create per panel config file
-    for panel_config in unified_config["panels"]:
-        new_panel_config = {"Panel Config": {}}
-        new_panel_config["Panel Config"]["id"] = panel_config["id"]
-        new_panel_config["Panel Config"]["title"] = panel_config["title"]
-
-        panel_id_int = panel_config["id"]
-        # Convert int into str with 4 digits
-        panel_id = str(panel_config["id"]).zfill(4)
-        # Replace parentehsis, hyphen, slash and space with underscore
-        # Remove duplicate underscore
-        # Convert to lower case
-        panel_title = re.sub(r"[()\-/ ]+", "_", panel_config["title"])
-        panel_title = "_".join(filter(None, panel_title.split("_")))
-        panel_title = panel_title.lower()
-
-        for gfx_version in GFX_VERSIONS:
-            # Create per gfx architecture folder
-            gfx_dir = TARGET_DIR / gfx_version
-            # Create directory if it doesn't exist
-            if not gfx_dir.exists():
-                gfx_dir.mkdir()
-                print(f"Created directory: {gfx_dir}")
-
-            # Collect metrics for this gfx_version
-            gfx_metrics = []
-
-            # Select metrics from current gfx arch
-            new_panel_config["Panel Config"]["data source"] = []
-            for data_source_index, data_source_config in enumerate(
-                panel_config["data source"]
-            ):
-                data_source_config = copy.deepcopy(data_source_config)
-                if "metric_table" in data_source_config:
-                    data_source_config["metric_table"]["metric"] = data_source_config[
-                        "metric_table"
-                    ]["metric"][gfx_version]
-
-                    # Collect metric names for this gfx version (preserve order)
-                    for metric_name in data_source_config["metric_table"][
-                        "metric"
-                    ].keys():
-                        if metric_name not in gfx_metrics:
-                            gfx_metrics.append(metric_name)
-
-                    build_metric_id_mapping(
-                        panel_id_int,
-                        data_source_index,
-                        data_source_config["metric_table"]["metric"],
-                        gfx_version,
-                    )
-                new_panel_config["Panel Config"]["data source"].append(
-                    data_source_config
-                )
-
-            # Only include metric descriptions for metrics that exist in this gfx
-            new_panel_config["Panel Config"]["metrics_description"] = {
-                key: value["plain"].strip()
-                for key, value in panel_config.get("metrics_description", {}).items()
-                if key in gfx_metrics
-            }
-
-            # Write panel config to file
-            filename = TARGET_DIR / gfx_version / f"{panel_id}_{panel_title}.yaml"
-            with open(filename, "w") as file:
-                file.write(get_autogen_text())
-                yaml.dump(new_panel_config, file, sort_keys=False)
-                print(f"File write: {filename}")
-            # Calculate hash of filename
-            HASH_FILE_MAP[str(filename.relative_to(ROOT_DIR))] = hashlib.sha256(
-                filename.read_bytes()
-            ).hexdigest()
-
-
-def build_metric_id_mapping(panel_id, data_source_index, metrics, gfx_version):
-    # Build metric id to metric name mapping
-    global METRIC_ID_TO_NAME_MAP
-    for metric_index, metric_name in enumerate(metrics.keys()):
-        metric_id = f"{panel_id // 100}.{data_source_index + 1}.{metric_index}"
-        METRIC_ID_TO_NAME_MAP[gfx_version][str(metric_id)] = metric_name
-
-
-def update_sets_config():
-    # Create directory if it doesn't exist
-    if not SETS_TARGET_DIR.exists():
-        SETS_TARGET_DIR.mkdir()
-        print(f"Created directory: {SETS_TARGET_DIR}")
-
-    # Read the unified config file
-    with open(SOURCE_DIR / "unified_sets.yaml") as file:
-        unified_sets = yaml.safe_load(file)
-
-    # Create per gfx version file
-    for gfx_version in GFX_VERSIONS:
-        new_sets = {"sets": []}
-
-        for sets in unified_sets["sets"]:
-            # Create new set object for each set
-            current_set = {
-                "title": sets["title"],
-                "set_option": sets["set_option"],
-                "description": sets["description"],
-                "metric": [],
-            }
-
-            for metric_id in sets["metric"][gfx_version]:
-                current_set["metric"].append({
-                    metric_id: METRIC_ID_TO_NAME_MAP[gfx_version][str(metric_id)]
-                })
-
-            new_sets["sets"].append(current_set)
-
-        # Write gfx version sets to file
-        filename = SETS_TARGET_DIR / f"{gfx_version}_sets.yaml"
-        with open(filename, "w") as file:
-            file.write(get_autogen_text("tools/unified_sets.yaml"))
-            yaml.dump(new_sets, file, sort_keys=False)
-            print(f"File write: {filename}")
-        # Calculate hash of filename
-        HASH_FILE_MAP[str(filename.relative_to(ROOT_DIR))] = hashlib.sha256(
-            filename.read_bytes()
-        ).hexdigest()
-
-
-def update_documentation():
-    # Documentation sections
-    section_panel_map = {
-        "Wavefront launch stats": 701,
-        "Wavefront runtime stats": 702,
-        "Overall instruction mix": 1001,
-        "VALU arithmetic instruction mix": 1002,
-        "MFMA instruction mix": 1004,
-        "Compute Speed-of-Light": 1101,
-        "Pipeline statistics": 1102,
-        "Arithmetic operations": 1103,
-        "LDS Speed-of-Light": 1201,
-        "LDS Statistics": 1202,
-        "vL1D Speed-of-Light": 1601,
-        "Busy / stall metrics": 1501,
-        "Instruction counts": 1502,
-        "Spill / stack metrics": 1503,
-        "L1 Unified Translation Cache (UTCL1)": 1605,
-        "vL1D cache stall metrics": 1602,
-        "vL1D cache access metrics": 1603,
-        "Vector L1 data-return path or Texture Data (TD)": 1504,
-        "L2 Speed-of-Light": 1701,
-        "L2 cache accesses": 1703,
-        "L2-Fabric interface metrics": 1702,
-        "L2 - Fabric interface detailed metrics": 1706,
-        "L2 - Fabric Interface stalls": 1705,
-        "Scalar L1D Speed-of-Light": 1401,
-        "Scalar L1D cache accesses": 1402,
-        "Scalar L1D Cache - L2 Interface": 1403,
-        "L1I Speed-of-Light": 1301,
-        "L1I cache accesses": 1302,
-        "L1I <-> L2 interface": 1303,
-        "Workgroup manager utilizations": 601,
-        "Workgroup Manager - Resource Allocation": 602,
-        "Command processor fetcher (CPF)": 501,
-        "Command processor packet processor (CPC)": 502,
-        "System Speed-of-Light": 201,
-    }
-
-    # Read the unified config file
-    with open(SOURCE_DIR / "unified_config.yaml") as file:
-        unified_config = yaml.safe_load(file)
-
-    panel_metric_map = {}
-    for panel_config in unified_config["panels"]:
-        for data_source in panel_config["data source"]:
-            if "metric_table" in data_source:
-                metrics_info = {}
-                # Metric names from data source
-                metric_names = {
-                    metric
-                    for _, gfx_data in data_source["metric_table"]["metric"].items()
-                    for metric in gfx_data
-                }
-                # Select metrics with descriptions available
-                metric_names = metric_names.intersection(
-                    panel_config["metrics_description"].keys()
-                )
-                # Add metrics info
-                for metric_name in sorted(list(metric_names)):
-                    metrics_info[metric_name] = {
-                        "rst": panel_config["metrics_description"][metric_name][
-                            "rst"
-                        ].strip(),
-                        "unit": panel_config["metrics_description"][metric_name][
-                            "unit"
-                        ],
-                    }
-                panel_metric_map[data_source["metric_table"]["id"]] = metrics_info
-
-    # Merge panel_metric_map with section_panel_map
-    section_metric_map = {}
-    for section, panel_id in section_panel_map.items():
-        if panel_id in panel_metric_map:
-            section_metric_map[section] = panel_metric_map[panel_id]
-
-    # Write documentation metrics description file
-    filename = DOC_TARGET_DIR / "metrics_description.yaml"
-    with open(filename, "w") as file:
-        file.write(get_autogen_text())
-        yaml.dump(section_metric_map, file, sort_keys=False)
-        print(f"File write: {filename}")
-    # Calculate hash of filename
-    HASH_FILE_MAP[str(filename.relative_to(ROOT_DIR))] = hashlib.sha256(
-        filename.read_bytes()
-    ).hexdigest()
-
-
-def update_hash():
-    # Write hash file
-    with open(HASH_FILE, "w") as file:
-        file.write(get_autogen_text())
-        yaml.dump(HASH_FILE_MAP, file, sort_keys=False)
-        print(f"File write: {HASH_FILE}")
-
-
-if __name__ == "__main__":
-    update_analysis_config()
-    update_sets_config()
-    update_documentation()
-    update_hash()
diff --git a/projects/rocprofiler-compute/tools/unified_config.yaml b/projects/rocprofiler-compute/tools/unified_config.yaml
deleted file mode 100644
index d157b14ac1..0000000000
--- a/projects/rocprofiler-compute/tools/unified_config.yaml
+++ /dev/null
@@ -1,17736 +0,0 @@
-# NOTE: Please run tools/split_config.py after making changes to this file to auto-generate configs
-panels:
-- id: 0
-  title: Top Stats
-  data source:
-  - raw_csv_table:
-      id: 1
-      title: Top Kernels
-      source: pmc_kernel_top.csv
-  - raw_csv_table:
-      id: 2
-      title: Dispatch List
-      source: pmc_dispatch_info.csv
-- id: 100
-  title: System Info
-  data source:
-  - raw_csv_table:
-      id: 101
-      title: System Info
-      source: sysinfo.csv
-      columnwise: true
-- id: 200
-  title: System Speed-of-Light
-  data source:
-  - metric_table:
-      id: 201
-      title: System Speed-of-Light
-      header:
-        metric: Metric
-        value: Avg
-        unit: Unit
-        peak: Peak
-        pop: Pct of Peak
-      metric:
-        gfx90a:
-          VALU FLOPs:
-            value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
-              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
-              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP/s
-            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
-            pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
-              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
-              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))))
-              / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
-          VALU IOPs:
-            value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
-              - Start_Timestamp)))
-            unit: GIOP/s
-            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
-            pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) /
-              (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu)
-              * 64) * 2) / 1000))
-          MFMA FLOPs (BF16):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP/s
-            peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000))
-          MFMA FLOPs (F16):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP/s
-            peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000))
-          MFMA FLOPs (F32):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP/s
-            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
-          MFMA FLOPs (F64):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP/s
-            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
-          MFMA IOPs (Int8):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GIOP/s
-            peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000))
-          Active CUs:
-            value: $numActiveCUs
-            unit: CUs
-            peak: $cu_per_gpu
-            pop: ((100 * $numActiveCUs) / $cu_per_gpu)
-          SALU Utilization:
-            value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: pct
-            peak: 100
-            pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-          VALU Utilization:
-            value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu)))
-            unit: pct
-            peak: 100
-            pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-          MFMA Utilization:
-            value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu) * 4)))
-            unit: pct
-            peak: 100
-            pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu) * 4)))
-          VMEM Utilization:
-            value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
-              / $cu_per_gpu))
-            unit: pct
-            peak: 100
-            pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
-              / $cu_per_gpu))
-          Branch Utilization:
-            value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) /
-              $cu_per_gpu))
-            unit: pct
-            peak: 100
-            pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-          VALU Active Threads:
-            value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
-              != 0) else None))
-            unit: Threads
-            peak: 64
-            pop: (AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
-              != 0) else None)) * 1.5625)
-          IPC:
-            value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
-            unit: Instr/cycle
-            peak: 5
-            pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5)
-          Wavefront Occupancy:
-            value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
-            unit: Wavefronts
-            peak: ($max_waves_per_cu * $cu_per_gpu)
-            pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu
-              * $cu_per_gpu))))
-            coll_level: SQ_LEVEL_WAVES
-          Theoretical LDS Bandwidth:
-            value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-            peak: (($max_sclk * $cu_per_gpu) * 0.128)
-            pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
-              0.00128)))
-          LDS Bank Conflicts/Access:
-            value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
-            unit: Conflicts/access
-            peak: 32
-            pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) /
-              32)
-          vL1D Cache Hit Rate:
-            value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-            unit: pct
-            peak: 100
-            pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-          vL1D Cache BW:
-            value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-            peak: ((($max_sclk / 1000) * 64) * $cu_per_gpu)
-            pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk / 1000) * 64) * $cu_per_gpu))
-          L2 Cache Hit Rate:
-            value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else None))
-            unit: pct
-            peak: 100
-            pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else None))
-          L2 Cache BW:
-            value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-            peak: ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan))
-            pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
-              / ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan)))
-          L2-Fabric Read BW:
-            value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-            peak: $hbmBandwidth
-            pop: ((100 * AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum -
-              TCC_EA_RDREQ_32B_sum) * 64)) / (End_Timestamp - Start_Timestamp))))
-              / $hbmBandwidth)
-          L2-Fabric Write BW:
-            value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum)
-              * 32)) / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-            peak: $hbmBandwidth
-            pop: ((100 * AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum -
-              TCC_EA_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))))
-              / $hbmBandwidth)
-          L2-Fabric Read Latency:
-            value: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
-              != 0) else None))
-            unit: Cycles
-            peak: None
-            pop: None
-          L2-Fabric Write Latency:
-            value: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
-              != 0) else None))
-            unit: Cycles
-            peak: None
-            pop: None
-          sL1D Cache Hit Rate:
-            value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
-              if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
-            unit: pct
-            peak: 100
-            pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
-              if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
-          sL1D Cache BW:
-            value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
-            unit: GB/s
-            peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
-            pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp))
-              * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
-          L1I Hit Rate:
-            value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
-            unit: pct
-            peak: 100
-            pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
-          L1I BW:
-            value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
-            unit: GB/s
-            peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
-            pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp))
-              * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
-          L1I Fetch Latency:
-            value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
-            unit: Cycles
-            peak: None
-            pop: None
-            coll_level: SQ_IFETCH_LEVEL
-        gfx941:
-          VALU FLOPs:
-            value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
-              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
-              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP/s
-            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
-            pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
-              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
-              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))))
-              / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
-          VALU IOPs:
-            value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
-              - Start_Timestamp)))
-            unit: GIOP/s
-            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
-            pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) /
-              (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu)
-              * 64) * 2) / 1000))
-          MFMA FLOPs (F8):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP
-            peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
-          MFMA FLOPs (BF16):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP/s
-            peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
-          MFMA FLOPs (F16):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP/s
-            peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
-          MFMA FLOPs (F32):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP/s
-            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
-          MFMA FLOPs (F64):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP/s
-            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
-          MFMA IOPs (Int8):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GIOP/s
-            peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
-          Active CUs:
-            value: $numActiveCUs
-            unit: CUs
-            peak: $cu_per_gpu
-            pop: ((100 * $numActiveCUs) / $cu_per_gpu)
-          SALU Utilization:
-            value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: pct
-            peak: 100
-            pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-          VALU Utilization:
-            value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu)))
-            unit: pct
-            peak: 100
-            pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-          MFMA Utilization:
-            value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu) * 4)))
-            unit: pct
-            peak: 100
-            pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu) * 4)))
-          VMEM Utilization:
-            value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
-              / $cu_per_gpu))
-            unit: pct
-            peak: 100
-            pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
-              / $cu_per_gpu))
-          Branch Utilization:
-            value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) /
-              $cu_per_gpu))
-            unit: pct
-            peak: 100
-            pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-          VALU Active Threads:
-            value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
-              != 0) else None))
-            unit: Threads
-            peak: $wave_size
-            pop: (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size)
-              if (SQ_ACTIVE_INST_VALU != 0) else None))
-          IPC:
-            value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
-            unit: Instr/cycle
-            peak: 5
-            pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5)
-          Wavefront Occupancy:
-            value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
-            unit: Wavefronts
-            peak: ($max_waves_per_cu * $cu_per_gpu)
-            pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu
-              * $cu_per_gpu))))
-            coll_level: SQ_LEVEL_WAVES
-          Theoretical LDS Bandwidth:
-            value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-            peak: (($max_sclk * $cu_per_gpu) * 0.128)
-            pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
-              0.00128)))
-          LDS Bank Conflicts/Access:
-            value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
-            unit: Conflicts/access
-            peak: 32
-            pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) /
-              32)
-          vL1D Cache Hit Rate:
-            value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-            unit: pct
-            peak: 100
-            pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-          vL1D Cache BW:
-            value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-            peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu)
-            pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu))
-          L2 Cache Hit Rate:
-            value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else None))
-            unit: pct
-            peak: 100
-            pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else None))
-          L2 Cache BW:
-            value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-            peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))
-            pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
-              / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
-          L2-Fabric Read BW:
-            value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
-              - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
-              - Start_Timestamp))
-            unit: GB/s
-            peak: $hbmBandwidth
-            pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
-              - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
-              - Start_Timestamp)))) / $hbmBandwidth)
-          L2-Fabric Write BW:
-            value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
-              * 32)) / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-            peak: $hbmBandwidth
-            pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum
-              - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))))
-              / $hbmBandwidth)
-          L2-Fabric Read Latency:
-            value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            unit: Cycles
-            peak: None
-            pop: None
-          L2-Fabric Write Latency:
-            value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            unit: Cycles
-            peak: None
-            pop: None
-          sL1D Cache Hit Rate:
-            value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
-              if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
-            unit: pct
-            peak: 100
-            pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
-              if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
-          sL1D Cache BW:
-            value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
-            unit: GB/s
-            peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
-            pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp))
-              * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
-          L1I Hit Rate:
-            value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
-            unit: pct
-            peak: 100
-            pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
-          L1I BW:
-            value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
-            unit: GB/s
-            peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
-            pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp))
-              * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
-          L1I Fetch Latency:
-            value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
-            unit: Cycles
-            peak: None
-            pop: None
-            coll_level: SQ_IFETCH_LEVEL
-        gfx940:
-          VALU FLOPs:
-            value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
-              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
-              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP/s
-            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
-            pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
-              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
-              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))))
-              / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
-          VALU IOPs:
-            value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
-              - Start_Timestamp)))
-            unit: GIOP/s
-            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
-            pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) /
-              (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu)
-              * 64) * 2) / 1000))
-          MFMA FLOPs (F8):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP/s
-            peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
-          MFMA FLOPs (BF16):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP/s
-            peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
-          MFMA FLOPs (F16):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP/s
-            peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
-          MFMA FLOPs (F32):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP/s
-            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
-          MFMA FLOPs (F64):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP/s
-            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
-          MFMA IOPs (Int8):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GIOP/s
-            peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
-          Active CUs:
-            value: $numActiveCUs
-            unit: CUs
-            peak: $cu_per_gpu
-            pop: ((100 * $numActiveCUs) / $cu_per_gpu)
-          SALU Utilization:
-            value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: pct
-            peak: 100
-            pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-          VALU Utilization:
-            value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu)))
-            unit: pct
-            peak: 100
-            pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-          MFMA Utilization:
-            value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu) * 4)))
-            unit: pct
-            peak: 100
-            pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu) * 4)))
-          VMEM Utilization:
-            value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
-              / $cu_per_gpu))
-            unit: pct
-            peak: 100
-            pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
-              / $cu_per_gpu))
-          Branch Utilization:
-            value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) /
-              $cu_per_gpu))
-            unit: pct
-            peak: 100
-            pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-          VALU Active Threads:
-            value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
-              != 0) else None))
-            unit: Threads
-            peak: $wave_size
-            pop: (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size)
-              if (SQ_ACTIVE_INST_VALU != 0) else None))
-          IPC:
-            value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
-            unit: Instr/cycle
-            peak: 5
-            pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5)
-          Wavefront Occupancy:
-            value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
-            unit: Wavefronts
-            peak: ($max_waves_per_cu * $cu_per_gpu)
-            pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu
-              * $cu_per_gpu))))
-            coll_level: SQ_LEVEL_WAVES
-          Theoretical LDS Bandwidth:
-            value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-            peak: (($max_sclk * $cu_per_gpu) * 0.128)
-            pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
-              0.00128)))
-          LDS Bank Conflicts/Access:
-            value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
-            unit: Conflicts/access
-            peak: 32
-            pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) /
-              32)
-          vL1D Cache Hit Rate:
-            value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-            unit: pct
-            peak: 100
-            pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-          vL1D Cache BW:
-            value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-            peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu)
-            pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu))
-          L2 Cache Hit Rate:
-            value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else None))
-            unit: pct
-            peak: 100
-            pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else None))
-          L2 Cache BW:
-            value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-            peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))
-            pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
-              / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
-          L2-Fabric Read BW:
-            value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
-              - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
-              - Start_Timestamp))
-            unit: GB/s
-            peak: $hbmBandwidth
-            pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
-              - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
-              - Start_Timestamp)))) / $hbmBandwidth)
-          L2-Fabric Write BW:
-            value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
-              * 32)) / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-            peak: $hbmBandwidth
-            pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum
-              - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))))
-              / $hbmBandwidth)
-          L2-Fabric Read Latency:
-            value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            unit: Cycles
-            peak: None
-            pop: None
-          L2-Fabric Write Latency:
-            value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            unit: Cycles
-            peak: None
-            pop: None
-          sL1D Cache Hit Rate:
-            value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
-              if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
-            unit: pct
-            peak: 100
-            pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
-              if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
-          sL1D Cache BW:
-            value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
-            unit: GB/s
-            peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
-            pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp))
-              * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
-          L1I Hit Rate:
-            value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
-            unit: pct
-            peak: 100
-            pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
-          L1I BW:
-            value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
-            unit: GB/s
-            peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
-            pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp))
-              * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
-          L1I Fetch Latency:
-            value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
-            unit: Cycles
-            peak: None
-            pop: None
-            coll_level: SQ_IFETCH_LEVEL
-        gfx942:
-          VALU FLOPs:
-            value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
-              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
-              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP/s
-            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
-            pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
-              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
-              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))))
-              / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
-          VALU IOPs:
-            value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
-              - Start_Timestamp)))
-            unit: GIOP/s
-            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
-            pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) /
-              (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu)
-              * 64) * 2) / 1000))
-          MFMA FLOPs (F8):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP/s
-            peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
-          MFMA FLOPs (BF16):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP/s
-            peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
-          MFMA FLOPs (F16):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP/s
-            peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
-          MFMA FLOPs (F32):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP/s
-            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
-          MFMA FLOPs (F64):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP/s
-            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
-          MFMA IOPs (Int8):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GIOP/s
-            peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
-          Active CUs:
-            value: $numActiveCUs
-            unit: CUs
-            peak: $cu_per_gpu
-            pop: ((100 * $numActiveCUs) / $cu_per_gpu)
-          SALU Utilization:
-            value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: pct
-            peak: 100
-            pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-          VALU Utilization:
-            value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu)))
-            unit: pct
-            peak: 100
-            pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-          MFMA Utilization:
-            value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu) * 4)))
-            unit: pct
-            peak: 100
-            pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu) * 4)))
-          VMEM Utilization:
-            value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
-              / $cu_per_gpu))
-            unit: pct
-            peak: 100
-            pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
-              / $cu_per_gpu))
-          Branch Utilization:
-            value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) /
-              $cu_per_gpu))
-            unit: pct
-            peak: 100
-            pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-          VALU Active Threads:
-            value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
-              != 0) else None))
-            unit: Threads
-            peak: $wave_size
-            pop: (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size)
-              if (SQ_ACTIVE_INST_VALU != 0) else None))
-          IPC:
-            value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
-            unit: Instr/cycle
-            peak: 5
-            pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5)
-          Wavefront Occupancy:
-            value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
-            unit: Wavefronts
-            peak: ($max_waves_per_cu * $cu_per_gpu)
-            pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu
-              * $cu_per_gpu))))
-            coll_level: SQ_LEVEL_WAVES
-          Theoretical LDS Bandwidth:
-            value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-            peak: (($max_sclk * $cu_per_gpu) * 0.128)
-            pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
-              0.00128)))
-          LDS Bank Conflicts/Access:
-            value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
-            unit: Conflicts/access
-            peak: 32
-            pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) /
-              32)
-          vL1D Cache Hit Rate:
-            value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-            unit: pct
-            peak: 100
-            pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-          vL1D Cache BW:
-            value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-            peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu)
-            pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu))
-          L2 Cache Hit Rate:
-            value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else None))
-            unit: pct
-            peak: 100
-            pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else None))
-          L2 Cache BW:
-            value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-            peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))
-            pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
-              / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
-          L2-Fabric Read BW:
-            value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
-              - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
-              - Start_Timestamp))
-            unit: GB/s
-            peak: $hbmBandwidth
-            pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
-              - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
-              - Start_Timestamp)))) / $hbmBandwidth)
-          L2-Fabric Write BW:
-            value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
-              * 32)) / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-            peak: $hbmBandwidth
-            pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum
-              - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))))
-              / $hbmBandwidth)
-          L2-Fabric Read Latency:
-            value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            unit: Cycles
-            peak: None
-            pop: None
-          L2-Fabric Write Latency:
-            value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            unit: Cycles
-            peak: None
-            pop: None
-          sL1D Cache Hit Rate:
-            value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
-              if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
-            unit: pct
-            peak: 100
-            pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
-              if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
-          sL1D Cache BW:
-            value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
-            unit: GB/s
-            peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
-            pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp))
-              * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
-          L1I Hit Rate:
-            value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
-            unit: pct
-            peak: 100
-            pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
-          L1I BW:
-            value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
-            unit: GB/s
-            peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
-            pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp))
-              * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
-          L1I Fetch Latency:
-            value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
-            unit: Cycles
-            peak: None
-            pop: None
-            coll_level: SQ_IFETCH_LEVEL
-        gfx950:
-          VALU FLOPs:
-            value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
-              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
-              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP/s
-            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
-            pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
-              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
-              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))))
-              / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
-          VALU IOPs:
-            value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
-              - Start_Timestamp)))
-            unit: GIOP/s
-            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
-            pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) /
-              (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu)
-              * 64) * 2) / 1000))
-          MFMA FLOPs (F8):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP/s
-            peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
-          MFMA FLOPs (BF16):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP/s
-            peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
-          MFMA FLOPs (F16):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP/s
-            peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
-          MFMA FLOPs (F32):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP/s
-            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
-          MFMA FLOPs (F64):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP/s
-            peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000))
-          MFMA FLOPs (F6F4):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP/s
-            peak: ((($max_sclk * $cu_per_gpu) * 16834) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 16834) / 1000))
-          MFMA IOPs (Int8):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GIOP/s
-            peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
-          Active CUs:
-            value: $numActiveCUs
-            unit: CUs
-            peak: $cu_per_gpu
-            pop: ((100 * $numActiveCUs) / $cu_per_gpu)
-          SALU Utilization:
-            value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: pct
-            peak: 100
-            pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-          VALU Utilization:
-            value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu)))
-            unit: pct
-            peak: 100
-            pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-          MFMA Utilization:
-            value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu) * 4)))
-            unit: pct
-            peak: 100
-            pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu) * 4)))
-          VMEM Utilization:
-            value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
-              / $cu_per_gpu))
-            unit: pct
-            peak: 100
-            pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
-              / $cu_per_gpu))
-          Branch Utilization:
-            value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) /
-              $cu_per_gpu))
-            unit: pct
-            peak: 100
-            pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-          VALU Active Threads:
-            value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
-              != 0) else None))
-            unit: Threads
-            peak: $wave_size
-            pop: (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size)
-              if (SQ_ACTIVE_INST_VALU != 0) else None))
-          IPC:
-            value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
-            unit: Instr/cycle
-            peak: 5
-            pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5)
-          Wavefront Occupancy:
-            value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
-            unit: Wavefronts
-            peak: ($max_waves_per_cu * $cu_per_gpu)
-            pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu
-              * $cu_per_gpu))))
-            coll_level: SQ_LEVEL_WAVES
-          Theoretical LDS Bandwidth:
-            value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-            peak: (($max_sclk * $cu_per_gpu) * 0.128)
-            pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
-              0.00128)))
-          LDS Bank Conflicts/Access:
-            value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
-            unit: Conflicts/access
-            peak: 32
-            pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) /
-              32)
-          vL1D Cache Hit Rate:
-            value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-            unit: pct
-            peak: 100
-            pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-          vL1D Cache BW:
-            value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-            peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu)
-            pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu))
-          L2 Cache Hit Rate:
-            value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else None))
-            unit: pct
-            peak: 100
-            pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else None))
-          L2 Cache BW:
-            value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-            peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))
-            pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
-              / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
-          L2-Fabric Read BW:
-            value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
-              - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
-              - Start_Timestamp))
-            unit: GB/s
-            peak: $hbmBandwidth
-            pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
-              - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
-              - Start_Timestamp)))) / $hbmBandwidth)
-          L2-Fabric Write BW:
-            value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
-              * 32)) / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-            peak: $hbmBandwidth
-            pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum
-              - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))))
-              / $hbmBandwidth)
-          L2-Fabric Read Latency:
-            value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            unit: Cycles
-            peak: None
-            pop: None
-          L2-Fabric Write Latency:
-            value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            unit: Cycles
-            peak: None
-            pop: None
-          sL1D Cache Hit Rate:
-            value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
-              if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
-            unit: pct
-            peak: 100
-            pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
-              if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
-          sL1D Cache BW:
-            value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
-            unit: GB/s
-            peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
-            pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp))
-              * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
-          L1I Hit Rate:
-            value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
-            unit: pct
-            peak: 100
-            pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
-          L1I BW:
-            value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
-            unit: GB/s
-            peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
-            pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp))
-              * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
-          L1I Fetch Latency:
-            value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
-            unit: Cycles
-            peak: None
-            pop: None
-            coll_level: SQ_IFETCH_LEVEL
-        gfx908:
-          VALU FLOPs:
-            value: None
-            unit: GFLOP/s
-            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
-            pop: None
-          VALU IOPs:
-            value: None
-            unit: GIOP/s
-            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
-            pop: None
-          MFMA FLOPs (BF16):
-            value: None
-            unit: GFLOP/s
-            peak: ((($max_sclk * $cu_per_gpu) * 512) / 1000)
-            pop: None
-          MFMA FLOPs (F16):
-            value: None
-            unit: GFLOP/s
-            peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000)
-            pop: None
-          MFMA FLOPs (F32):
-            value: None
-            unit: GFLOP/s
-            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
-            pop: None
-          MFMA FLOPs (F64):
-            value: None
-            unit: GFLOP/s
-            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
-            pop: None
-          MFMA IOPs (Int8):
-            value: None
-            unit: GIOP/s
-            peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000)
-            pop: None
-          Active CUs:
-            value: $numActiveCUs
-            unit: CUs
-            peak: $cu_per_gpu
-            pop: ((100 * $numActiveCUs) / $cu_per_gpu)
-          SALU Utilization:
-            value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: pct
-            peak: 100
-            pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-          VALU Utilization:
-            value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu)))
-            unit: pct
-            peak: 100
-            pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-          MFMA Utilization:
-            value: None
-            unit: pct
-            peak: 100
-            pop: None
-          VMEM Utilization:
-            value: None
-            unit: pct
-            peak: 100
-            pop: None
-          Branch Utilization:
-            value: None
-            unit: pct
-            peak: 100
-            pop: None
-          VALU Active Threads:
-            value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
-              != 0) else None))
-            unit: Threads
-            peak: $wave_size
-            pop: (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size)
-              if (SQ_ACTIVE_INST_VALU != 0) else None))
-          IPC:
-            value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
-            unit: Instr/cycle
-            peak: 5
-            pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5)
-          Wavefront Occupancy:
-            value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
-            unit: Wavefronts
-            peak: ($max_waves_per_cu * $cu_per_gpu)
-            pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu
-              * $cu_per_gpu))))
-            coll_level: SQ_LEVEL_WAVES
-          Theoretical LDS Bandwidth:
-            value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-            peak: (($max_sclk * $cu_per_gpu) * 0.128)
-            pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
-              0.00128)))
-          LDS Bank Conflicts/Access:
-            value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
-            unit: Conflicts/access
-            peak: 32
-            pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) /
-              32)
-          vL1D Cache Hit Rate:
-            value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-            unit: pct
-            peak: 100
-            pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-          vL1D Cache BW:
-            value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-            peak: ((($max_sclk / 1000) * 64) * $cu_per_gpu)
-            pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk / 1000) * 64) * $cu_per_gpu))
-          L2 Cache Hit Rate:
-            value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else None))
-            unit: pct
-            peak: 100
-            pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else None))
-          L2 Cache BW:
-            value: AVG(((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-            peak: ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan))
-            pop: ((100 * AVG(((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp))))
-              / ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan)))
-          L2-Fabric Read BW:
-            value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-            peak: $hbmBandwidth
-            pop: ((100 * AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum -
-              TCC_EA0_RDREQ_32B_sum) * 64)) / (End_Timestamp - Start_Timestamp))))
-              / $hbmBandwidth)
-          L2-Fabric Write BW:
-            value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
-              * 32)) / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-            peak: $hbmBandwidth
-            pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum -
-              TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp))))
-              / $hbmBandwidth)
-          L2-Fabric Read Latency:
-            value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            unit: Cycles
-            peak: None
-            pop: None
-          L2-Fabric Write Latency:
-            value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            unit: Cycles
-            peak: None
-            pop: None
-          sL1D Cache Hit Rate:
-            value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
-              if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
-            unit: pct
-            peak: 100
-            pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES))
-              if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None))
-          sL1D Cache BW:
-            value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
-            unit: GB/s
-            peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
-            pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp))
-              * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
-          L1I Hit Rate:
-            value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
-            unit: pct
-            peak: 100
-            pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
-          L1I BW:
-            value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
-            unit: GB/s
-            peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
-            pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp))
-              * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
-          L1I Fetch Latency:
-            value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
-            unit: Cycles
-            peak: None
-            pop: None
-            coll_level: SQ_IFETCH_LEVEL
-  metrics_description:
-    VALU FLOPs:
-      plain: |-
-        The total floating-point operations executed per second on the VALU.
-        This is also presented as a percent of the peak theoretical FLOPs achievable
-        on the specific accelerator. Note: this does not include any floating-point
-        operations from MFMA instructions.
-      rst: |-
-        The total floating-point operations executed per second on the :ref:`VALU
-        <desc-valu>`. This is also presented as a percent of the peak theoretical
-        FLOPs achievable on the specific accelerator. Note: this does not include
-        any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.
-      unit: GFLOPs
-    VALU IOPs:
-      plain: |-
-        The total integer operations executed per second on the VALU. This is
-        also presented as a percent of the peak theoretical IOPs achievable on the
-        specific accelerator. Note: this does not include any integer operations from
-        MFMA instructions.
-      rst: |-
-        The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
-        This is also presented as a percent of the peak theoretical IOPs achievable
-        on the specific accelerator. Note: this does not include any integer operations
-        from :ref:`MFMA <desc-mfma>` instructions.
-      unit: GOIPs
-    MFMA FLOPs (F8):
-      plain: The total number of 8-bit brain floating point MFMA operations executed
-        per second. This does not include any 16-bit brain floating point operations
-        from VALU instructions. This is also presented as a percent of the peak theoretical
-        F8 MFMA operations achievable on the specific accelerator. It is supported
-        on AMD Instinct MI300 series and later only.
-      rst: |-
-        The total number of 8-bit brain floating point :ref:`MFMA <desc-mfma>`
-        operations executed per second. Note: this does not include any 16-bit brain
-        floating point operations from :ref:`VALU <desc-valu>` instructions. This
-        is also presented as a percent of the peak theoretical F8 MFMA operations
-        achievable on the specific accelerator. It is supported on AMD Instinct MI300
-        series and later only.
-      unit: GFLOPs
-    MFMA FLOPs (BF16):
-      plain: |-
-        The total number of 16-bit brain floating point MFMA operations executed
-        per second. Note: this does not include any 16-bit brain floating point operations
-        from VALU instructions. This is also presented as a percent of the peak theoretical
-        BF16 MFMA operations achievable on the specific accelerator.
-      rst: |-
-        The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
-        operations executed per second. Note: this does not include any 16-bit brain
-        floating point operations from :ref:`VALU <desc-valu>` instructions. This
-        is also presented as a percent of the peak theoretical BF16 MFMA operations
-        achievable on the specific accelerator.
-      unit: GFLOPs
-    MFMA FLOPs (F16):
-      plain: |-
-        The total number of 16-bit floating point MFMA operations executed per
-        second. Note: this does not include any 16-bit floating point operations from
-        VALU instructions. This is also presented as a percent of the peak theoretical
-        F16 MFMA operations achievable on the specific accelerator.
-      rst: |-
-        The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
-        executed per second. Note: this does not include any 16-bit floating point
-        operations from :ref:`VALU <desc-valu>` instructions. This is also presented
-        as a percent of the peak theoretical F16 MFMA operations achievable on the
-        specific accelerator.
-      unit: GFLOPs
-    MFMA FLOPs (F32):
-      plain: |-
-        The total number of 32-bit floating point MFMA operations executed per
-        second. Note: this does not include any 32-bit floating point operations from
-        VALU instructions. This is also presented as a percent of the peak theoretical
-        F32 MFMA operations achievable on the specific accelerator.
-      rst: |-
-        The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
-        executed per second. Note: this does not include any 32-bit floating point
-        operations from :ref:`VALU <desc-valu>` instructions. This is also presented
-        as a percent of the peak theoretical F32 MFMA operations achievable on the
-        specific accelerator.
-      unit: GFLOPs
-    MFMA FLOPs (F64):
-      plain: |-
-        The total number of 64-bit floating point MFMA operations executed per
-        second. Note: this does not include any 64-bit floating point operations from
-        VALU instructions. This is also presented as a percent of the peak theoretical
-        F64 MFMA operations achievable on the specific accelerator.
-      rst: |-
-        The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
-        executed per second. Note: this does not include any 64-bit floating point
-        operations from :ref:`VALU <desc-valu>` instructions. This is also presented
-        as a percent of the peak theoretical F64 MFMA operations achievable on the
-        specific accelerator.
-      unit: GFLOPs
-    MFMA IOPs (Int8):
-      plain: |-
-        The total number of 8-bit integer MFMA operations executed per second.
-        Note: this does not include any 8-bit integer operations from VALU instructions.
-        This is also presented as a percent of the peak theoretical INT8 MFMA operations
-        achievable on the specific accelerator.
-      rst: |-
-        The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
-        per second. Note: this does not include any 8-bit integer operations from
-        :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
-        of the peak theoretical INT8 MFMA operations achievable on the specific accelerator.
-      unit: GIOPs
-    Active CUs:
-      plain: Total number of active compute units (CUs) on the accelerator during
-        the kernel execution.
-      unit: Number
-      rst: Total number of active compute units (CUs) on the accelerator during the
-        kernel execution.
-    SALU Utilization:
-      plain: Indicates what percent of the kernel's duration the SALU was busy executing
-        instructions. Computed as the ratio of the total number of cycles spent by
-        the scheduler issuing SALU or SMEM instructions over the total CU cycles.
-      rst: Indicates what percent of the kernel's duration the :ref:`SALU <desc-salu>`
-        was busy executing instructions. Computed as the ratio of the total number
-        of cycles spent by the :ref:`scheduler <desc-scheduler>` issuing SALU / :ref:`SMEM
-        <desc-smem>` instructions over the :ref:`total CU cycles <total-cu-cycles>`.
-      unit: Percent
-    VALU Utilization:
-      plain: Indicates what percent of the kernel's duration the VALU was busy executing
-        instructions. Does not include VMEM operations. Computed as the ratio of the
-        total number of cycles spent by the scheduler issuing VALU instructions over
-        the total CU cycles.
-      rst: Indicates what percent of the kernel's duration the :ref:`VALU <desc-valu>`
-        was busy executing instructions. Does not include :ref:`VMEM <desc-vmem>`
-        operations. Computed as the ratio of the total number of cycles spent by the
-        :ref:`scheduler <desc-scheduler>` issuing VALU instructions over the :ref:`total
-        CU cycles <total-cu-cycles>`.
-      unit: Percent
-    MFMA Utilization:
-      plain: Indicates what percent of the kernel's duration the MFMA unit was busy
-        executing instructions. Computed as the ratio of the total number of cycles
-        the MFMA was busy over the total CU cycles.
-      rst: Indicates what percent of the kernel's duration the :ref:`MFMA <desc-mfma>`
-        unit was busy executing instructions. Computed as the ratio of the total number
-        of cycles spent by the :ref:`MFMA <desc-salu>` was busy over the :ref:`total
-        CU cycles <total-cu-cycles>`.
-      unit: Percent
-    VMEM Utilization:
-      plain: Indicates what percent of the kernel's duration the VMEM unit was busy
-        executing instructions, including both global/generic and spill/scratch operations
-        (see the VMEM instruction count metrics) for more detail). Does not include
-        VALU operations. Computed as the ratio of the total number of cycles spent
-        by the scheduler issuing VMEM instructions over the total CU cycles.
-      rst: Indicates what percent of the kernel's duration the :ref:`VMEM <desc-vmem>`
-        unit was busy executing instructions, including both global/generic and spill/scratch
-        operations (see the :ref:`VMEM instruction count metrics <ta-instruction-counts>`
-        for more detail). Does not include :ref:`VALU <desc-valu>` operations. Computed
-        as the ratio of the total number of cycles spent by the :ref:`scheduler <desc-scheduler>`
-        issuing VMEM instructions over the :ref:`total CU cycles <total-cu-cycles>`.
-      unit: Percent
-    Branch Utilization:
-      plain: Indicates what percent of the kernel's duration the branch unit was busy
-        executing instructions. Computed as the ratio of the total number of cycles
-        spent by the scheduler issuing branch instructions over the total CU cycles
-      rst: Indicates what percent of the kernel's duration the :ref:`branch <desc-branch>`
-        unit was busy executing instructions. Computed as the ratio of the total number
-        of cycles spent by the :ref:`scheduler <desc-scheduler>` issuing branch instructions
-        over the :ref:`total CU cycles <total-cu-cycles>`.
-      unit: Percent
-    VALU Active Threads:
-      plain: Indicates the average level of divergence within a wavefront over the
-        lifetime of the kernel. The number of work-items that were active in a wavefront
-        during execution of each VALU instruction, time-averaged over all VALU instructions
-        run on all wavefronts in the kernel.
-      rst: Indicates the average level of :ref:`divergence <desc-divergence>` within
-        a wavefront over the lifetime of the kernel. The number of work-items that
-        were active in a wavefront during execution of each :ref:`VALU <desc-valu>`
-        instruction, time-averaged over all VALU instructions run on all wavefronts
-        in the kernel.
-      unit: Work-items
-    IPC:
-      plain: The ratio of the total number of instructions executed on the CU over
-        the total active CU cycles. This is also presented as a percent of the peak
-        theoretical bandwidth achievable on the specific accelerator.
-      rst: The ratio of the total number of instructions executed on the :doc:`CU
-        <compute-unit>` over the :ref:`total active CU cycles <total-active-cu-cycles>`.
-      unit: Instructions per-cycle
-    Wavefront Occupancy:
-      plain: |-
-        The time-averaged number of wavefronts resident on the accelerator over
-        the lifetime of the kernel. Note: this metric may be inaccurate for short-running
-        kernels (less than 1ms). This is also presented as a percent of the peak theoretical
-        occupancy achievable on the specific accelerator.
-      rst: |-
-        The time-averaged number of wavefronts resident on the accelerator over
-        the lifetime of the kernel. Note: this metric may be inaccurate for short-running
-        kernels (less than 1ms). This is also presented as a percent of the peak theoretical
-        occupancy achievable on the specific accelerator.
-      unit: Wavefronts
-    Theoretical LDS Bandwidth:
-      plain: Indicates the maximum amount of bytes that could have been loaded from,
-        stored to, or atomically updated in the LDS per unit time (see LDS Bandwidth
-        example for more detail). This is also presented as a percent of the peak
-        theoretical F64 MFMA operations achievable on the specific accelerator.
-      rst: Indicates the maximum amount of bytes that could have been loaded from,
-        stored to, or atomically updated in the LDS per unit time (see :ref:`LDS Bandwidth
-        <lds-bandwidth>` example for more detail). This is also presented as a percent
-        of the peak theoretical F64 MFMA operations achievable on the specific accelerator.
-      unit: GB/s
-    LDS Bank Conflicts/Access:
-      plain: The ratio of the number of cycles spent in the LDS scheduler due to bank
-        conflicts (as determined by the conflict resolution hardware) to the base
-        number of cycles that would be spent in the LDS scheduler in a completely
-        uncontended case. This is also presented in normalized form (i.e., the Bank
-        Conflict Rate).
-      rst: The ratio of the number of cycles spent in the :doc:`LDS scheduler <local-data-share>`
-        due to bank conflicts (as determined by the conflict resolution hardware)
-        to the base number of cycles that would be spent in the LDS scheduler in
-        a completely uncontended case. This is also presented in normalized form
-        (i.e., the Bank Conflict Rate).
-      unit: Conflicts/Access
-    vL1D Cache Hit Rate:
-      plain: The ratio of the number of vL1D cache line requests that hit in vL1D
-        cache over the total number of cache line requests to the vL1D cache RAM.
-      rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache
-        over the total number of cache line requests to the :ref:`vL1D cache RAM
-        <desc-tc>`.
-      unit: Percent
-    vL1D Cache BW:
-      plain: The number of bytes looked up in the vL1D cache as a result of VMEM instructions
-        per unit time. The number of bytes is calculated as the number of cache lines
-        requested multiplied by the cache line size. This value does not consider
-        partial requests, so e.g., if only a single value is requested in a cache
-        line, the data movement will still be counted as a full cache line. This is
-        also presented as a percent of the peak theoretical bandwidth achievable on
-        the specific accelerator.
-      rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
-        <desc-vmem>` instructions per unit time. The number of bytes is calculated
-        as the number of cache lines requested multiplied by the cache line size.
-        This value does not consider partial requests, so e.g., if only a single
-        value is requested in a cache line, the data movement will still be counted
-        as a full cache line. This is also presented as a percent of the peak theoretical
-        bandwidth achievable on the specific accelerator.
-      unit: GB/s
-    L2 Cache Hit Rate:
-      plain: The ratio of the number of L2 cache line requests that hit in the L2
-        cache over the total number of incoming cache line requests to the L2 cache.
-      rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
-        over the total number of incoming cache line requests to the L2 cache.
-      unit: Percent
-    L2 Cache BW:
-      plain: The number of bytes looked up in the L2 cache per unit time. The number
-        of bytes is calculated as the number of cache lines requested multiplied by
-        the cache line size. This value does not consider partial requests, so e.g.,
-        if only a single value is requested in a cache line, the data movement will
-        still be counted as a full cache line. This is also presented as a percent
-        of the peak theoretical bandwidth achievable on the specific accelerator.
-      rst: The number of bytes looked up in the L2 cache per unit time. The number of
-        bytes is calculated as the number of cache lines requested multiplied by
-        the cache line size. This value does not consider partial requests, so e.g.,
-        if only a single value is requested in a cache line, the data movement will
-        still be counted as a full cache line. This is also presented as a percent
-        of the peak theoretical bandwidth achievable on the specific accelerator.
-      unit: GB/s
-    L2-Fabric Read BW:
-      plain: |-
-        The number of bytes read by the L2 over the Infinity Fabric\u2122 interface
-        per unit time. This is also presented as a percent of the peak theoretical
-        bandwidth achievable on the specific accelerator.
-      rst: |-
-        The number of bytes read by the L2 over the :ref:`Infinity Fabric\u2122
-        interface <l2-fabric>` per unit time. This is also presented as a percent
-        of the peak theoretical bandwidth achievable on the specific accelerator.
-      unit: GB/s
-    L2-Fabric Write BW:
-      plain: The number of bytes sent by the L2 over the Infinity Fabric interface
-        by write and atomic operations per unit time. This is also presented as a
-        percent of the peak theoretical bandwidth achievable on the specific accelerator.
-      rst: The number of bytes sent by the L2 over the :ref:`Infinity Fabric interface
-        <l2-fabric>` by write and atomic operations per unit time. This is also presented
-        as a percent of the peak theoretical bandwidth achievable on the specific
-        accelerator.
-      unit: GB/s
-    L2-Fabric Read Latency:
-      plain: The time-averaged number of cycles read requests spent in Infinity Fabric
-        before data was returned to the L2.
-      rst: The time-averaged number of cycles read requests spent in Infinity Fabric before
-        data was returned to the L2.
-      unit: Cycles
-    L2-Fabric Write Latency:
-      plain: The time-averaged number of cycles write requests spent in Infinity Fabric
-        before a completion acknowledgement was returned to the L2.
-      rst: The time-averaged number of cycles write requests spent in Infinity Fabric
-        before a completion acknowledgement was returned to the L2.
-      unit: Cycles
-    sL1D Cache Hit Rate:
-      plain: The percent of sL1D requests that hit on a previously loaded line the
-        cache. Calculated as the ratio of the number of sL1D requests that hit over
-        the number of all sL1D requests.
-      rst: The percent of sL1D requests that hit on a previously loaded line the cache.
-        Calculated as the ratio of the number of sL1D requests that hit over the
-        number of all sL1D requests.
-      unit: Percent
-    sL1D Cache BW:
-      plain: The number of bytes looked up in the sL1D cache per unit time. This is
-        also presented as a percent of the peak theoretical bandwidth achievable on
-        the specific accelerator.
-      rst: The number of bytes looked up in the sL1D cache per unit time. This is also
-        presented as a percent of the peak theoretical bandwidth achievable on the
-        specific accelerator.
-      unit: GB/s
-    L1I Hit Rate:
-      plain: The number of bytes looked up in the L1I cache per unit time. This is
-        also presented as a percent of the peak theoretical bandwidth achievable on
-        the specific accelerator.
-      rst: The percent of L1I requests that hit on a previously loaded line the cache.
-        Calculated as the ratio of the number of L1I requests that hit over the number
-        of all L1I requests.
-      unit: GB/s
-    L1I BW:
-      plain: The percent of L1I requests that hit on a previously loaded line the
-        cache. Calculated as the ratio of the number of L1I requests that hit over
-        the number of all L1I requests.
-      rst: The number of bytes looked up in the L1I cache per unit time. This is also
-        presented as a percent of the peak theoretical bandwidth achievable on the
-        specific accelerator.
-      unit: Percent
-    L1I Fetch Latency:
-      plain: The average number of cycles spent to fetch instructions to a CU.
-      rst: The average number of cycles spent to fetch instructions to a :doc:`CU
-        <compute-unit>`.
-      unit: Cycles
-- id: 300
-  title: Memory Chart
-  data source:
-  - metric_table:
-      id: 301
-      title: Memory Chart
-      header:
-        metric: Metric
-        value: Value
-      metric:
-        gfx90a:
-          Wavefront Occupancy:
-            value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs),
-              0)
-            coll_level: SQ_LEVEL_WAVES
-          Wave Life:
-            value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0)
-              else 0)), 0)
-          SALU:
-            value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0)
-          SMEM:
-            value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0)
-          VALU:
-            value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0)
-          MFMA:
-            value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0)
-          VMEM:
-            value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0)
-          LDS:
-            value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
-          GWS:
-            value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0)
-          BR:
-            value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0)
-          Active CUs:
-            value: $numActiveCUs
-          Num CUs:
-            value: $cu_per_gpu
-          VGPR:
-            value: ROUND(AVG(Arch_VGPR), 0)
-          SGPR:
-            value: ROUND(AVG(SGPR), 0)
-          LDS Allocation:
-            value: ROUND(AVG(LDS_Per_Workgroup), 0)
-          Scratch Allocation:
-            value: ROUND(AVG(Scratch_Per_Workitem), 0)
-          Wavefronts:
-            value: ROUND(AVG(SPI_CSN_WAVE), 0)
-          Workgroups:
-            value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0)
-          LDS Req:
-            value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
-          LDS Util:
-            value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu))), 0)
-          LDS Latency:
-            value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS
-              != 0) else None)),0)
-            coll_level: SQ_INST_LEVEL_LDS
-          VL1 Rd:
-            value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0)
-          VL1 Wr:
-            value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0)
-          VL1 Atomic:
-            value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
-              / $denom)), 0)
-          VL1 Hit:
-            value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None )), 0)
-          VL1 Lat:
-            value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if
-              (TCP_TA_TCP_STATE_READ_sum != 0) else None)), 0)
-          VL1 Coalesce:
-            value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum
-              * 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0)
-          VL1 Stall:
-            value: ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
-              if (TCP_GATE_EN1_sum != 0) else None)), 0)
-          VL1_L2 Rd:
-            value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0)
-          VL1_L2 Wr:
-            value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0)
-          VL1_L2 Atomic:
-            value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
-              / $denom)), 0)
-          sL1D Rd:
-            value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0)
-          sL1D Hit:
-            value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
-              != 0) else None)) * 100), 0)
-          sL1D Lat:
-            value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
-              != 0) else None)) * 100), 0)
-            coll_level: SQC_DCACHE_INFLIGHT_LEVEL
-          sL1D_L2 Rd:
-            value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0)
-          sL1D_L2 Wr:
-            value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0)
-          sL1D_L2 Atomic:
-            value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0)
-          IL1 Fetch:
-            value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0)
-          IL1 Hit:
-            value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0)
-          IL1 Lat:
-            value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ
-              != 0) else None)) * 100), 0)
-            coll_level: SQC_ICACHE_INFLIGHT_LEVEL
-          IL1_L2 Rd:
-            value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0)
-          L2 Rd:
-            value: ROUND(AVG((TCC_READ_sum / $denom)), 0)
-          L2 Wr:
-            value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0)
-          L2 Atomic:
-            value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0)
-          L2 Hit:
-            value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum))
-              if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)), 0)
-          L2 Rd Lat:
-            value: ROUND(AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
-              != 0) else None)), 0)
-          L2 Wr Lat:
-            value: ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum
-              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum +
-              TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else None)), 0)
-          Fabric_L2 Rd:
-            value: ROUND(AVG((TCC_EA_RDREQ_sum / $denom)), 0)
-          Fabric_L2 Wr:
-            value: ROUND(AVG((TCC_EA_WRREQ_sum / $denom)), 0)
-          Fabric_L2 Atomic:
-            value: ROUND(AVG((TCC_EA_ATOMIC_sum / $denom)), 0)
-          Fabric Rd Lat:
-            value: ROUND(AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
-              != 0) else 0)), 0)
-          Fabric Wr Lat:
-            value: ROUND(AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
-              != 0) else 0)), 0)
-          Fabric Atomic Lat:
-            value: ROUND(AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum
-              != 0) else 0)), 0)
-          HBM Rd:
-            value: ROUND(AVG((TCC_EA_RDREQ_DRAM_sum / $denom)), 0)
-          HBM Wr:
-            value: ROUND(AVG((TCC_EA_WRREQ_DRAM_sum / $denom)), 0)
-        gfx941:
-          Wavefront Occupancy:
-            value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs),
-              0)
-            coll_level: SQ_LEVEL_WAVES
-          Wave Life:
-            value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0)
-              else 0)), 0)
-          SALU:
-            value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0)
-          SMEM:
-            value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0)
-          VALU:
-            value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0)
-          MFMA:
-            value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0)
-          VMEM:
-            value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0)
-          LDS:
-            value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
-          GWS:
-            value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0)
-          BR:
-            value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0)
-          Active CUs:
-            value: $numActiveCUs
-          Num CUs:
-            value: $cu_per_gpu
-          VGPR:
-            value: ROUND(AVG(Arch_VGPR), 0)
-          SGPR:
-            value: ROUND(AVG(SGPR), 0)
-          LDS Allocation:
-            value: ROUND(AVG(LDS_Per_Workgroup), 0)
-          Scratch Allocation:
-            value: ROUND(AVG(Scratch_Per_Workitem), 0)
-          Wavefronts:
-            value: ROUND(AVG(SPI_CSN_WAVE), 0)
-          Workgroups:
-            value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0)
-          LDS Req:
-            value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
-          LDS Util:
-            value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu))), 0)
-          LDS Latency:
-            value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS
-              != 0) else None)),0)
-            coll_level: SQ_INST_LEVEL_LDS
-          VL1 Rd:
-            value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0)
-          VL1 Wr:
-            value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0)
-          VL1 Atomic:
-            value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
-              / $denom)), 0)
-          VL1 Hit:
-            value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None )), 0)
-          VL1 Lat:
-            value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if
-              (TCP_TA_TCP_STATE_READ_sum != 0) else None)), 0)
-          VL1 Coalesce:
-            value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum
-              * 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0)
-          VL1 Stall:
-            value: ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
-              if (TCP_GATE_EN1_sum != 0) else None)), 0)
-          VL1_L2 Rd:
-            value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0)
-          VL1_L2 Wr:
-            value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0)
-          VL1_L2 Atomic:
-            value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
-              / $denom)), 0)
-          sL1D Rd:
-            value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0)
-          sL1D Hit:
-            value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
-              != 0) else None)) * 100), 0)
-          sL1D Lat:
-            value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
-              != 0) else None)) * 100), 0)
-            coll_level: SQC_DCACHE_INFLIGHT_LEVEL
-          sL1D_L2 Rd:
-            value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0)
-          sL1D_L2 Wr:
-            value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0)
-          sL1D_L2 Atomic:
-            value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0)
-          IL1 Fetch:
-            value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0)
-          IL1 Hit:
-            value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0)
-          IL1 Lat:
-            value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ
-              != 0) else None)) * 100), 0)
-            coll_level: SQC_ICACHE_INFLIGHT_LEVEL
-          IL1_L2 Rd:
-            value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0)
-          L2 Rd:
-            value: ROUND(AVG((TCC_READ_sum / $denom)), 0)
-          L2 Wr:
-            value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0)
-          L2 Atomic:
-            value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0)
-          L2 Hit:
-            value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum))
-              if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)), 0)
-          Fabric_L2 Rd:
-            value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0)
-          Fabric_L2 Wr:
-            value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0)
-          Fabric_L2 Atomic:
-            value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0)
-          Fabric Rd Lat:
-            value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else 0)), 0)
-          Fabric Wr Lat:
-            value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else 0)), 0)
-          Fabric Atomic Lat:
-            value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
-              != 0) else 0)), 0)
-          HBM Rd:
-            value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0)
-          HBM Wr:
-            value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0)
-        gfx940:
-          Wavefront Occupancy:
-            value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs),
-              0)
-            coll_level: SQ_LEVEL_WAVES
-          Wave Life:
-            value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0)
-              else 0)), 0)
-          SALU:
-            value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0)
-          SMEM:
-            value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0)
-          VALU:
-            value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0)
-          MFMA:
-            value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0)
-          VMEM:
-            value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0)
-          LDS:
-            value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
-          GWS:
-            value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0)
-          BR:
-            value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0)
-          Active CUs:
-            value: $numActiveCUs
-          Num CUs:
-            value: $cu_per_gpu
-          VGPR:
-            value: ROUND(AVG(Arch_VGPR), 0)
-          SGPR:
-            value: ROUND(AVG(SGPR), 0)
-          LDS Allocation:
-            value: ROUND(AVG(LDS_Per_Workgroup), 0)
-          Scratch Allocation:
-            value: ROUND(AVG(Scratch_Per_Workitem), 0)
-          Wavefronts:
-            value: ROUND(AVG(SPI_CSN_WAVE), 0)
-          Workgroups:
-            value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0)
-          LDS Req:
-            value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
-          LDS Util:
-            value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu))), 0)
-          LDS Latency:
-            value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS
-              != 0) else None)),0)
-            coll_level: SQ_INST_LEVEL_LDS
-          VL1 Rd:
-            value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0)
-          VL1 Wr:
-            value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0)
-          VL1 Atomic:
-            value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
-              / $denom)), 0)
-          VL1 Hit:
-            value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None )), 0)
-          VL1 Lat:
-            value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if
-              (TCP_TA_TCP_STATE_READ_sum != 0) else None)), 0)
-          VL1 Coalesce:
-            value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum
-              * 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0)
-          VL1 Stall:
-            value: ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
-              if (TCP_GATE_EN1_sum != 0) else None)), 0)
-          VL1_L2 Rd:
-            value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0)
-          VL1_L2 Wr:
-            value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0)
-          VL1_L2 Atomic:
-            value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
-              / $denom)), 0)
-          sL1D Rd:
-            value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0)
-          sL1D Hit:
-            value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
-              != 0) else None)) * 100), 0)
-          sL1D Lat:
-            value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
-              != 0) else None)) * 100), 0)
-            coll_level: SQC_DCACHE_INFLIGHT_LEVEL
-          sL1D_L2 Rd:
-            value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0)
-          sL1D_L2 Wr:
-            value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0)
-          sL1D_L2 Atomic:
-            value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0)
-          IL1 Fetch:
-            value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0)
-          IL1 Hit:
-            value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0)
-          IL1 Lat:
-            value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ
-              != 0) else None)) * 100), 0)
-            coll_level: SQC_ICACHE_INFLIGHT_LEVEL
-          IL1_L2 Rd:
-            value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0)
-          L2 Rd:
-            value: ROUND(AVG((TCC_READ_sum / $denom)), 0)
-          L2 Wr:
-            value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0)
-          L2 Atomic:
-            value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0)
-          L2 Hit:
-            value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum))
-              if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)), 0)
-          Fabric_L2 Rd:
-            value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0)
-          Fabric_L2 Wr:
-            value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0)
-          Fabric_L2 Atomic:
-            value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0)
-          Fabric Rd Lat:
-            value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else 0)), 0)
-          Fabric Wr Lat:
-            value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else 0)), 0)
-          Fabric Atomic Lat:
-            value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
-              != 0) else 0)), 0)
-          HBM Rd:
-            value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0)
-          HBM Wr:
-            value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0)
-        gfx942:
-          Wavefront Occupancy:
-            value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs),
-              0)
-            coll_level: SQ_LEVEL_WAVES
-          Wave Life:
-            value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0)
-              else 0)), 0)
-          SALU:
-            value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0)
-          SMEM:
-            value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0)
-          VALU:
-            value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0)
-          MFMA:
-            value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0)
-          VMEM:
-            value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0)
-          LDS:
-            value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
-          GWS:
-            value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0)
-          BR:
-            value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0)
-          Active CUs:
-            value: $numActiveCUs
-          Num CUs:
-            value: $cu_per_gpu
-          VGPR:
-            value: ROUND(AVG(Arch_VGPR), 0)
-          SGPR:
-            value: ROUND(AVG(SGPR), 0)
-          LDS Allocation:
-            value: ROUND(AVG(LDS_Per_Workgroup), 0)
-          Scratch Allocation:
-            value: ROUND(AVG(Scratch_Per_Workitem), 0)
-          Wavefronts:
-            value: ROUND(AVG(SPI_CSN_WAVE), 0)
-          Workgroups:
-            value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0)
-          LDS Req:
-            value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
-          LDS Util:
-            value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu))), 0)
-          LDS Latency:
-            value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS
-              != 0) else None)),0)
-            coll_level: SQ_INST_LEVEL_LDS
-          VL1 Rd:
-            value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0)
-          VL1 Wr:
-            value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0)
-          VL1 Atomic:
-            value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
-              / $denom)), 0)
-          VL1 Hit:
-            value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None )), 0)
-          VL1 Lat:
-            value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if
-              (TCP_TA_TCP_STATE_READ_sum != 0) else None)), 0)
-          VL1 Coalesce:
-            value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum
-              * 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0)
-          VL1 Stall:
-            value: ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
-              if (TCP_GATE_EN1_sum != 0) else None)), 0)
-          VL1_L2 Rd:
-            value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0)
-          VL1_L2 Wr:
-            value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0)
-          VL1_L2 Atomic:
-            value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
-              / $denom)), 0)
-          sL1D Rd:
-            value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0)
-          sL1D Hit:
-            value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
-              != 0) else None)) * 100), 0)
-          sL1D Lat:
-            value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
-              != 0) else None)) * 100), 0)
-            coll_level: SQC_DCACHE_INFLIGHT_LEVEL
-          sL1D_L2 Rd:
-            value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0)
-          sL1D_L2 Wr:
-            value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0)
-          sL1D_L2 Atomic:
-            value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0)
-          IL1 Fetch:
-            value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0)
-          IL1 Hit:
-            value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0)
-          IL1 Lat:
-            value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ
-              != 0) else None)) * 100), 0)
-            coll_level: SQC_ICACHE_INFLIGHT_LEVEL
-          IL1_L2 Rd:
-            value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0)
-          L2 Rd:
-            value: ROUND(AVG((TCC_READ_sum / $denom)), 0)
-          L2 Wr:
-            value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0)
-          L2 Atomic:
-            value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0)
-          L2 Hit:
-            value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum))
-              if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)), 0)
-          Fabric_L2 Rd:
-            value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0)
-          Fabric_L2 Wr:
-            value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0)
-          Fabric_L2 Atomic:
-            value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0)
-          Fabric Rd Lat:
-            value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else 0)), 0)
-          Fabric Wr Lat:
-            value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else 0)), 0)
-          Fabric Atomic Lat:
-            value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
-              != 0) else 0)), 0)
-          HBM Rd:
-            value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0)
-          HBM Wr:
-            value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0)
-        gfx950:
-          Wavefront Occupancy:
-            value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs),
-              0)
-            coll_level: SQ_LEVEL_WAVES
-          Wave Life:
-            value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0)
-              else 0)), 0)
-          SALU:
-            value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0)
-          SMEM:
-            value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0)
-          VALU:
-            value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0)
-          MFMA:
-            value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0)
-          VMEM:
-            value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0)
-          LDS:
-            value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
-          GWS:
-            value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0)
-          BR:
-            value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0)
-          Active CUs:
-            value: $numActiveCUs
-          Num CUs:
-            value: $cu_per_gpu
-          VGPR:
-            value: ROUND(AVG(Arch_VGPR), 0)
-          SGPR:
-            value: ROUND(AVG(SGPR), 0)
-          LDS Allocation:
-            value: ROUND(AVG(LDS_Per_Workgroup), 0)
-          Scratch Allocation:
-            value: ROUND(AVG(Scratch_Per_Workitem), 0)
-          Wavefronts:
-            value: ROUND(AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE),
-              0)
-          Workgroups:
-            value: ROUND(AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS +
-              SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS), 0)
-          LDS Req:
-            value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
-          LDS Util:
-            value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu))), 0)
-          LDS Latency:
-            value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS
-              != 0) else None)),0)
-            coll_level: SQ_INST_LEVEL_LDS
-          VL1 Rd:
-            value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0)
-          VL1 Wr:
-            value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0)
-          VL1 Atomic:
-            value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
-              / $denom)), 0)
-          VL1 Hit:
-            value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None )), 0)
-          VL1 Lat:
-            value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if
-              (TCP_TA_TCP_STATE_READ_sum != 0) else None)), 0)
-          VL1 Coalesce:
-            value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum
-              * 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0)
-          VL1 Stall:
-            value: ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
-              if (TCP_GATE_EN1_sum != 0) else None)), 0)
-          VL1_L2 Rd:
-            value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0)
-          VL1_L2 Wr:
-            value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0)
-          VL1_L2 Atomic:
-            value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
-              / $denom)), 0)
-          sL1D Rd:
-            value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0)
-          sL1D Hit:
-            value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
-              != 0) else None)) * 100), 0)
-          sL1D Lat:
-            value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
-              != 0) else None)) * 100), 0)
-            coll_level: SQC_DCACHE_INFLIGHT_LEVEL
-          sL1D_L2 Rd:
-            value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0)
-          sL1D_L2 Wr:
-            value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0)
-          sL1D_L2 Atomic:
-            value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0)
-          IL1 Fetch:
-            value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0)
-          IL1 Hit:
-            value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0)
-          IL1 Lat:
-            value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ
-              != 0) else None)) * 100), 0)
-            coll_level: SQC_ICACHE_INFLIGHT_LEVEL
-          IL1_L2 Rd:
-            value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0)
-          L2 Rd:
-            value: ROUND(AVG((TCC_READ_sum / $denom)), 0)
-          L2 Wr:
-            value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0)
-          L2 Atomic:
-            value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0)
-          L2 Hit:
-            value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum))
-              if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)), 0)
-          L2 Rd Lat:
-            value: ROUND(AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
-              != 0) else None)), 0)
-          L2 Wr Lat:
-            value: ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum
-              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum +
-              TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else None)), 0)
-          Fabric_L2 Rd:
-            value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0)
-          Fabric_L2 Wr:
-            value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0)
-          Fabric_L2 Atomic:
-            value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0)
-          Fabric Rd Lat:
-            value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else 0)), 0)
-          Fabric Wr Lat:
-            value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else 0)), 0)
-          Fabric Atomic Lat:
-            value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
-              != 0) else 0)), 0)
-          HBM Rd:
-            value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0)
-          HBM Wr:
-            value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0)
-        gfx908:
-          Wavefront Occupancy:
-            value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs),
-              0)
-            coll_level: SQ_LEVEL_WAVES
-          Wave Life:
-            value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0)
-              else 0)), 0)
-          SALU:
-            value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0)
-          SMEM:
-            value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0)
-          VALU:
-            value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0)
-          MFMA:
-            value: None
-          VMEM:
-            value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0)
-          LDS:
-            value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
-          GWS:
-            value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0)
-          BR:
-            value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0)
-          Active CUs:
-            value: $numActiveCUs
-          Num CUs:
-            value: $cu_per_gpu
-          VGPR:
-            value: ROUND(AVG(Arch_VGPR), 0)
-          SGPR:
-            value: ROUND(AVG(SGPR), 0)
-          LDS Allocation:
-            value: ROUND(AVG(LDS_Per_Workgroup), 0)
-          Scratch Allocation:
-            value: ROUND(AVG(Scratch_Per_Workitem), 0)
-          Wavefronts:
-            value: ROUND(AVG(SPI_CSN_WAVE), 0)
-          Workgroups:
-            value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0)
-          LDS Req:
-            value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0)
-          LDS Util:
-            value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu))), 0)
-          LDS Latency:
-            value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS
-              != 0) else None)),0)
-            coll_level: SQ_INST_LEVEL_LDS
-          VL1 Rd:
-            value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0)
-          VL1 Wr:
-            value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0)
-          VL1 Atomic:
-            value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
-              / $denom)), 0)
-          VL1 Hit:
-            value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None )), 0)
-          VL1 Lat:
-            value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if
-              (TCP_TA_TCP_STATE_READ_sum != 0) else None)), 0)
-          VL1 Coalesce:
-            value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum
-              * 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0)
-          VL1 Stall:
-            value: ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
-              if (TCP_GATE_EN1_sum != 0) else None)), 0)
-          VL1_L2 Rd:
-            value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0)
-          VL1_L2 Wr:
-            value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0)
-          VL1_L2 Atomic:
-            value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
-              / $denom)), 0)
-          sL1D Rd:
-            value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0)
-          sL1D Hit:
-            value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
-              != 0) else None)) * 100), 0)
-          sL1D Lat:
-            value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ
-              != 0) else None)) * 100), 0)
-            coll_level: SQC_DCACHE_INFLIGHT_LEVEL
-          sL1D_L2 Rd:
-            value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0)
-          sL1D_L2 Wr:
-            value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0)
-          sL1D_L2 Atomic:
-            value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0)
-          IL1 Fetch:
-            value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0)
-          IL1 Hit:
-            value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0)
-          IL1 Lat:
-            value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ
-              != 0) else None)) * 100), 0)
-            coll_level: SQC_ICACHE_INFLIGHT_LEVEL
-          IL1_L2 Rd:
-            value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0)
-          L2 Rd:
-            value: ROUND(AVG((TCC_READ_sum / $denom)), 0)
-          L2 Wr:
-            value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0)
-          L2 Atomic:
-            value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0)
-          L2 Hit:
-            value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum))
-              if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)), 0)
-          L2 Rd Lat:
-            value: ROUND(AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
-              != 0) else None)), 0)
-          L2 Wr Lat:
-            value: ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum
-              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum +
-              TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else None)), 0)
-          Fabric_L2 Rd:
-            value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0)
-          Fabric_L2 Wr:
-            value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0)
-          Fabric_L2 Atomic:
-            value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0)
-          Fabric Rd Lat:
-            value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else 0)), 0)
-          Fabric Wr Lat:
-            value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else 0)), 0)
-          Fabric Atomic Lat:
-            value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
-              != 0) else 0)), 0)
-          HBM Rd:
-            value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0)
-          HBM Wr:
-            value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0)
-      comparable: false
-      cli_style: mem_chart
-      tui_style: mem_chart
-  metrics_description:
-    Wavefront Occupancy:
-      plain: Wavefronts per active CU.
-      rst: Wavefronts per active CU.
-      unit: Wavefronts
-    Wave Life:
-      plain: Average number of cycles executing a wave.
-      rst: Average number of cycles executing a wave.
-      unit: Cycles per wave
-    SALU:
-      plain: Total Number of SALU (Scalar ALU) instructions issued per normalization
-        unit.
-      rst: Total Number of SALU (Scalar ALU) instructions issued per normalization
-        unit.
-      unit: Instructions per normalization unit
-    SMEM:
-      plain: Total number of SMEM (Scalar Memory Read) instructions issued normalization
-        unit.
-      rst: Total number of SMEM (Scalar Memory Read) instructions issued normalization
-        unit.
-      unit: Instructions per normalization unit
-    VALU:
-      plain: The number of VALU (Vector ALU) instructions issued per normalization
-        unit.
-      rst: The number of VALU (Vector ALU) instructions issued per normalization unit.
-      unit: Instructions per normalization unit
-    MFMA:
-      plain: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued
-        per normalization unit.
-      rst: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued per
-        normalization unit.
-      unit: Instructions per normalization unit
-    VMEM:
-      plain: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch
-        memory) per normalization unit.
-      rst: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch
-        memory) per normalization unit.
-      unit: Instructions per normalization unit
-    LDS:
-      plain: The total number of LDS instructions (including, but not limited to,
-        read/write/atomics and HIP's __shfl instructions) executed per normalization
-        unit.
-      rst: The total number of LDS instructions (including, but not limited to, read/write/atomics
-        and HIP's __shfl instructions) executed per normalization unit.
-      unit: Instructions per normalization unit
-    GWS:
-      plain: Total number of GDS (global data sync) instructions issued per normalization
-        unit.
-      rst: Total number of GDS (global data sync) instructions issued per normalization
-        unit.
-      unit: Instructions per normalization unit
-    BR:
-      plain: Total number of BRANCH instructions issued per normalization unit.
-      rst: Total number of BRANCH instructions issued per normalization unit.
-      unit: Instructions per normalization unit
-    Active CUs:
-      plain: Total number of active compute units (CUs) on the accelerator during
-        the kernel execution.
-      rst: Total number of active compute units (CUs) on the accelerator during the
-        kernel execution.
-      unit: CUs
-    Num CUs:
-      plain: Total number of compute units (CUs) on the accelerator.
-      rst: Total number of compute units (CUs) on the accelerator.
-      unit: CUs
-    VGPR:
-      plain: |-
-        The number of architected vector general-purpose registers allocated
-        for the kernel, see VALU. Note: this may not exactly match the number of VGPRs
-        requested by the compiler due to allocation granularity.
-      rst: |-
-        The number of architected vector general-purpose registers allocated for the
-        kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the
-        number of VGPRs requested by the compiler due to allocation granularity.
-      unit: VGPRs
-    SGPR:
-      plain: |-
-        The number of scalar general-purpose registers allocated for the kernel,
-        see SALU. Note: this may not exactly match the number of SGPRs requested by
-        the compiler due to allocation granularity.
-      rst: |-
-        The number of scalar general-purpose registers allocated for the kernel, see
-        :ref:`SALU <desc-salu>`. Note: this may not exactly match the number of
-        SGPRs requested by the compiler due to allocation granularity.
-      unit: SGPRs
-    LDS Allocation:
-      plain: |-
-        The number of bytes of LDS memory (or, shared memory) allocated for
-        this kernel. Note: This may also be larger than what was requested at compile
-        time due to both allocation granularity and dynamic per-dispatch LDS allocations.
-      rst: |-
-        The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory)
-        allocated for this kernel. Note: This may also be larger than what was requested
-        at compile time due to both allocation granularity and dynamic per-dispatch
-        LDS allocations.
-      unit: Bytes per workgroup
-    Scratch Allocation:
-      plain: The number of bytes of scratch memory requested per work-item for this
-        kernel. Scratch memory is used for stack memory on the accelerator, as well
-        as for register spills and restores.
-      rst: The number of bytes of :ref:`scratch memory <memory-spaces>` requested per
-        work-item for this kernel. Scratch memory is used for stack memory on the
-        accelerator, as well as for register spills and restores.
-      unit: Bytes per workgroup
-    Wavefronts:
-      plain: The total number of wavefronts, summed over all workgroups, forming this
-        kernel launch.
-      rst: The total number of wavefronts, summed over all workgroups, forming this
-        kernel launch.
-      unit: Wavefronts
-    Workgroups:
-      plain: The total number of workgroups forming this kernel launch.
-      rst: The total number of workgroups forming this kernel launch.
-      unit: Workgroups
-    LDS Req:
-      plain: The total number of LDS instructions (including, but not limited to,
-        read/write/atomics and HIP's __shfl instructions) executed per normalization
-        unit.
-      rst: The total number of LDS instructions (including, but not limited to,
-        read/write/atomics and HIP's ``__shfl`` instructions) executed
-        per :ref:`normalization unit <normalization-units>`.
-      unit: Instructions per normalization unit
-    LDS Util:
-      plain: Indicates what percent of the kernel's duration the LDS was actively
-        executing instructions (including, but not limited to, load, store, atomic
-        and HIP's __shfl operations). Calculated as the ratio of the total number
-        of cycles LDS was active over the total CU cycles.
-      rst: Indicates what percent of the kernel's duration the :ref:`LDS <desc-lds>` was
-        actively executing instructions (including, but not limited to, load, store,
-        atomic and HIP's ``__shfl`` operations). Calculated as the ratio of the
-        total number of cycles LDS was active over the :ref:`total CU cycles <total-cu-cycles>`.
-      unit: Percent
-    LDS Latency:
-      plain: The average number of round-trip cycles (i.e., from issue to data-return
-        / acknowledgment) required for an LDS instruction to complete.
-      rst: The average number of round-trip cycles (i.e., from issue to data-return /
-        acknowledgment) required for an LDS instruction to complete.
-      unit: Cycles
-    VL1 Rd:
-      plain: The total number of incoming read requests from the address processing
-        unit after coalescing per normalization unit
-      rst: The total number of incoming read requests from the :ref:`address processing
-        unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`
-      unit: Requests per normalization unit
-    VL1 Wr:
-      plain: The total number of incoming write requests from the address processing
-        unit after coalescing per normalization unit
-      rst: The total number of incoming write requests from the :ref:`address processing
-        unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`
-      unit: Requests per normalization unit
-    VL1 Atomic:
-      plain: The total number of incoming atomic requests from the address processing
-        unit after coalescing per normalization unit
-      rst: The total number of incoming atomic requests from the :ref:`address processing
-        unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`
-      unit: Requests per normalization unit
-    VL1 Hit:
-      plain: The ratio of the number of vL1D cache line requests that hit in vL1D
-        cache over the total number of cache line requests to the vL1D Cache RAM.
-      rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache
-        over the total number of cache line requests to the :ref:`vL1D Cache RAM
-        <desc-tc>`.
-      unit: Percent
-    VL1 Lat:
-      plain: Calculated as the average number of cycles that a vL1D cache line request
-        spent in the vL1D cache pipeline.
-      rst: Calculated as the average number of cycles that a vL1D cache line request
-        spent in the vL1D cache pipeline.
-      unit: Cycles
-    VL1 Coalesce:
-      plain: Indicates how well memory instructions were coalesced by the address
-        processing unit, ranging from uncoalesced (25%) to fully coalesced (100%).
-        Calculated as the average number of thread-requests generated per instruction
-        divided by the ideal number of thread-requests per instruction.
-      rst: Indicates how well memory instructions were coalesced by the :ref:`address
-        processing unit <desc-ta>`, ranging from uncoalesced (25%) to fully coalesced
-        (100%). Calculated as the average number of :ref:`thread-requests <thread-requests>`
-        generated per instruction divided by the ideal number of thread-requests per
-        instruction.
-      unit: Percent
-    VL1 Stall:
-      plain: The ratio of the number of cycles where the vL1D is stalled waiting to
-        issue a request for data to the L2 cache divided by the number of cycles where
-        the vL1D is active.
-      rst: The ratio of the number of cycles where the vL1D is stalled waiting to issue
-        a request for data to the :doc:`L2 cache <l2-cache>` divided by the number
-        of cycles where the vL1D is active [#vl1d-activity]_.
-      unit: Percent
-    VL1_L2 Rd:
-      plain: The number of read requests for a vL1D cache line that were not satisfied
-        by the vL1D and must be retrieved from the to the L2 Cache per normalization
-        unit.
-      rst: The number of read requests for a vL1D cache line that were not satisfied by
-        the vL1D and must be retrieved from the to the :doc:`L2 Cache <l2-cache>`
-        per :ref:`normalization unit <normalization-units>`.
-      unit: Requests per normalization unit
-    VL1_L2 Wr:
-      plain: The number of write requests to a vL1D cache line that were sent through
-        the vL1D to the L2 cache, per normalization unit.
-      rst: The number of write requests to a vL1D cache line that were sent through the
-        vL1D to the :doc:`L2 cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`.
-      unit: Requests per normalization unit
-    VL1_L2 Atomic:
-      plain: The number of atomic requests that are sent through the vL1D to the L2
-        cache, per normalization unit. This includes requests for atomics with, and
-        without return.
-      rst: The number of atomic requests that are sent through the vL1D to the :doc:`L2
-        cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`. This
-        includes requests for atomics with, and without return.
-      unit: Requests per normalization unit
-    sL1D Rd:
-      plain: The total number of requests, of any size or type, made to the sL1D per
-        normalization unit.
-      rst: The total number of requests, of any size or type, made to the sL1D per :ref:`normalization
-        unit <normalization-units>`.
-      unit: Requests per normalization unit
-    sL1D Hit:
-      plain: The total number of sL1D requests that hit on a previously loaded cache
-        line, per normalization unit.
-      rst: The total number of sL1D requests that hit on a previously loaded cache line,
-        per :ref:`normalization unit <normalization-units>`.
-      unit: Requests per normalization unit
-    sL1D_L2 Rd:
-      plain: The total number of read requests from sL1D to the L2, per normalization
-        unit.
-      rst: The total number of read requests from sL1D to the :doc:`L2 <l2-cache>`, per
-        :ref:`normalization unit <normalization-units>`.
-      unit: Requests per normalization unit
-    sL1D_L2 Wr:
-      plain: The total number of write requests from sL1D to the L2, per normalization
-        unit. Typically unused on current CDNA accelerators.
-      rst: The total number of write requests from sL1D to the :doc:`L2 <l2-cache>`, per
-        :ref:`normalization unit <normalization-units>`. Typically unused on current
-        CDNA accelerators.
-      unit: Requests per normalization unit
-    sL1D_L2 Atomic:
-      plain: The total number of atomic requests from sL1D to the L2, per normalization
-        unit. Typically unused on current CDNA accelerators.
-      rst: The total number of atomic requests from sL1D to the :doc:`L2 <l2-cache>`,
-        per :ref:`normalization unit <normalization-units>`. Typically unused on current
-        CDNA accelerators.
-      unit: Requests per normalization unit
-    IL1 Fetch:
-      plain: The total number of requests made to the L1I per normalization-unit.
-      rst: The total number of requests made to the L1I per :ref:`normalization-unit
-        <normalization-units>`.
-      unit: Requests per normalization unit
-    IL1 Hit:
-      plain: The percent of L1I requests that hit on a previously loaded line the
-        cache. Calculated as the ratio of the number of L1I requests that hit over
-        the number of all L1I requests.
-      rst: The total number of L1I requests that hit on a previously loaded cache line,
-        per :ref:`normalization-unit <normalization-units>`.
-      unit: Percent
-    IL1 Lat:
-      plain: The average number of cycles spent to fetch instructions to a CU.
-      rst: The average number of cycles spent to fetch instructions to a :doc:`CU
-        <compute-unit>`.
-      unit: Cycles
-    IL1_L2 Rd:
-      plain: The total number of requests across the L1I - L2 interface per normalization-unit.
-      rst: The total number of requests across the L1I - L2 interface per normalization-unit.
-      unit: Requests per normalization unit
-    L2 Rd:
-      plain: The total number of read requests to the L2 from all clients.
-      rst: The total number of read requests to the L2 from all clients.
-      unit: Requests per normalization unit
-    L2 Wr:
-      plain: The total number of write requests to the L2 from all clients.
-      rst: The total number of write requests to the L2 from all clients.
-      unit: Requests per normalization unit
-    L2 Atomic:
-      plain: The total number of atomic requests (with and without return) to the
-        L2 from all clients.
-      rst: The total number of atomic requests (with and without return) to the L2 from
-        all clients.
-      unit: Requests per normalization unit
-    L2 Hit:
-      plain: The ratio of the number of L2 cache line requests that hit in the L2
-        cache over the total number of incoming cache line requests to the L2 cache.
-      rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
-        over the total number of incoming cache line requests to the L2 cache.
-      unit: Percent
-    L2 Rd Lat:
-      plain: Calculated as the average number of cycles that the vL1D cache took to
-        issue and receive read requests from the L2 Cache. This number also includes
-        requests for atomics with return values.
-      rst: Calculated as the average number of cycles that the vL1D cache took to issue
-        and receive read requests from the :doc:`L2 Cache <l2-cache>`. This number
-        also includes requests for atomics with return values.
-      unit: Cycles
-    L2 Wr Lat:
-      plain: Calculated as the average number of cycles that the vL1D cache took to
-        issue and receive acknowledgement of a write request to the L2 Cache. This
-        number also includes requests for atomics without return values.
-      rst: Calculated as the average number of cycles that the vL1D cache took to issue
-        and receive acknowledgement of a write request to the :doc:`L2 Cache <l2-cache>`.
-        This number also includes requests for atomics without return values.
-      unit: Cycles
-    Fabric_L2 Rd:
-      plain: Number of L2 cache - Infinity Fabric read requests (either 32-byte or
-        64-byte) summed over TCC instances per normalization unit.
-      rst: Number of L2 cache - Infinity Fabric read requests (either 32-byte or 64-byte)
-        summed over TCC instances per normalization unit.
-      unit: Requests per normalization unit
-    Fabric_L2 Wr:
-      plain: Number of L2 cache - Infinity Fabric write requests (either 32-byte or
-        64-byte) summed over TCC instances per normalization unit.
-      rst: Number of L2 cache - Infinity Fabric write requests (either 32-byte or
-        64-byte) summed over TCC instances per normalization unit.
-      unit: Requests per normalization unit
-    Fabric_L2 Atomic:
-      plain: Number of L2 cache - Infinity Fabric write requests (either 32-byte or
-        64-byte) that are actually atomic requests summed over TCC instances per normalization
-        unit.
-      rst: Number of L2 cache - Infinity Fabric write requests (either 32-byte or
-        64-byte) that are actually atomic requests summed over TCC instances per normalization
-        unit.
-      unit: Requests per normalization unit
-    Fabric Rd Lat:
-      plain: The time-averaged number of cycles read requests spent in Infinity Fabric
-        before data was returned to the L2.
-      rst: The time-averaged number of cycles read requests spent in Infinity Fabric
-        before data was returned to the L2.
-      unit: Cycles
-    Fabric Wr Lat:
-      plain: The time-averaged number of cycles write requests spent in Infinity Fabric
-        before a completion acknowledgement was returned to the L2.
-      rst: The time-averaged number of cycles write requests spent in Infinity Fabric
-        before a completion acknowledgement was returned to the L2.
-      unit: Cycles
-    Fabric Atomic Lat:
-      plain: The time-averaged number of cycles atomic requests spent in Infinity
-        Fabric before a completion acknowledgement (atomic without return value) or
-        data (atomic with return value) was returned to the L2.
-      rst: The time-averaged number of cycles atomic requests spent in Infinity Fabric
-        before a completion acknowledgement (atomic without return value) or data
-        (atomic with return value) was returned to the L2.
-      unit: Cycles
-    HBM Rd:
-      plain: The total number of L2 requests to Infinity Fabric to read 32B or 64B
-        of data from the accelerator's local HBM, per normalization unit.
-      rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of data
-        from the accelerator's local HBM, per :ref:`normalization unit <normalization-units>`.
-        See :ref:`l2-request-flow` for more detail.
-      unit: Requests per normalization unit
-    HBM Wr:
-      plain: |-
-        The total number of L2 requests to Infinity Fabric to write or atomically
-        update 32B or 64B of data in the accelerator's local HBM, per normalization
-        unit.
-      rst: The total number of L2 requests to Infinity Fabric to write 32B or 64B
-        of data from the accelerator's local HBM, per :ref:`normalization unit <normalization-units>`.
-        See :ref:`l2-request-flow` for more detail.
-      unit: Requests per normalization unit
-- id: 400
-  title: Roofline
-  data source:
-  - metric_table:
-      id: 401
-      title: Roofline Performance Rates
-      cli_style: Roofline
-      tui_style: Roofline
-      header:
-        metric: Metric
-        value: Value
-        unit: Unit
-        peak: Peak (Empirical)
-      metric:
-        gfx90a:
-          VALU FLOPs (F16):
-            value: AVG((($wave_size * (
-              SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16
-              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $FP16Flops_empirical_peak
-          VALU FLOPs (F32):
-            value: AVG((($wave_size * (
-              SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32
-              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $FP32Flops_empirical_peak
-          VALU FLOPs (F64):
-            value: AVG((($wave_size * (
-              SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64
-              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $FP64Flops_empirical_peak
-          MFMA FLOPs (F64):
-            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $MFMAF64Flops_empirical_peak
-          MFMA FLOPs (F32):
-            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $MFMAF32Flops_empirical_peak
-          MFMA FLOPs (F16):
-            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $MFMAF16Flops_empirical_peak
-          MFMA FLOPs (BF16):
-            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $MFMABF16Flops_empirical_peak
-          MFMA IOPs (Int8):
-            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GIOP/s
-            peak: $MFMAI8Ops_empirical_peak
-          HBM Bandwidth:
-            value: AVG(((
-              (TCC_EA_RDREQ_32B_sum * 32) +
-              ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64) +
-              (TCC_EA_WRREQ_64B_sum * 64) +
-              ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32)
-              ) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GB/s
-            peak: $HBMBw_empirical_peak
-          L2 Cache Bandwidth:
-            value: AVG(((((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
-              TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) *
-              64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GB/s
-            peak: $L2Bw_empirical_peak
-          L1 Cache Bandwidth:
-            value: AVG((((TCP_TOTAL_CACHE_ACCESSES_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GB/s
-            peak: $L1Bw_empirical_peak
-          LDS Bandwidth:
-            value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) *
-              4 * $lds_banks_per_cu)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GB/s
-            peak: $LDSBw_empirical_peak
-        gfx908:
-          VALU FLOPs (F16):
-            value: AVG((($wave_size * (
-              SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16
-              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $FP16Flops_empirical_peak
-          VALU FLOPs (F32):
-            value: AVG((($wave_size * (
-              SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32
-              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $FP32Flops_empirical_peak
-          VALU FLOPs (F64):
-            value: AVG((($wave_size * (
-              SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64
-              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $FP64Flops_empirical_peak
-          MFMA FLOPs (F64):
-            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $MFMAF64Flops_empirical_peak
-          MFMA FLOPs (F32):
-            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $MFMAF32Flops_empirical_peak
-          MFMA FLOPs (F16):
-            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $MFMAF16Flops_empirical_peak
-          MFMA FLOPs (BF16):
-            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $MFMABF16Flops_empirical_peak
-          MFMA IOPs (Int8):
-            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GIOP/s
-            peak: $MFMAI8Ops_empirical_peak
-          HBM Bandwidth:
-            value: AVG(((
-              (TCC_BUBBLE_sum * 128) +
-              (TCC_EA0_RDREQ_32B_sum * 32) +
-              ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) +
-              ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) +
-              (TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GB/s
-            peak: $HBMBw_empirical_peak
-          L2 Cache Bandwidth:
-            value: AVG(((((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
-              TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) *
-              64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GB/s
-            peak: $L2Bw_empirical_peak
-          L1 Cache Bandwidth:
-            value: AVG((((TCP_TOTAL_CACHE_ACCESSES_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GB/s
-            peak: $L1Bw_empirical_peak
-          LDS Bandwidth:
-            value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) *
-              4 * $lds_banks_per_cu)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GB/s
-            peak: $LDSBw_empirical_peak
-        gfx940:
-          VALU FLOPs (F16):
-            value: AVG((($wave_size * (
-              SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16
-              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $FP16Flops_empirical_peak
-          VALU FLOPs (F32):
-            value: AVG((($wave_size * (
-              SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32
-              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $FP32Flops_empirical_peak
-          VALU FLOPs (F64):
-            value: AVG((($wave_size * (
-              SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64
-              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $FP64Flops_empirical_peak
-          MFMA FLOPs (F64):
-            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $MFMAF64Flops_empirical_peak
-          MFMA FLOPs (F32):
-            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $MFMAF32Flops_empirical_peak
-          MFMA FLOPs (F16):
-            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $MFMAF16Flops_empirical_peak
-          MFMA FLOPs (BF16):
-            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $MFMABF16Flops_empirical_peak
-          MFMA FLOPs (F8):
-            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $MFMAF8Flops_empirical_peak
-          MFMA IOPs (Int8):
-            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GIOP/s
-            peak: $MFMAI8Ops_empirical_peak
-          HBM Bandwidth:
-            value: AVG(((
-              (TCC_BUBBLE_sum * 128) +
-              (TCC_EA0_RDREQ_32B_sum * 32) +
-              ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) +
-              ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) +
-              (TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GB/s
-            peak: $HBMBw_empirical_peak
-          L2 Cache Bandwidth:
-            value: AVG(((((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
-              TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) *
-              64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GB/s
-            peak: $L2Bw_empirical_peak
-          L1 Cache Bandwidth:
-            value: AVG((((TCP_TOTAL_CACHE_ACCESSES_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GB/s
-            peak: $L1Bw_empirical_peak
-          LDS Bandwidth:
-            value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) *
-              4 * $lds_banks_per_cu)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GB/s
-            peak: $LDSBw_empirical_peak
-        gfx941:
-          VALU FLOPs (F16):
-            value: AVG((($wave_size * (
-              SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16
-              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $FP16Flops_empirical_peak
-          VALU FLOPs (F32):
-            value: AVG((($wave_size * (
-              SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32
-              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $FP32Flops_empirical_peak
-          VALU FLOPs (F64):
-            value: AVG((($wave_size * (
-              SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64
-              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $FP64Flops_empirical_peak
-          MFMA FLOPs (F64):
-            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $MFMAF64Flops_empirical_peak
-          MFMA FLOPs (F32):
-            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $MFMAF32Flops_empirical_peak
-          MFMA FLOPs (F16):
-            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $MFMAF16Flops_empirical_peak
-          MFMA FLOPs (BF16):
-            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $MFMABF16Flops_empirical_peak
-          MFMA FLOPs (F8):
-            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $MFMAF8Flops_empirical_peak
-          MFMA IOPs (Int8):
-            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GIOP/s
-            peak: $MFMAI8Ops_empirical_peak
-          HBM Bandwidth:
-            value: AVG(((
-              (TCC_BUBBLE_sum * 128) +
-              (TCC_EA0_RDREQ_32B_sum * 32) +
-              ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) +
-              ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) +
-              (TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GB/s
-            peak: $HBMBw_empirical_peak
-          L2 Cache Bandwidth:
-            value: AVG(((((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
-              TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) *
-              64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GB/s
-            peak: $L2Bw_empirical_peak
-          L1 Cache Bandwidth:
-            value: AVG((((TCP_TOTAL_CACHE_ACCESSES_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GB/s
-            peak: $L1Bw_empirical_peak
-          LDS Bandwidth:
-            value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) *
-              4 * $lds_banks_per_cu)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GB/s
-            peak: $LDSBw_empirical_peak
-        gfx942:
-          VALU FLOPs (F16):
-            value: AVG((($wave_size * (
-              SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16
-              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $FP16Flops_empirical_peak
-          VALU FLOPs (F32):
-            value: AVG((($wave_size * (
-              SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32
-              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $FP32Flops_empirical_peak
-          VALU FLOPs (F64):
-            value: AVG((($wave_size * (
-              SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64
-              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $FP64Flops_empirical_peak
-          MFMA FLOPs (F64):
-            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $MFMAF64Flops_empirical_peak
-          MFMA FLOPs (F32):
-            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $MFMAF32Flops_empirical_peak
-          MFMA FLOPs (F16):
-            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $MFMAF16Flops_empirical_peak
-          MFMA FLOPs (BF16):
-            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $MFMABF16Flops_empirical_peak
-          MFMA FLOPs (F8):
-            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $MFMAF8Flops_empirical_peak
-          MFMA IOPs (Int8):
-            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GIOP/s
-            peak: $MFMAI8Ops_empirical_peak
-          HBM Bandwidth:
-            value: AVG(((
-              (TCC_BUBBLE_sum * 128) +
-              (TCC_EA0_RDREQ_32B_sum * 32) +
-              ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) +
-              ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) +
-              (TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GB/s
-            peak: $HBMBw_empirical_peak
-          L2 Cache Bandwidth:
-            value: AVG(((((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
-              TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) *
-              64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GB/s
-            peak: $L2Bw_empirical_peak
-          L1 Cache Bandwidth:
-            value: AVG((((TCP_TOTAL_CACHE_ACCESSES_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GB/s
-            peak: $L1Bw_empirical_peak
-          LDS Bandwidth:
-            value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) *
-              4 * $lds_banks_per_cu)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GB/s
-            peak: $LDSBw_empirical_peak
-        gfx950:
-          VALU FLOPs (F16):
-            value: AVG((($wave_size * (
-              SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16
-              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $FP16Flops_empirical_peak
-          VALU FLOPs (F32):
-            value: AVG((($wave_size * (
-              SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32
-              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $FP32Flops_empirical_peak
-          VALU FLOPs (F64):
-            value: AVG((($wave_size * (
-              SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64
-              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $FP64Flops_empirical_peak
-          MFMA FLOPs (F64):
-            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $MFMAF64Flops_empirical_peak
-          MFMA FLOPs (F32):
-            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $MFMAF32Flops_empirical_peak
-          MFMA FLOPs (F16):
-            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $MFMAF16Flops_empirical_peak
-          MFMA FLOPs (BF16):
-            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $MFMABF16Flops_empirical_peak
-          MFMA FLOPs (F8):
-            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $MFMAF8Flops_empirical_peak
-          MFMA FLOPs (F6F4):
-            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GFLOP/s
-            peak: $MFMA_FLOPs_F6F4_empirical_peak
-          MFMA IOPs (Int8):
-            value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GIOP/s
-            peak: $MFMAI8Ops_empirical_peak
-          HBM Bandwidth:
-            value: AVG(((
-              (TCC_BUBBLE_sum * 128) +
-              (TCC_EA0_RDREQ_32B_sum * 32) +
-              ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) +
-              ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) +
-              (TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GB/s
-            peak: $HBMBw_empirical_peak
-          L2 Cache Bandwidth:
-            value: AVG(((((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
-              TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) *
-              64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GB/s
-            peak: $L2Bw_empirical_peak
-          L1 Cache Bandwidth:
-            value: AVG((((TCP_TOTAL_CACHE_ACCESSES_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GB/s
-            peak: $L1Bw_empirical_peak
-          LDS Bandwidth:
-            value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) *
-              4 * $lds_banks_per_cu)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
-            unit: GB/s
-            peak: $LDSBw_empirical_peak
-  - metric_table:
-      id: 402
-      title: Roofline Plot Points
-      cli_style: Roofline
-      tui_style: Roofline
-      header:
-        metric: Metric
-        value: Value
-        unit: Unit
-      metric:
-        gfx90a:
-          AI HBM:
-            value: (
-              SUM(
-                ($wave_size * (
-                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
-                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
-                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
-                )) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)
-              ) /
-              SUM(
-                (TCC_EA_RDREQ_32B_sum * 32) +
-                ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64) +
-                (TCC_EA_WRREQ_64B_sum * 64) +
-                ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32)
-              )
-              )
-            unit: FLOPs/Byte
-          AI L2:
-            value: (
-              SUM(
-                ($wave_size * (
-                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
-                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
-                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
-                )) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)
-              ) /
-              SUM(
-                (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
-                TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64
-              )
-              )
-            unit: FLOPs/Byte
-          AI L1:
-            value: (
-              SUM(
-                ($wave_size * (
-                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
-                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
-                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
-                )) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)
-              ) /
-              SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64)
-              )
-            unit: FLOPs/Byte
-          Performance (GFLOPs):
-            value: (
-              SUM(
-                ($wave_size * (
-                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
-                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
-                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
-                )) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)
-              ) /
-              (SUM(End_Timestamp - Start_Timestamp) / 1e9)
-              ) / 1e9
-            unit: GFLOP/s
-        gfx908:
-          AI HBM:
-            value: (
-              SUM(
-                ($wave_size * (
-                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
-                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
-                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
-                )) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)
-              ) /
-              SUM(
-                (TCC_BUBBLE_sum * 128) +
-                (TCC_EA0_RDREQ_32B_sum * 32) +
-                ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) +
-                ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) +
-                (TCC_EA0_WRREQ_64B_sum * 64)
-              )
-              )
-            unit: FLOPs/Byte
-          AI L2:
-            value: (
-              SUM(
-                ($wave_size * (
-                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
-                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
-                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
-                )) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)
-              ) /
-              SUM(
-                (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
-                TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64
-              )
-              )
-            unit: FLOPs/Byte
-          AI L1:
-            value: (
-              SUM(
-                ($wave_size * (
-                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
-                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
-                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
-                )) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)
-              ) /
-              SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64)
-              )
-            unit: FLOPs/Byte
-          Performance (GFLOPs):
-            value: (
-              SUM(
-                ($wave_size * (
-                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
-                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
-                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
-                )) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)
-              ) /
-              (SUM(End_Timestamp - Start_Timestamp) / 1e9)
-              ) / 1e9
-            unit: GFLOP/s
-        gfx940:
-          AI HBM:
-            value: (
-              SUM(
-                ($wave_size * (
-                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
-                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
-                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
-                )) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)
-              ) /
-              SUM(
-                (TCC_BUBBLE_sum * 128) +
-                (TCC_EA0_RDREQ_32B_sum * 32) +
-                ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) +
-                ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) +
-                (TCC_EA0_WRREQ_64B_sum * 64)
-              )
-              )
-            unit: FLOPs/Byte
-          AI L2:
-            value: (
-              SUM(
-                ($wave_size * (
-                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
-                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
-                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
-                )) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
-              ) /
-              SUM(
-                (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
-                TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64
-              )
-              )
-            unit: FLOPs/Byte
-          AI L1:
-            value: (
-              SUM(
-                ($wave_size * (
-                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
-                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
-                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
-                )) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
-              ) /
-              SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64)
-              )
-            unit: FLOPs/Byte
-          Performance (GFLOPs):
-            value: (
-              SUM(
-                ($wave_size * (
-                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
-                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
-                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
-                )) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)
-              ) /
-              (SUM(End_Timestamp - Start_Timestamp) / 1e9)
-              ) / 1e9
-            unit: GFLOP/s
-        gfx941:
-          AI HBM:
-            value: (
-              SUM(
-                ($wave_size * (
-                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
-                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
-                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
-                )) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)
-              ) /
-              SUM(
-                (TCC_BUBBLE_sum * 128) +
-                (TCC_EA0_RDREQ_32B_sum * 32) +
-                ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) +
-                ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) +
-                (TCC_EA0_WRREQ_64B_sum * 64)
-              )
-              )
-            unit: FLOPs/Byte
-          AI L2:
-            value: (
-              SUM(
-                ($wave_size * (
-                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
-                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
-                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
-                )) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)
-              ) /
-              SUM(
-                (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
-                TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64
-              )
-              )
-            unit: FLOPs/Byte
-          AI L1:
-            value: (
-              SUM(
-                ($wave_size * (
-                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
-                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
-                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
-                )) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)
-              ) /
-              SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64)
-              )
-            unit: FLOPs/Byte
-          Performance (GFLOPs):
-            value: (
-              SUM(
-                ($wave_size * (
-                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
-                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
-                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
-                )) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)
-              ) /
-              (SUM(End_Timestamp - Start_Timestamp) / 1e9)
-              ) / 1e9
-            unit: GFLOP/s
-        gfx942:
-          AI HBM:
-            value: ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16
-              + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32)
-              + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64)
-              + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
-              (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 *
-              512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8
-              * 512) ) / SUM( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32)
-              + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64)
-              + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum
-              * 64) ) )
-            unit: FLOPs/Byte
-          AI L2:
-            value: ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16
-              + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32)
-              + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64)
-              + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
-              (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 *
-              512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8
-              * 512) ) / SUM( (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
-              TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64 ) )
-            unit: FLOPs/Byte
-          AI L1:
-            value: ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16
-              + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32)
-              + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64)
-              + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
-              (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 *
-              512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8
-              * 512) ) / SUM( TCP_TOTAL_CACHE_ACCESSES_sum * 64 ) )
-            unit: FLOPs/Byte
-          Performance (GFLOPs):
-            value: ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16
-              + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32)
-              + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64)
-              + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
-              (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 *
-              512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8
-              * 512) ) / (SUM(End_Timestamp - Start_Timestamp) / 1e9) ) / 1e9
-            unit: GFLOP/s
-        gfx950:
-          AI HBM:
-            value: (
-              SUM(
-                ($wave_size * (
-                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
-                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
-                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
-                )) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512)
-              ) /
-              SUM(
-                (TCC_BUBBLE_sum * 128) +
-                (TCC_EA0_RDREQ_32B_sum * 32) +
-                ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) +
-                ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) +
-                (TCC_EA0_WRREQ_64B_sum * 64)
-              )
-              )
-            unit: FLOPs/Byte
-          AI L2:
-            value: (
-              SUM(
-                ($wave_size * (
-                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
-                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
-                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
-                )) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512)
-              ) /
-              SUM(
-                (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum +
-                TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64
-              )
-              )
-            unit: FLOPs/Byte
-          AI L1:
-            value: (
-              SUM(
-                ($wave_size * (
-                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
-                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
-                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
-                )) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512)
-              ) /
-              SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64)
-              )
-            unit: FLOPs/Byte
-          Performance (GFLOPs):
-            value: (
-              SUM(
-                ($wave_size * (
-                  (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
-                  (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
-                  (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
-                )) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) +
-                (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512)
-              ) /
-              (SUM(End_Timestamp - Start_Timestamp) / 1e9)
-              ) / 1e9
-            unit: GFLOP/s
-  metrics_description:
-      VALU FLOPs (F16):
-        plain: |-
-          The total 16-bit floating-point operations executed per second on the VALU.
-          This is presented with the value of the peak empirical F16 FLOPs achievable
-          on the specific accelerator. Note: this does not include any F16 operations
-          from MFMA instructions.
-        rst: |-
-          The total 16-bit floating-point operations executed per second on the :ref:`VALU
-          <desc-valu>`. This is presented with the value of the peak empirical F16 FLOPs achievable
-          on the specific accelerator. Note: this does not include any F16 operations
-          from :ref:`MFMA <desc-mfma>` instructions.
-        unit: GFLOPs
-      VALU FLOPs (F32):
-        plain: |-
-          The total 32-bit floating-point operations executed per second on the VALU.
-          This is presented with the value of the peak empirical F32 FLOPs achievable
-          on the specific accelerator. Note: this does not include any F32 operations
-          from MFMA instructions.
-        rst: |-
-          The total 32-bit floating-point operations executed per second on the :ref:`VALU
-          <desc-valu>`. This is presented with the value of the peak empirical F32 FLOPs achievable
-          on the specific accelerator. Note: this does not include any F32 operations
-          from :ref:`MFMA <desc-mfma>` instructions.
-        unit: GFLOPs
-      VALU FLOPs (F64):
-        plain: |-
-          The total 64-bit floating-point operations executed per second on the VALU.
-          This is presented with the value of the peak empirical F64 FLOPs achievable
-          on the specific accelerator. Note: this does not include any F64 operations
-          from MFMA instructions.
-        rst: |-
-          The total 64-bit floating-point operations executed per second on the :ref:`VALU
-          <desc-valu>`. This is presented with the value of the peak empirical F64 FLOPs achievable
-          on the specific accelerator. Note: this does not include any F64 operations
-          from :ref:`MFMA <desc-mfma>` instructions.
-        unit: GFLOPs
-      MFMA FLOPs (F8):
-        plain: The total number of 8-bit brain floating point MFMA operations executed
-          per second. This does not include any 16-bit brain floating point operations
-          from VALU instructions. The peak empirically measured F8 MFMA operations
-          achievable on the specific accelerator is displayed alongside for comparison.
-          It is supported on AMD Instinct MI300 series and later only.
-        rst: |-
-          The total number of 8-bit brain floating point :ref:`MFMA <desc-mfma>`
-          operations executed per second. Note: this does not include any 16-bit brain
-          floating point operations from :ref:`VALU <desc-valu>` instructions. The
-          peak empirically measured F8 MFMA operations achievable on the specific
-          accelerator is displayed alongside for comparison. It is supported on AMD
-          Instinct MI300 series and later only.
-        unit: GFLOPs
-      MFMA FLOPs (BF16):
-        plain: |-
-          The total number of 16-bit brain floating point MFMA operations executed
-          per second. Note: this does not include any 16-bit brain floating point
-          operations from VALU instructions. The peak empirically measured BF16 MFMA
-          operations achievable on the specific accelerator is displayed alongside
-          for comparison.
-        rst: |-
-          The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
-          operations executed per second. Note: this does not include any 16-bit brain
-          floating point operations from :ref:`VALU <desc-valu>` instructions. The
-          peak empirically measured BF16 MFMA operations achievable on the specific
-          accelerator is displayed alongside for comparison.
-        unit: GFLOPs
-      MFMA FLOPs (F16):
-        plain: |-
-          The total number of 16-bit floating point MFMA operations executed per
-          second. Note: this does not include any 16-bit floating point operations from
-          VALU instructions. The peak empirically measured F16 MFMA operations
-          achievable on the specific accelerator is displayed alongside for comparison.
-        rst: |-
-          The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
-          executed per second. Note: this does not include any 16-bit floating point
-          operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
-          measured F16 MFMA operations achievable on the specific accelerator is
-          displayed alongside for comparison.
-        unit: GFLOPs
-      MFMA FLOPs (F32):
-        plain: |-
-          The total number of 32-bit floating point MFMA operations executed per
-          second. Note: this does not include any 32-bit floating point operations from
-          VALU instructions. The peak empirically measured F32 MFMA operations
-          achievable on the specific accelerator is displayed alongside for comparison.
-        rst: |-
-          The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
-          executed per second. Note: this does not include any 32-bit floating point
-          operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
-          measured F32 MFMA operations achievable on the specific accelerator is
-          displayed alongside for comparison.
-        unit: GFLOPs
-      MFMA FLOPs (F64):
-        plain: |-
-          The total number of 64-bit floating point MFMA operations executed per
-          second. Note: this does not include any 64-bit floating point operations from
-          VALU instructions. The peak empirically measured F64 MFMA operations
-          achievable on the specific accelerator is displayed alongside for comparison.
-        rst: |-
-          The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
-          executed per second. Note: this does not include any 64-bit floating point
-          operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
-          measured F64 MFMA operations achievable on the specific accelerator is
-          displayed alongside for comparison.
-        unit: GFLOPs
-      MFMA FLOPs (F6F4):
-        plain: |-
-          The total number of 4-bit and 6-bit floating point MFMA operations executed
-          per second. Note: this does not include any floating point operations from
-          VALU instructions. The peak empirically measured F6F4 MFMA operations
-          achievable on the specific accelerator is displayed alongside for comparison.
-          It is supported on AMD Instinct MI350 series (gfx950) and later only.
-        rst: |-
-          The total number of 4-bit and 6-bit floating point :ref:`MFMA <desc-mfma>`
-          operations executed per second. Note: this does not include any floating point
-          operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
-          measured F6F4 MFMA operations achievable on the specific accelerator is
-          displayed alongside for comparison. It is supported on AMD Instinct MI350
-          series (gfx950) and later only.
-        unit: GFLOPs
-      MFMA IOPs (Int8):
-        plain: |-
-          The total number of 8-bit integer MFMA operations executed per second.
-          Note: this does not include any 8-bit integer operations from VALU instructions.
-          The peak empirically measured INT8 MFMA operations achievable on the specific
-          accelerator is displayed alongside for comparison.
-        rst: |-
-          The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
-          per second. Note: this does not include any 8-bit integer operations from
-          :ref:`VALU <desc-valu>` instructions. The peak empirically measured INT8 MFMA
-          operations achievable on the specific accelerator is displayed alongside
-          for comparison.
-        unit: GIOPs
-      HBM Bandwidth:
-        plain: |-
-          The total number of bytes read from and written to High-Bandwidth
-          Memory (HBM) per second. The peak empirically measured bandwidth achievable
-          on the specific accelerator is displayed alongside for comparison.
-        rst: |-
-          The total number of bytes read from and written to High-Bandwidth
-          Memory (HBM) per second. The peak empirically measured bandwidth achievable
-          on the specific accelerator is displayed alongside for comparison.
-        unit: GB/s
-      L2 Cache Bandwidth:
-        plain: The number of bytes looked up in the L2 cache per unit time. The number
-          of bytes is calculated as the number of cache lines requested multiplied by
-          the cache line size. This value does not consider partial requests, so e.g.,
-          if only a single value is requested in a cache line, the data movement will
-          still be counted as a full cache line. The peak empirically measured bandwidth
-          achievable on the specific accelerator is displayed alongside for comparison.
-        rst: The number of bytes looked up in the L2 cache per unit time. The number of
-          bytes is calculated as the number of cache lines requested multiplied by
-          the cache line size. This value does not consider partial requests, so e.g.,
-          if only a single value is requested in a cache line, the data movement will
-          still be counted as a full cache line. The peak empirically measured
-          bandwidth achievable on the specific accelerator is displayed alongside
-          for comparison.
-        unit: GB/s
-      L1 Cache Bandwidth:
-        plain: The number of bytes looked up in the vL1D cache as a result of VMEM
-          instructions per unit time. The number of bytes is calculated as the number
-          of cache lines requested multiplied by the cache line size. This value does
-          not consider partial requests, so e.g., if only a single value is requested
-          in a cache line, the data movement will still be counted as a full cache line.
-          The peak empirically measured bandwidth achievable on the specific accelerator
-          is displayed alongside for comparison.
-        rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
-          <desc-vmem>` instructions per unit time. The number of bytes is calculated
-          as the number of cache lines requested multiplied by the cache line size.
-          This value does not consider partial requests, so e.g., if only a single
-          value is requested in a cache line, the data movement will still be counted
-          as a full cache line. The peak empirically measured bandwidth achievable on
-          the specific accelerator is displayed alongside for comparison.
-        unit: GB/s
-      LDS Bandwidth:
-        plain: Indicates the maximum amount of bytes that could have been loaded from,
-          stored to, or atomically updated in the LDS per unit time (see LDS Bandwidth
-          example for more detail). The peak empirically measured LDS bandwidth
-          achievable on the specific accelerator is displayed alongside for comparison.
-        rst: Indicates the maximum amount of bytes that could have been loaded from,
-          stored to, or atomically updated in the LDS per unit time (see :ref:`LDS
-          Bandwidth <lds-bandwidth>` example for more detail). The peak empirically
-          measured LDS bandwidth achievable on the specific accelerator is displayed
-          alongside for comparison.
-        unit: GB/s
-      AI L1:
-        plain: |-
-          The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
-          of total floating-point operations (FLOPs) to total bytes transferred between
-          the L1 cache and the processing units. This value is used as the x-coordinate
-          for the L1 roofline.
-        rst: |-
-          The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
-          of total floating-point operations (FLOPs) to total bytes transferred between
-          the L1 cache and the processing units. This value is used as the x-coordinate
-          for the L1 roofline.
-        unit: FLOPs/Byte
-      AI L2:
-        plain: |-
-          The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
-          of total floating-point operations (FLOPs) to total bytes transferred between
-          the L2 cache and the L1 cache. This value is used as the x-coordinate for
-          the L2 roofline.
-        rst: |-
-          The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
-          of total floating-point operations (FLOPs) to total bytes transferred between
-          the L2 cache and the L1 cache. This value is used as the x-coordinate for
-          the L2 roofline.
-        unit: FLOPs/Byte
-      AI HBM:
-        plain: |-
-          The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
-          It is the ratio of total floating-point operations (FLOPs) to total bytes
-          transferred between HBM and the L2 cache. This value is used as the x-coordinate
-          for the HBM roofline.
-        rst: |-
-          The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
-          It is the ratio of total floating-point operations (FLOPs) to total bytes
-          transferred between HBM and the L2 cache. This value is used as the x-coordinate
-          for the HBM roofline.
-        unit: FLOPs/Byte
-      Performance (GFLOPs):
-        plain: |-
-          The overall achieved performance, measured in GigaFLOPs
-          per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
-          operations divided by the total execution time. This value is used as the y-coordinate
-          for the kernel's point on the Roofline plot.
-        rst: |-
-          The overall achieved performance, measured in GigaFLOPs
-          per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
-          operations divided by the total execution time. This value is used as the y-coordinate
-          for the kernel's point on the Roofline plot.
-        unit: GFLOP/s
-- id: 500
-  title: Command Processor (CPC/CPF)
-  data source:
-  - metric_table:
-      id: 501
-      title: Command processor fetcher (CPF)
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      metric:
-        gfx90a:
-          CPF Utilization:
-            avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
-              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
-            min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
-              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
-            max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
-              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
-            unit: pct
-          CPF Stall:
-            avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
-              != 0) else None))
-            min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
-              != 0) else None))
-            max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
-              != 0) else None))
-            unit: pct
-          CPF-L2 Utilization:
-            avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
-              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
-            min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
-              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
-            max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
-              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
-            unit: pct
-          CPF-L2 Stall:
-            avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
-              != 0) else None))
-            min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
-              != 0) else None))
-            max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
-              != 0) else None))
-            unit: pct
-          CPF-UTCL1 Stall:
-            avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
-              if (CPF_CPF_STAT_BUSY != 0) else None)
-            min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
-              if (CPF_CPF_STAT_BUSY != 0) else None)
-            max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
-              if (CPF_CPF_STAT_BUSY != 0) else None)
-            unit: pct
-        gfx941:
-          CPF Utilization:
-            avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
-              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
-            min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
-              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
-            max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
-              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
-            unit: pct
-          CPF Stall:
-            avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
-              != 0) else None))
-            min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
-              != 0) else None))
-            max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
-              != 0) else None))
-            unit: pct
-          CPF-L2 Utilization:
-            avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
-              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
-            min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
-              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
-            max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
-              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
-            unit: pct
-          CPF-L2 Stall:
-            avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
-              != 0) else None))
-            min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
-              != 0) else None))
-            max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
-              != 0) else None))
-            unit: pct
-          CPF-UTCL1 Stall:
-            avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
-              if (CPF_CPF_STAT_BUSY != 0) else None)
-            min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
-              if (CPF_CPF_STAT_BUSY != 0) else None)
-            max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
-              if (CPF_CPF_STAT_BUSY != 0) else None)
-            unit: pct
-        gfx940:
-          CPF Utilization:
-            avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
-              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
-            min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
-              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
-            max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
-              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
-            unit: pct
-          CPF Stall:
-            avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
-              != 0) else None))
-            min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
-              != 0) else None))
-            max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
-              != 0) else None))
-            unit: pct
-          CPF-L2 Utilization:
-            avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
-              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
-            min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
-              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
-            max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
-              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
-            unit: pct
-          CPF-L2 Stall:
-            avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
-              != 0) else None))
-            min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
-              != 0) else None))
-            max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
-              != 0) else None))
-            unit: pct
-          CPF-UTCL1 Stall:
-            avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
-              if (CPF_CPF_STAT_BUSY != 0) else None)
-            min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
-              if (CPF_CPF_STAT_BUSY != 0) else None)
-            max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
-              if (CPF_CPF_STAT_BUSY != 0) else None)
-            unit: pct
-        gfx942:
-          CPF Utilization:
-            avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
-              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
-            min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
-              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
-            max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
-              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
-            unit: pct
-          CPF Stall:
-            avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
-              != 0) else None))
-            min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
-              != 0) else None))
-            max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
-              != 0) else None))
-            unit: pct
-          CPF-L2 Utilization:
-            avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
-              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
-            min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
-              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
-            max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
-              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
-            unit: pct
-          CPF-L2 Stall:
-            avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
-              != 0) else None))
-            min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
-              != 0) else None))
-            max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
-              != 0) else None))
-            unit: pct
-          CPF-UTCL1 Stall:
-            avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
-              if (CPF_CPF_STAT_BUSY != 0) else None)
-            min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
-              if (CPF_CPF_STAT_BUSY != 0) else None)
-            max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
-              if (CPF_CPF_STAT_BUSY != 0) else None)
-            unit: pct
-        gfx950:
-          CPF Utilization:
-            avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
-              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
-            min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
-              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
-            max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
-              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
-            unit: pct
-          CPF Stall:
-            avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
-              != 0) else None))
-            min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
-              != 0) else None))
-            max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
-              != 0) else None))
-            unit: pct
-          CPF-L2 Utilization:
-            avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
-              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
-            min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
-              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
-            max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
-              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
-            unit: pct
-          CPF-L2 Stall:
-            avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
-              != 0) else None))
-            min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
-              != 0) else None))
-            max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
-              != 0) else None))
-            unit: pct
-          CPF-UTCL1 Stall:
-            avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
-              if (CPF_CPF_STAT_BUSY != 0) else None)
-            min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
-              if (CPF_CPF_STAT_BUSY != 0) else None)
-            max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
-              if (CPF_CPF_STAT_BUSY != 0) else None)
-            unit: pct
-        gfx908:
-          CPF Utilization:
-            avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
-              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
-            min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
-              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
-            max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE))
-              if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None))
-            unit: pct
-          CPF Stall:
-            avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
-              != 0) else None))
-            min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
-              != 0) else None))
-            max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY
-              != 0) else None))
-            unit: pct
-          CPF-L2 Utilization:
-            avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
-              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
-            min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
-              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
-            max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE))
-              if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None))
-            unit: pct
-          CPF-L2 Stall:
-            avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
-              != 0) else None))
-            min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
-              != 0) else None))
-            max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY
-              != 0) else None))
-            unit: pct
-          CPF-UTCL1 Stall:
-            avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
-              if (CPF_CPF_STAT_BUSY != 0) else None)
-            min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
-              if (CPF_CPF_STAT_BUSY != 0) else None)
-            max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY)
-              if (CPF_CPF_STAT_BUSY != 0) else None)
-            unit: pct
-  - metric_table:
-      id: 502
-      title: Command processor packet processor (CPC)
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      metric:
-        gfx90a:
-          CPC Utilization:
-            avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
-              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
-            min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
-              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
-            max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
-              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
-            unit: pct
-          CPC Stall Rate:
-            avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
-              != 0) else None))
-            min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
-              != 0) else None))
-            max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
-              != 0) else None))
-            unit: pct
-          CPC Packet Decoding Utilization:
-            avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
-              (CPC_CPC_STAT_BUSY != 0) else None)
-            min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
-              (CPC_CPC_STAT_BUSY != 0) else None)
-            max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
-              (CPC_CPC_STAT_BUSY != 0) else None)
-            unit: pct
-          CPC-Workgroup Manager Utilization:
-            avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
-              != 0) else None)
-            min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
-              != 0) else None)
-            max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
-              != 0) else None)
-            unit: Pct
-          CPC-L2 Utilization:
-            avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
-              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
-            min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
-              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
-            max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
-              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
-            unit: pct
-          CPC-UTCL1 Stall:
-            avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
-              if (CPC_CPC_STAT_BUSY != 0) else None)
-            min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
-              if (CPC_CPC_STAT_BUSY != 0) else None)
-            max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
-              if (CPC_CPC_STAT_BUSY != 0) else None)
-            unit: pct
-          CPC-UTCL2 Utilization:
-            avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
-              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
-            min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
-              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
-            max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
-              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
-            unit: pct
-        gfx941:
-          CPC Utilization:
-            avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
-              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
-            min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
-              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
-            max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
-              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
-            unit: pct
-          CPC Stall Rate:
-            avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
-              != 0) else None))
-            min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
-              != 0) else None))
-            max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
-              != 0) else None))
-            unit: pct
-          CPC Packet Decoding Utilization:
-            avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
-              (CPC_CPC_STAT_BUSY != 0) else None)
-            min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
-              (CPC_CPC_STAT_BUSY != 0) else None)
-            max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
-              (CPC_CPC_STAT_BUSY != 0) else None)
-            unit: pct
-          CPC-Workgroup Manager Utilization:
-            avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
-              != 0) else None)
-            min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
-              != 0) else None)
-            max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
-              != 0) else None)
-            unit: Pct
-          CPC-L2 Utilization:
-            avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
-              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
-            min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
-              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
-            max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
-              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
-            unit: pct
-          CPC-UTCL1 Stall:
-            avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
-              if (CPC_CPC_STAT_BUSY != 0) else None)
-            min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
-              if (CPC_CPC_STAT_BUSY != 0) else None)
-            max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
-              if (CPC_CPC_STAT_BUSY != 0) else None)
-            unit: pct
-          CPC-UTCL2 Utilization:
-            avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
-              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
-            min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
-              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
-            max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
-              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
-            unit: pct
-        gfx940:
-          CPC Utilization:
-            avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
-              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
-            min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
-              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
-            max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
-              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
-            unit: pct
-          CPC Stall Rate:
-            avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
-              != 0) else None))
-            min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
-              != 0) else None))
-            max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
-              != 0) else None))
-            unit: pct
-          CPC Packet Decoding Utilization:
-            avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
-              (CPC_CPC_STAT_BUSY != 0) else None)
-            min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
-              (CPC_CPC_STAT_BUSY != 0) else None)
-            max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
-              (CPC_CPC_STAT_BUSY != 0) else None)
-            unit: pct
-          CPC-Workgroup Manager Utilization:
-            avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
-              != 0) else None)
-            min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
-              != 0) else None)
-            max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
-              != 0) else None)
-            unit: Pct
-          CPC-L2 Utilization:
-            avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
-              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
-            min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
-              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
-            max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
-              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
-            unit: pct
-          CPC-UTCL1 Stall:
-            avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
-              if (CPC_CPC_STAT_BUSY != 0) else None)
-            min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
-              if (CPC_CPC_STAT_BUSY != 0) else None)
-            max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
-              if (CPC_CPC_STAT_BUSY != 0) else None)
-            unit: pct
-          CPC-UTCL2 Utilization:
-            avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
-              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
-            min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
-              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
-            max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
-              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
-            unit: pct
-        gfx942:
-          CPC Utilization:
-            avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
-              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
-            min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
-              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
-            max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
-              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
-            unit: pct
-          CPC Stall Rate:
-            avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
-              != 0) else None))
-            min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
-              != 0) else None))
-            max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
-              != 0) else None))
-            unit: pct
-          CPC Packet Decoding Utilization:
-            avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
-              (CPC_CPC_STAT_BUSY != 0) else None)
-            min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
-              (CPC_CPC_STAT_BUSY != 0) else None)
-            max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
-              (CPC_CPC_STAT_BUSY != 0) else None)
-            unit: pct
-          CPC-Workgroup Manager Utilization:
-            avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
-              != 0) else None)
-            min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
-              != 0) else None)
-            max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
-              != 0) else None)
-            unit: Pct
-          CPC-L2 Utilization:
-            avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
-              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
-            min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
-              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
-            max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
-              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
-            unit: pct
-          CPC-UTCL1 Stall:
-            avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
-              if (CPC_CPC_STAT_BUSY != 0) else None)
-            min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
-              if (CPC_CPC_STAT_BUSY != 0) else None)
-            max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
-              if (CPC_CPC_STAT_BUSY != 0) else None)
-            unit: pct
-          CPC-UTCL2 Utilization:
-            avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
-              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
-            min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
-              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
-            max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
-              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
-            unit: pct
-        gfx950:
-          CPC SYNC FIFO Full Rate:
-            avg: AVG((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY
-              != 0) else None)
-            min: MIN((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY
-              != 0) else None)
-            max: MAX((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY
-              != 0) else None)
-            unit: pct
-          CPC CANE Stall Rate:
-            avg: AVG((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0)
-              else None)
-            min: MIN((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0)
-              else None)
-            max: MAX((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0)
-              else None)
-            unit: pct
-          CPC ADC Utilization:
-            avg: AVG((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else
-              None)
-            min: MIN((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else
-              None)
-            max: MAX((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else
-              None)
-            unit: pct
-          CPC Utilization:
-            avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
-              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
-            min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
-              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
-            max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
-              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
-            unit: pct
-          CPC Stall Rate:
-            avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
-              != 0) else None))
-            min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
-              != 0) else None))
-            max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
-              != 0) else None))
-            unit: pct
-          CPC Packet Decoding Utilization:
-            avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
-              (CPC_CPC_STAT_BUSY != 0) else None)
-            min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
-              (CPC_CPC_STAT_BUSY != 0) else None)
-            max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
-              (CPC_CPC_STAT_BUSY != 0) else None)
-            unit: pct
-          CPC-Workgroup Manager Utilization:
-            avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
-              != 0) else None)
-            min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
-              != 0) else None)
-            max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
-              != 0) else None)
-            unit: Pct
-          CPC-L2 Utilization:
-            avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
-              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
-            min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
-              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
-            max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
-              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
-            unit: pct
-          CPC-UTCL1 Stall:
-            avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
-              if (CPC_CPC_STAT_BUSY != 0) else None)
-            min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
-              if (CPC_CPC_STAT_BUSY != 0) else None)
-            max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
-              if (CPC_CPC_STAT_BUSY != 0) else None)
-            unit: pct
-          CPC-UTCL2 Utilization:
-            avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
-              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
-            min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
-              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
-            max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
-              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
-            unit: pct
-        gfx908:
-          CPC Utilization:
-            avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
-              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
-            min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
-              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
-            max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE))
-              if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None))
-            unit: pct
-          CPC Stall Rate:
-            avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
-              != 0) else None))
-            min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
-              != 0) else None))
-            max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY
-              != 0) else None))
-            unit: pct
-          CPC Packet Decoding Utilization:
-            avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
-              (CPC_CPC_STAT_BUSY != 0) else None)
-            min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
-              (CPC_CPC_STAT_BUSY != 0) else None)
-            max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if
-              (CPC_CPC_STAT_BUSY != 0) else None)
-            unit: pct
-          CPC-Workgroup Manager Utilization:
-            avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
-              != 0) else None)
-            min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
-              != 0) else None)
-            max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY
-              != 0) else None)
-            unit: Pct
-          CPC-L2 Utilization:
-            avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
-              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
-            min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
-              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
-            max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE))
-              if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None))
-            unit: pct
-          CPC-UTCL1 Stall:
-            avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
-              if (CPC_CPC_STAT_BUSY != 0) else None)
-            min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
-              if (CPC_CPC_STAT_BUSY != 0) else None)
-            max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY)
-              if (CPC_CPC_STAT_BUSY != 0) else None)
-            unit: pct
-          CPC-UTCL2 Utilization:
-            avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
-              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
-            min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
-              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
-            max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE))
-              if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None))
-            unit: pct
-  metrics_description:
-    CPF Utilization:
-      plain: Percent of total cycles where the CPF was busy actively doing any work.
-        The ratio of CPF busy cycles over total cycles counted by the CPF.
-      rst: Percent of total cycles where the CPF was busy actively doing any work.
-        The ratio of CPF busy cycles over total cycles counted by the CPF.
-      unit: Percent
-    CPF Stall:
-      plain: Percent of CPF busy cycles where the CPF was stalled for any reason.
-      rst: Percent of CPF busy cycles where the CPF was stalled for any reason.
-      unit: Percent
-    CPF-L2 Utilization:
-      plain: Percent of total cycles counted by the CPF-L2 interface where the CPF-L2
-        interface was active doing any work. The ratio of CPF-L2 busy cycles over
-        total cycles counted by the CPF-L2.
-      rst: Percent of total cycles counted by the CPF-:doc:`L2 <l2-cache>` interface where
-        the CPF-L2 interface was active doing any work. The ratio of CPF-L2 busy
-        cycles over total cycles counted by the CPF-L2.
-      unit: Percent
-    CPF-L2 Stall:
-      plain: Percent of CPF-L2 L2 busy cycles where the CPF-L2 interface was stalled
-        for any reason.
-      rst: Percent of CPF-:doc:`L2 <l2-cache>` L2 busy cycles where the CPF-L2 interface
-        was stalled for any reason.
-      unit: Percent
-    CPF-UTCL1 Stall:
-      plain: Percent of CPF busy cycles where the CPF was stalled by address translation.
-      rst: Percent of CPF busy cycles where the CPF was stalled by address translation.
-      unit: Percent
-    CPC Utilization:
-      plain: Percent of total cycles where the CPC was busy actively doing any work.
-        The ratio of CPC busy cycles over total cycles counted by the CPC.
-      rst: Percent of total cycles where the CPC was busy actively doing any work.
-        The ratio of CPC busy cycles over total cycles counted by the CPC.
-      unit: Percent
-    CPC Stall Rate:
-      plain: Percent of CPC busy cycles where the CPC was stalled for any reason.
-      rst: Percent of CPC busy cycles where the CPC was stalled for any reason.
-      unit: Percent
-    CPC Packet Decoding Utilization:
-      plain: Percent of CPC busy cycles spent decoding commands for processing.
-      rst: Percent of CPC busy cycles spent decoding commands for processing.
-      unit: Percent
-    CPC-Workgroup Manager Utilization:
-      plain: Percent of CPC busy cycles spent dispatching workgroups to the workgroup
-        manager.
-      rst: Percent of CPC busy cycles spent dispatching workgroups to the :ref:`workgroup
-        manager <desc-spi>`.
-      unit: Percent
-    CPC-L2 Utilization:
-      plain: Percent of total cycles counted by the CPC-L2 interface where the CPC-L2
-        interface was active doing any work.
-      rst: Percent of total cycles counted by the CPC-:doc:`L2 <l2-cache>` interface where
-        the CPC-L2 interface was active doing any work.
-      unit: Percent
-    CPC-UTCL1 Stall:
-      plain: Percent of CPC busy cycles where the CPC was stalled by address translation
-      rst: Percent of CPC busy cycles where the CPC was stalled by address translation
-      unit: Percent
-    CPC-UTCL2 Utilization:
-      plain: |-
-        Percent of total cycles counted by the CPC's L2 address translation
-        interface where the CPC was busy doing address translation work.
-      rst: Percent of total cycles counted by the CPC's :doc:`L2 <l2-cache>` address translation
-        interface where the CPC was busy doing address translation work.
-      unit: Percent
-- id: 600
-  title: Workgroup Manager (SPI)
-  data source:
-  - metric_table:
-      id: 601
-      title: Workgroup manager utilizations
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      metric:
-        gfx90a:
-          Accelerator Utilization:
-            avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
-            min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
-            max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
-            unit: Pct
-          Scheduler-Pipe Utilization:
-            avg: AVG(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
-              * $se_per_gpu))
-            min: MIN(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
-              * $se_per_gpu))
-            max: MAX(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
-              * $se_per_gpu))
-            unit: Pct
-          Workgroup Manager Utilization:
-            avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
-            min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
-            max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
-            unit: Pct
-          Shader Engine Utilization:
-            avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
-            min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
-            max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
-            unit: Pct
-          SIMD Utilization:
-            avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            unit: Pct
-          Dispatched Workgroups:
-            avg: AVG(SPI_CSN_NUM_THREADGROUPS)
-            min: MIN(SPI_CSN_NUM_THREADGROUPS)
-            max: MAX(SPI_CSN_NUM_THREADGROUPS)
-            unit: Workgroups
-          Dispatched Wavefronts:
-            avg: AVG(SPI_CSN_WAVE)
-            min: MIN(SPI_CSN_WAVE)
-            max: MAX(SPI_CSN_WAVE)
-            unit: Wavefronts
-          VGPR Writes:
-            avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
-              else None))
-            min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
-              else None))
-            max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
-              else None))
-            unit: Cycles/wave
-          SGPR Writes:
-            avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
-              else None))
-            min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
-              else None))
-            max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
-              else None))
-            unit: Cycles/wave
-        gfx941:
-          Accelerator Utilization:
-            avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
-            min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
-            max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
-            unit: Pct
-          Scheduler-Pipe Utilization:
-            avg: AVG(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
-              * $se_per_gpu))
-            min: MIN(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
-              * $se_per_gpu))
-            max: MAX(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
-              * $se_per_gpu))
-            unit: Pct
-          Workgroup Manager Utilization:
-            avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
-            min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
-            max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
-            unit: Pct
-          Shader Engine Utilization:
-            avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
-            min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
-            max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
-            unit: Pct
-          SIMD Utilization:
-            avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            unit: Pct
-          Dispatched Workgroups:
-            avg: AVG(SPI_CSN_NUM_THREADGROUPS)
-            min: MIN(SPI_CSN_NUM_THREADGROUPS)
-            max: MAX(SPI_CSN_NUM_THREADGROUPS)
-            unit: Workgroups
-          Dispatched Wavefronts:
-            avg: AVG(SPI_CSN_WAVE)
-            min: MIN(SPI_CSN_WAVE)
-            max: MAX(SPI_CSN_WAVE)
-            unit: Wavefronts
-          VGPR Writes:
-            avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
-              else None))
-            min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
-              else None))
-            max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
-              else None))
-            unit: Cycles/wave
-          SGPR Writes:
-            avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
-              else None))
-            min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
-              else None))
-            max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
-              else None))
-            unit: Cycles/wave
-        gfx940:
-          Accelerator Utilization:
-            avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
-            min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
-            max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
-            unit: Pct
-          Scheduler-Pipe Utilization:
-            avg: AVG(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
-              * $se_per_gpu))
-            min: MIN(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
-              * $se_per_gpu))
-            max: MAX(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
-              * $se_per_gpu))
-            unit: Pct
-          Workgroup Manager Utilization:
-            avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
-            min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
-            max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
-            unit: Pct
-          Shader Engine Utilization:
-            avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
-            min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
-            max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
-            unit: Pct
-          SIMD Utilization:
-            avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            unit: Pct
-          Dispatched Workgroups:
-            avg: AVG(SPI_CSN_NUM_THREADGROUPS)
-            min: MIN(SPI_CSN_NUM_THREADGROUPS)
-            max: MAX(SPI_CSN_NUM_THREADGROUPS)
-            unit: Workgroups
-          Dispatched Wavefronts:
-            avg: AVG(SPI_CSN_WAVE)
-            min: MIN(SPI_CSN_WAVE)
-            max: MAX(SPI_CSN_WAVE)
-            unit: Wavefronts
-          VGPR Writes:
-            avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
-              else None))
-            min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
-              else None))
-            max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
-              else None))
-            unit: Cycles/wave
-          SGPR Writes:
-            avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
-              else None))
-            min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
-              else None))
-            max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
-              else None))
-            unit: Cycles/wave
-        gfx942:
-          Accelerator Utilization:
-            avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
-            min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
-            max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
-            unit: Pct
-          Scheduler-Pipe Utilization:
-            avg: AVG(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
-              * $se_per_gpu))
-            min: MIN(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
-              * $se_per_gpu))
-            max: MAX(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
-              * $se_per_gpu))
-            unit: Pct
-          Workgroup Manager Utilization:
-            avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
-            min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
-            max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
-            unit: Pct
-          Shader Engine Utilization:
-            avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
-            min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
-            max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
-            unit: Pct
-          SIMD Utilization:
-            avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            unit: Pct
-          Dispatched Workgroups:
-            avg: AVG(SPI_CSN_NUM_THREADGROUPS)
-            min: MIN(SPI_CSN_NUM_THREADGROUPS)
-            max: MAX(SPI_CSN_NUM_THREADGROUPS)
-            unit: Workgroups
-          Dispatched Wavefronts:
-            avg: AVG(SPI_CSN_WAVE)
-            min: MIN(SPI_CSN_WAVE)
-            max: MAX(SPI_CSN_WAVE)
-            unit: Wavefronts
-          VGPR Writes:
-            avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
-              else None))
-            min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
-              else None))
-            max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
-              else None))
-            unit: Cycles/wave
-          SGPR Writes:
-            avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
-              else None))
-            min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
-              else None))
-            max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
-              else None))
-            unit: Cycles/wave
-        gfx950:
-          Schedule-Pipe Wave Occupancy:
-            avg: AVG(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY
-              + SPI_CSQ_P3_OCCUPANCY)
-            min: MIN(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY
-              + SPI_CSQ_P3_OCCUPANCY)
-            max: MAX(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY
-              + SPI_CSQ_P3_OCCUPANCY)
-            unit: Wave
-          Accelerator Utilization:
-            avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
-            min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
-            max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
-            unit: Pct
-          Scheduler-Pipe Utilization:
-            avg: AVG(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY)
-              / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
-            min: MIN(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY)
-              / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
-            max: MAX(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY)
-              / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu))
-            unit: Pct
-          Scheduler-Pipe Wave Utilization:
-            avg: AVG(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
-              * $se_per_gpu))
-            min: MIN(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
-              * $se_per_gpu))
-            max: MAX(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
-              * $se_per_gpu))
-            unit: Pct
-          Workgroup Manager Utilization:
-            avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
-            min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
-            max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
-            unit: Pct
-          Shader Engine Utilization:
-            avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
-            min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
-            max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
-            unit: Pct
-          SIMD Utilization:
-            avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            unit: Pct
-          Dispatched Workgroups:
-            avg: AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS
-              + SPI_CS3_NUM_THREADGROUPS)
-            min: MIN(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS
-              + SPI_CS3_NUM_THREADGROUPS)
-            max: MAX(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS
-              + SPI_CS3_NUM_THREADGROUPS)
-            unit: Workgroups
-          Dispatched Wavefronts:
-            avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
-            min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
-            max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
-            unit: Wavefronts
-          VGPR Writes:
-            avg: AVG((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE
-              + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE
-              + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
-            min: MIN((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE
-              + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE
-              + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
-            max: MAX((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE
-              + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE
-              + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None))
-            unit: Cycles/wave
-          SGPR Writes:
-            avg: AVG((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE
-              + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
-              != 0) else None))
-            min: MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE
-              + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
-              != 0) else None))
-            max: MAX((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE
-              + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
-              != 0) else None))
-            unit: Cycles/wave
-        gfx908:
-          Accelerator Utilization:
-            avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
-            min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
-            max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD)
-            unit: Pct
-          Scheduler-Pipe Utilization:
-            avg: AVG(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
-              * $se_per_gpu))
-            min: MIN(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
-              * $se_per_gpu))
-            max: MAX(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu
-              * $se_per_gpu))
-            unit: Pct
-          Workgroup Manager Utilization:
-            avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
-            min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
-            max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD)
-            unit: Pct
-          Shader Engine Utilization:
-            avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
-            min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
-            max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu))
-            unit: Pct
-          SIMD Utilization:
-            avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            unit: Pct
-          Dispatched Workgroups:
-            avg: AVG(SPI_CSN_NUM_THREADGROUPS)
-            min: MIN(SPI_CSN_NUM_THREADGROUPS)
-            max: MAX(SPI_CSN_NUM_THREADGROUPS)
-            unit: Workgroups
-          Dispatched Wavefronts:
-            avg: AVG(SPI_CSN_WAVE)
-            min: MIN(SPI_CSN_WAVE)
-            max: MAX(SPI_CSN_WAVE)
-            unit: Wavefronts
-          VGPR Writes:
-            avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
-              else None))
-            min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
-              else None))
-            max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
-              else None))
-            unit: Cycles/wave
-          SGPR Writes:
-            avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
-              else None))
-            min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
-              else None))
-            max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0)
-              else None))
-            unit: Cycles/wave
-  - metric_table:
-      id: 602
-      title: Workgroup Manager - Resource Allocation
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      metric:
-        gfx90a:
-          Not-scheduled Rate (Workgroup Manager):
-            avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            unit: Pct
-          Not-scheduled Rate (Scheduler-Pipe):
-            avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            unit: Pct
-          Scheduler-Pipe Stall Rate:
-            avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
-            min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
-            max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
-            unit: Pct
-          Scratch Stall Rate:
-            avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            unit: Pct
-          Insufficient SIMD Waveslots:
-            avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            unit: Pct
-          Insufficient SIMD VGPRs:
-            avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            unit: Pct
-          Insufficient SIMD SGPRs:
-            avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            unit: Pct
-          Insufficient CU LDS:
-            avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            unit: Pct
-          Insufficient CU Barriers:
-            avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            unit: Pct
-          Reached CU Workgroup Limit:
-            avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            unit: Pct
-          Reached CU Wavefront Limit:
-            avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            unit: Pct
-        gfx941:
-          Not-scheduled Rate (Workgroup Manager):
-            avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            unit: Pct
-          Not-scheduled Rate (Scheduler-Pipe):
-            avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            unit: Pct
-          Scheduler-Pipe Stall Rate:
-            avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
-            min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
-            max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
-            unit: Pct
-          Scratch Stall Rate:
-            avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            unit: Pct
-          Insufficient SIMD Waveslots:
-            avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            unit: Pct
-          Insufficient SIMD VGPRs:
-            avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            unit: Pct
-          Insufficient SIMD SGPRs:
-            avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            unit: Pct
-          Insufficient CU LDS:
-            avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            unit: Pct
-          Insufficient CU Barriers:
-            avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            unit: Pct
-          Reached CU Workgroup Limit:
-            avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            unit: Pct
-          Reached CU Wavefront Limit:
-            avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            unit: Pct
-        gfx940:
-          Not-scheduled Rate (Workgroup Manager):
-            avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            unit: Pct
-          Not-scheduled Rate (Scheduler-Pipe):
-            avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            unit: Pct
-          Scheduler-Pipe Stall Rate:
-            avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
-            min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
-            max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
-            unit: Pct
-          Scratch Stall Rate:
-            avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            unit: Pct
-          Insufficient SIMD Waveslots:
-            avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            unit: Pct
-          Insufficient SIMD VGPRs:
-            avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            unit: Pct
-          Insufficient SIMD SGPRs:
-            avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            unit: Pct
-          Insufficient CU LDS:
-            avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            unit: Pct
-          Insufficient CU Barriers:
-            avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            unit: Pct
-          Reached CU Workgroup Limit:
-            avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            unit: Pct
-          Reached CU Wavefront Limit:
-            avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            unit: Pct
-        gfx942:
-          Not-scheduled Rate (Workgroup Manager):
-            avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            unit: Pct
-          Not-scheduled Rate (Scheduler-Pipe):
-            avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            unit: Pct
-          Scheduler-Pipe Stall Rate:
-            avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
-            min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
-            max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
-            unit: Pct
-          Scratch Stall Rate:
-            avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            unit: Pct
-          Insufficient SIMD Waveslots:
-            avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            unit: Pct
-          Insufficient SIMD VGPRs:
-            avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            unit: Pct
-          Insufficient SIMD SGPRs:
-            avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            unit: Pct
-          Insufficient CU LDS:
-            avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            unit: Pct
-          Insufficient CU Barriers:
-            avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            unit: Pct
-          Reached CU Workgroup Limit:
-            avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            unit: Pct
-          Reached CU Wavefront Limit:
-            avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            unit: Pct
-        gfx950:
-          Not-scheduled Rate (Workgroup Manager):
-            avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            unit: Pct
-          Not-scheduled Rate (Scheduler-Pipe):
-            avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            unit: Pct
-          Scheduler-Pipe FIFO Full Rate:
-            avg: AVG((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + SPI_CS2_CRAWLER_STALL
-              + SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if
-              ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            min: MIN((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + SPI_CS2_CRAWLER_STALL
-              + SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if
-              ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            max: MAX((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + SPI_CS2_CRAWLER_STALL
-              + SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if
-              ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            unit: Pct
-          Scheduler-Pipe Stall Rate:
-            avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
-            min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
-            max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
-            unit: Pct
-          Scratch Stall Rate:
-            avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            unit: Pct
-          Insufficient SIMD Waveslots:
-            avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            unit: Pct
-          Insufficient SIMD VGPRs:
-            avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            unit: Pct
-          Insufficient SIMD SGPRs:
-            avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            unit: Pct
-          Insufficient CU LDS:
-            avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            unit: Pct
-          Insufficient CU Barriers:
-            avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            unit: Pct
-          Reached CU Workgroup Limit:
-            avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            unit: Pct
-          Reached CU Wavefront Limit:
-            avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            unit: Pct
-        gfx908:
-          Not-scheduled Rate (Workgroup Manager):
-            avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            unit: Pct
-          Not-scheduled Rate (Scheduler-Pipe):
-            avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            unit: Pct
-          Scheduler-Pipe Stall Rate:
-            avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
-            min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
-            max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None))
-            unit: Pct
-          Scratch Stall Rate:
-            avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu))
-              if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)
-            unit: Pct
-          Insufficient SIMD Waveslots:
-            avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            unit: Pct
-          Insufficient SIMD VGPRs:
-            avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            unit: Pct
-          Insufficient SIMD SGPRs:
-            avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            unit: Pct
-          Insufficient CU LDS:
-            avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            unit: Pct
-          Insufficient CU Barriers:
-            avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            unit: Pct
-          Reached CU Workgroup Limit:
-            avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD *
-              $cu_per_gpu))
-            unit: Pct
-          Reached CU Wavefront Limit:
-            avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
-            unit: Pct
-  metrics_description:
-    Accelerator Utilization:
-      plain: The percent of cycles in the kernel where the accelerator was actively
-        doing any work.
-      rst: The percent of cycles in the kernel where the accelerator was actively
-        doing any work.
-      unit: Percent
-    Scheduler-Pipe Utilization:
-      plain: The percent of total scheduler-pipe cycles in the kernel where the scheduler-pipes
-        were actively doing any work.
-      rst: |-
-        The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
-        in the kernel where the scheduler-pipes were actively doing any work. Note: this
-        value is expected to range between 0% and 25%. See :ref:`desc-spi`.
-      unit: Percent
-    Workgroup Manager Utilization:
-      plain: The percent of cycles in the kernel where the workgroup manager was actively
-        doing any work.
-      rst: The percent of cycles in the kernel where the workgroup manager was actively
-        doing any work.
-      unit: Percent
-    Shader Engine Utilization:
-      plain: The percent of total shader engine cycles in the kernel where any CU
-        in a shader-engine was actively doing any work, normalized over all shader-engines.
-        Low values (e.g., << 100%) indicate that the accelerator was not fully saturated
-        by the kernel, or a potential load-imbalance issue.
-      rst: The percent of :ref:`total shader engine cycles <total-se-cycles>` in the kernel
-        where any CU in a shader-engine was actively doing any work, normalized over
-        all shader-engines. Low values (e.g., << 100%) indicate that the accelerator
-        was not fully saturated by the kernel, or a potential load-imbalance issue.
-      unit: Percent
-    SIMD Utilization:
-      plain: The percent of total SIMD cycles in the kernel where any SIMD on a CU
-        was actively doing any work, summed over all CUs. Low values (less than 100%)
-        indicate that the accelerator was not fully saturated by the kernel, or a
-        potential load-imbalance issue.
-      rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel where
-        any :ref:`SIMD <desc-valu>` on a CU was actively doing any work, summed over
-        all CUs. Low values (less than 100%) indicate that the accelerator was not
-        fully saturated by the kernel, or a potential load-imbalance issue.
-      unit: Percent
-    Dispatched Workgroups:
-      plain: The total number of workgroups forming this kernel launch.
-      rst: The total number of workgroups forming this kernel launch.
-      unit: Workgroups
-    Dispatched Wavefronts:
-      plain: The total number of wavefronts, summed over all workgroups, forming this
-        kernel launch.
-      rst: The total number of wavefronts, summed over all workgroups, forming this
-        kernel launch.
-      unit: Wavefronts
-    VGPR Writes:
-      plain: The average number of cycles spent initializing VGPRs at wave creation.
-      rst: The average number of cycles spent initializing :ref:`VGPRs <desc-valu>` at
-        wave creation.
-      unit: Cycles/wave
-    SGPR Writes:
-      plain: The average number of cycles spent initializing SGPRs at wave creation.
-      rst: The average number of cycles spent initializing :ref:`SGPRs <desc-salu>` at
-        wave creation.
-      unit: Cycles/wave
-    Not-scheduled Rate (Workgroup Manager):
-      plain: The percent of total scheduler-pipe cycles in the kernel where a workgroup
-        could not be scheduled to a CU due to a bottleneck within the workgroup manager
-        rather than a lack of a CU or SIMD with sufficient resources.
-      rst: |-
-        The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
-        in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
-        due to a bottleneck within the workgroup manager rather than a lack of a
-        CU or :ref:`SIMD <desc-valu>` with sufficient resources. Note: this value
-        is expected to range between 0-25%. See note in :ref:`workgroup manager <desc-spi>`
-        description.
-      unit: Percent
-    Not-scheduled Rate (Scheduler-Pipe):
-      plain: |-
-        The percent of total scheduler-pipe cycles in the kernel where a workgroup
-        could not be scheduled to a CU due to a bottleneck within the scheduler-pipes
-        rather than a lack of a CU or SIMD with sufficient resources.
-      rst: |-
-        The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
-        in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
-        due to a bottleneck within the scheduler-pipes rather than a lack of a CU
-        or :ref:`SIMD <desc-valu>` with sufficient resources. Note: this value is
-        expected to range between 0-25%, see note in :ref:`workgroup manager <desc-spi>`
-        description.
-      unit: Percent
-    Scheduler-Pipe Stall Rate:
-      plain: The percent of total scheduler-pipe cycles in the kernel where a workgroup
-        could not be scheduled to a CU due to occupancy limitations (like a lack of
-        a CU or SIMD with sufficient resources).
-      rst: |-
-        The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
-        in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
-        due to occupancy limitations (like a lack of a CU or :ref:`SIMD <desc-valu>`
-        with sufficient resources). Note: this value is expected to range between
-        0-25%, see note in :ref:`workgroup manager <desc-spi>` description.
-      unit: Percent
-    Scratch Stall Rate:
-      plain: The percent of total shader-engine cycles in the kernel where a workgroup
-        could not be scheduled to a CU due to lack of private (a.k.a., scratch) memory
-        slots. While this can reach up to 100%, note that the actual occupancy limitations
-        on a kernel using private memory are typically quite small (for example, less
-        than 1% of the total number of waves that can be scheduled to an accelerator).
-      rst: The percent of :ref:`total shader-engine cycles <total-se-cycles>` in the kernel
-        where a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due
-        to lack of :ref:`private (a.k.a., scratch) memory <memory-type>` slots. While
-        this can reach up to 100%, note that the actual occupancy limitations on
-        a kernel using private memory are typically quite small (for example, less than
-        1% of the total number of waves that can be scheduled to an accelerator).
-      unit: Percent
-    Insufficient SIMD Waveslots:
-      plain: The percent of total SIMD cycles in the kernel where a workgroup could
-        not be scheduled to a SIMD due to lack of available waveslots.
-      rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel where
-        a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to lack
-        of available :ref:`waveslots <desc-valu>`.
-      unit: Percent
-    Insufficient SIMD VGPRs:
-      plain: The percent of total SIMD cycles in the kernel where a workgroup could
-        not be scheduled to a SIMD due to lack of available VGPRs.
-      rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel where
-        a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to lack
-        of available :ref:`VGPRs <desc-valu>`.
-      unit: Percent
-    Insufficient SIMD SGPRs:
-      plain: The percent of total SIMD cycles in the kernel where a workgroup could
-        not be scheduled to a SIMD due to lack of available SGPRs.
-      rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel where
-        a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to lack
-        of available :ref:`SGPRs <desc-salu>`.
-      unit: Percent
-    Insufficient CU LDS:
-      plain: The percent of total CU cycles in the kernel where a workgroup could
-        not be scheduled to a CU due to lack of available LDS.
-      rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
-        a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to lack
-        of available :doc:`LDS <local-data-share>`.
-      unit: Percent
-    Insufficient CU Barriers:
-      plain: The percent of total CU cycles in the kernel where a workgroup could
-        not be scheduled to a CU due to lack of available barriers.
-      rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
-        a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to lack
-        of available :ref:`barriers <desc-barrier>`.
-      unit: Percent
-    Reached CU Workgroup Limit:
-      plain: The percent of total CU cycles in the kernel where a workgroup could
-        not be scheduled to a CU due to limits within the workgroup manager. This
-        is expected to be always be zero on CDNA2 or newer accelerators (and small
-        for previous accelerators).
-      rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
-        a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to limits
-        within the workgroup manager. This is expected to be always be zero on CDNA2
-        or newer accelerators (and small for previous accelerators).
-      unit: Percent
-    Reached CU Wavefront Limit:
-      plain: The percent of total CU cycles in the kernel where a wavefront could
-        not be scheduled to a CU due to limits within the workgroup manager. This
-        is expected to be always be zero on CDNA2 or newer accelerators (and small
-        for previous accelerators).
-      rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where
-        a wavefront could not be scheduled to a :doc:`CU <compute-unit>` due to limits
-        within the workgroup manager. This is expected to be always be zero on CDNA2
-        or newer accelerators (and small for previous accelerators).
-      unit: Percent
-- id: 700
-  title: Wavefront
-  data source:
-  - metric_table:
-      id: 701
-      title: Wavefront Launch Stats
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      metric:
-        gfx90a:
-          Grid Size:
-            avg: AVG(Grid_Size)
-            min: MIN(Grid_Size)
-            max: MAX(Grid_Size)
-            unit: Work Items
-          Workgroup Size:
-            avg: AVG(Workgroup_Size)
-            min: MIN(Workgroup_Size)
-            max: MAX(Workgroup_Size)
-            unit: Work Items
-          Total Wavefronts:
-            avg: AVG(SPI_CSN_WAVE)
-            min: MIN(SPI_CSN_WAVE)
-            max: MAX(SPI_CSN_WAVE)
-            unit: Wavefronts
-          Saved Wavefronts:
-            avg: AVG(SQ_WAVES_SAVED)
-            min: MIN(SQ_WAVES_SAVED)
-            max: MAX(SQ_WAVES_SAVED)
-            unit: Wavefronts
-          Restored Wavefronts:
-            avg: AVG(SQ_WAVES_RESTORED)
-            min: MIN(SQ_WAVES_RESTORED)
-            max: MAX(SQ_WAVES_RESTORED)
-            unit: Wavefronts
-          VGPRs:
-            avg: AVG(Arch_VGPR)
-            min: MIN(Arch_VGPR)
-            max: MAX(Arch_VGPR)
-            unit: Registers
-          AGPRs:
-            avg: AVG(Accum_VGPR)
-            min: MIN(Accum_VGPR)
-            max: MAX(Accum_VGPR)
-            unit: Registers
-          SGPRs:
-            avg: AVG(SGPR)
-            min: MIN(SGPR)
-            max: MAX(SGPR)
-            unit: Registers
-          LDS Allocation:
-            avg: AVG(LDS_Per_Workgroup)
-            min: MIN(LDS_Per_Workgroup)
-            max: MAX(LDS_Per_Workgroup)
-            unit: Bytes
-          Scratch Allocation:
-            avg: AVG(Scratch_Per_Workitem)
-            min: MIN(Scratch_Per_Workitem)
-            max: MAX(Scratch_Per_Workitem)
-            unit: Bytes/Workitem
-        gfx941:
-          Grid Size:
-            avg: AVG(Grid_Size)
-            min: MIN(Grid_Size)
-            max: MAX(Grid_Size)
-            unit: Work Items
-          Workgroup Size:
-            avg: AVG(Workgroup_Size)
-            min: MIN(Workgroup_Size)
-            max: MAX(Workgroup_Size)
-            unit: Work Items
-          Total Wavefronts:
-            avg: AVG(SPI_CSN_WAVE)
-            min: MIN(SPI_CSN_WAVE)
-            max: MAX(SPI_CSN_WAVE)
-            unit: Wavefronts
-          Saved Wavefronts:
-            avg: AVG(SQ_WAVES_SAVED)
-            min: MIN(SQ_WAVES_SAVED)
-            max: MAX(SQ_WAVES_SAVED)
-            unit: Wavefronts
-          Restored Wavefronts:
-            avg: AVG(SQ_WAVES_RESTORED)
-            min: MIN(SQ_WAVES_RESTORED)
-            max: MAX(SQ_WAVES_RESTORED)
-            unit: Wavefronts
-          VGPRs:
-            avg: AVG(Arch_VGPR)
-            min: MIN(Arch_VGPR)
-            max: MAX(Arch_VGPR)
-            unit: Registers
-          AGPRs:
-            avg: AVG(Accum_VGPR)
-            min: MIN(Accum_VGPR)
-            max: MAX(Accum_VGPR)
-            unit: Registers
-          SGPRs:
-            avg: AVG(SGPR)
-            min: MIN(SGPR)
-            max: MAX(SGPR)
-            unit: Registers
-          LDS Allocation:
-            avg: AVG(LDS_Per_Workgroup)
-            min: MIN(LDS_Per_Workgroup)
-            max: MAX(LDS_Per_Workgroup)
-            unit: Bytes
-          Scratch Allocation:
-            avg: AVG(Scratch_Per_Workitem)
-            min: MIN(Scratch_Per_Workitem)
-            max: MAX(Scratch_Per_Workitem)
-            unit: Bytes/Workitem
-        gfx940:
-          Grid Size:
-            avg: AVG(Grid_Size)
-            min: MIN(Grid_Size)
-            max: MAX(Grid_Size)
-            unit: Work Items
-          Workgroup Size:
-            avg: AVG(Workgroup_Size)
-            min: MIN(Workgroup_Size)
-            max: MAX(Workgroup_Size)
-            unit: Work Items
-          Total Wavefronts:
-            avg: AVG(SPI_CSN_WAVE)
-            min: MIN(SPI_CSN_WAVE)
-            max: MAX(SPI_CSN_WAVE)
-            unit: Wavefronts
-          Saved Wavefronts:
-            avg: AVG(SQ_WAVES_SAVED)
-            min: MIN(SQ_WAVES_SAVED)
-            max: MAX(SQ_WAVES_SAVED)
-            unit: Wavefronts
-          Restored Wavefronts:
-            avg: AVG(SQ_WAVES_RESTORED)
-            min: MIN(SQ_WAVES_RESTORED)
-            max: MAX(SQ_WAVES_RESTORED)
-            unit: Wavefronts
-          VGPRs:
-            avg: AVG(Arch_VGPR)
-            min: MIN(Arch_VGPR)
-            max: MAX(Arch_VGPR)
-            unit: Registers
-          AGPRs:
-            avg: AVG(Accum_VGPR)
-            min: MIN(Accum_VGPR)
-            max: MAX(Accum_VGPR)
-            unit: Registers
-          SGPRs:
-            avg: AVG(SGPR)
-            min: MIN(SGPR)
-            max: MAX(SGPR)
-            unit: Registers
-          LDS Allocation:
-            avg: AVG(LDS_Per_Workgroup)
-            min: MIN(LDS_Per_Workgroup)
-            max: MAX(LDS_Per_Workgroup)
-            unit: Bytes
-          Scratch Allocation:
-            avg: AVG(Scratch_Per_Workitem)
-            min: MIN(Scratch_Per_Workitem)
-            max: MAX(Scratch_Per_Workitem)
-            unit: Bytes/Workitem
-        gfx942:
-          Grid Size:
-            avg: AVG(Grid_Size)
-            min: MIN(Grid_Size)
-            max: MAX(Grid_Size)
-            unit: Work Items
-          Workgroup Size:
-            avg: AVG(Workgroup_Size)
-            min: MIN(Workgroup_Size)
-            max: MAX(Workgroup_Size)
-            unit: Work Items
-          Total Wavefronts:
-            avg: AVG(SPI_CSN_WAVE)
-            min: MIN(SPI_CSN_WAVE)
-            max: MAX(SPI_CSN_WAVE)
-            unit: Wavefronts
-          Saved Wavefronts:
-            avg: AVG(SQ_WAVES_SAVED)
-            min: MIN(SQ_WAVES_SAVED)
-            max: MAX(SQ_WAVES_SAVED)
-            unit: Wavefronts
-          Restored Wavefronts:
-            avg: AVG(SQ_WAVES_RESTORED)
-            min: MIN(SQ_WAVES_RESTORED)
-            max: MAX(SQ_WAVES_RESTORED)
-            unit: Wavefronts
-          VGPRs:
-            avg: AVG(Arch_VGPR)
-            min: MIN(Arch_VGPR)
-            max: MAX(Arch_VGPR)
-            unit: Registers
-          AGPRs:
-            avg: AVG(Accum_VGPR)
-            min: MIN(Accum_VGPR)
-            max: MAX(Accum_VGPR)
-            unit: Registers
-          SGPRs:
-            avg: AVG(SGPR)
-            min: MIN(SGPR)
-            max: MAX(SGPR)
-            unit: Registers
-          LDS Allocation:
-            avg: AVG(LDS_Per_Workgroup)
-            min: MIN(LDS_Per_Workgroup)
-            max: MAX(LDS_Per_Workgroup)
-            unit: Bytes
-          Scratch Allocation:
-            avg: AVG(Scratch_Per_Workitem)
-            min: MIN(Scratch_Per_Workitem)
-            max: MAX(Scratch_Per_Workitem)
-            unit: Bytes/Workitem
-        gfx950:
-          Grid Size:
-            avg: AVG(Grid_Size)
-            min: MIN(Grid_Size)
-            max: MAX(Grid_Size)
-            unit: Work Items
-          Workgroup Size:
-            avg: AVG(Workgroup_Size)
-            min: MIN(Workgroup_Size)
-            max: MAX(Workgroup_Size)
-            unit: Work Items
-          Total Wavefronts:
-            avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
-            min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
-            max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)
-            unit: Wavefronts
-          Saved Wavefronts:
-            avg: AVG(SQ_WAVES_SAVED)
-            min: MIN(SQ_WAVES_SAVED)
-            max: MAX(SQ_WAVES_SAVED)
-            unit: Wavefronts
-          Restored Wavefronts:
-            avg: AVG(SQ_WAVES_RESTORED)
-            min: MIN(SQ_WAVES_RESTORED)
-            max: MAX(SQ_WAVES_RESTORED)
-            unit: Wavefronts
-          VGPRs:
-            avg: AVG(Arch_VGPR)
-            min: MIN(Arch_VGPR)
-            max: MAX(Arch_VGPR)
-            unit: Registers
-          AGPRs:
-            avg: AVG(Accum_VGPR)
-            min: MIN(Accum_VGPR)
-            max: MAX(Accum_VGPR)
-            unit: Registers
-          SGPRs:
-            avg: AVG(SGPR)
-            min: MIN(SGPR)
-            max: MAX(SGPR)
-            unit: Registers
-          LDS Allocation:
-            avg: AVG(LDS_Per_Workgroup)
-            min: MIN(LDS_Per_Workgroup)
-            max: MAX(LDS_Per_Workgroup)
-            unit: Bytes
-          Scratch Allocation:
-            avg: AVG(Scratch_Per_Workitem)
-            min: MIN(Scratch_Per_Workitem)
-            max: MAX(Scratch_Per_Workitem)
-            unit: Bytes/Workitem
-        gfx908:
-          Grid Size:
-            avg: AVG(Grid_Size)
-            min: MIN(Grid_Size)
-            max: MAX(Grid_Size)
-            unit: Work Items
-          Workgroup Size:
-            avg: AVG(Workgroup_Size)
-            min: MIN(Workgroup_Size)
-            max: MAX(Workgroup_Size)
-            unit: Work Items
-          Total Wavefronts:
-            avg: AVG(SPI_CSN_WAVE)
-            min: MIN(SPI_CSN_WAVE)
-            max: MAX(SPI_CSN_WAVE)
-            unit: Wavefronts
-          Saved Wavefronts:
-            avg: AVG(SQ_WAVES_SAVED)
-            min: MIN(SQ_WAVES_SAVED)
-            max: MAX(SQ_WAVES_SAVED)
-            unit: Wavefronts
-          Restored Wavefronts:
-            avg: AVG(SQ_WAVES_RESTORED)
-            min: MIN(SQ_WAVES_RESTORED)
-            max: MAX(SQ_WAVES_RESTORED)
-            unit: Wavefronts
-          VGPRs:
-            avg: AVG(Arch_VGPR)
-            min: MIN(Arch_VGPR)
-            max: MAX(Arch_VGPR)
-            unit: Registers
-          AGPRs:
-            avg: AVG(Accum_VGPR)
-            min: MIN(Accum_VGPR)
-            max: MAX(Accum_VGPR)
-            unit: Registers
-          SGPRs:
-            avg: AVG(SGPR)
-            min: MIN(SGPR)
-            max: MAX(SGPR)
-            unit: Registers
-          LDS Allocation:
-            avg: AVG(LDS_Per_Workgroup)
-            min: MIN(LDS_Per_Workgroup)
-            max: MAX(LDS_Per_Workgroup)
-            unit: Bytes
-          Scratch Allocation:
-            avg: AVG(Scratch_Per_Workitem)
-            min: MIN(Scratch_Per_Workitem)
-            max: MAX(Scratch_Per_Workitem)
-            unit: Bytes/Workitem
-  - metric_table:
-      id: 702
-      title: Wavefront Runtime Stats
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      metric:
-        gfx90a:
-          Kernel Time:
-            avg: AVG((End_Timestamp - Start_Timestamp))
-            min: MIN((End_Timestamp - Start_Timestamp))
-            max: MAX((End_Timestamp - Start_Timestamp))
-            unit: ns
-          Kernel Time (Cycles):
-            avg: AVG($GRBM_GUI_ACTIVE_PER_XCD)
-            min: MIN($GRBM_GUI_ACTIVE_PER_XCD)
-            max: MAX($GRBM_GUI_ACTIVE_PER_XCD)
-            unit: Cycle
-          Instructions per wavefront:
-            avg: AVG((SQ_INSTS / SQ_WAVES))
-            min: MIN((SQ_INSTS / SQ_WAVES))
-            max: MAX((SQ_INSTS / SQ_WAVES))
-            unit: Instr/wavefront
-          Wave Cycles:
-            avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom))
-            min: MIN(((4 * SQ_WAVE_CYCLES) / $denom))
-            max: MAX(((4 * SQ_WAVE_CYCLES) / $denom))
-            unit: (Cycles + $normUnit)
-          Dependency Wait Cycles:
-            avg: AVG(((4 * SQ_WAIT_ANY) / $denom))
-            min: MIN(((4 * SQ_WAIT_ANY) / $denom))
-            max: MAX(((4 * SQ_WAIT_ANY) / $denom))
-            unit: (Cycles + $normUnit)
-          Issue Wait Cycles:
-            avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom))
-            min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom))
-            max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom))
-            unit: (Cycles + $normUnit)
-          Active Cycles:
-            avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom))
-            min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom))
-            max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom))
-            unit: (Cycles + $normUnit)
-          Wavefront Occupancy:
-            avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
-            min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
-            max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
-            unit: Wavefronts
-            coll_level: SQ_LEVEL_WAVES
-        gfx941:
-          Kernel Time:
-            avg: AVG((End_Timestamp - Start_Timestamp))
-            min: MIN((End_Timestamp - Start_Timestamp))
-            max: MAX((End_Timestamp - Start_Timestamp))
-            unit: ns
-          Kernel Time (Cycles):
-            avg: AVG($GRBM_GUI_ACTIVE_PER_XCD)
-            min: MIN($GRBM_GUI_ACTIVE_PER_XCD)
-            max: MAX($GRBM_GUI_ACTIVE_PER_XCD)
-            unit: Cycle
-          Instructions per wavefront:
-            avg: AVG((SQ_INSTS / SQ_WAVES))
-            min: MIN((SQ_INSTS / SQ_WAVES))
-            max: MAX((SQ_INSTS / SQ_WAVES))
-            unit: Instr/wavefront
-          Wave Cycles:
-            avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom))
-            min: MIN(((4 * SQ_WAVE_CYCLES) / $denom))
-            max: MAX(((4 * SQ_WAVE_CYCLES) / $denom))
-            unit: (Cycles + $normUnit)
-          Dependency Wait Cycles:
-            avg: AVG(((4 * SQ_WAIT_ANY) / $denom))
-            min: MIN(((4 * SQ_WAIT_ANY) / $denom))
-            max: MAX(((4 * SQ_WAIT_ANY) / $denom))
-            unit: (Cycles + $normUnit)
-          Issue Wait Cycles:
-            avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom))
-            min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom))
-            max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom))
-            unit: (Cycles + $normUnit)
-          Active Cycles:
-            avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom))
-            min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom))
-            max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom))
-            unit: (Cycles + $normUnit)
-          Wavefront Occupancy:
-            avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
-            min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
-            max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
-            unit: Wavefronts
-            coll_level: SQ_LEVEL_WAVES
-        gfx940:
-          Kernel Time:
-            avg: AVG((End_Timestamp - Start_Timestamp))
-            min: MIN((End_Timestamp - Start_Timestamp))
-            max: MAX((End_Timestamp - Start_Timestamp))
-            unit: ns
-          Kernel Time (Cycles):
-            avg: AVG($GRBM_GUI_ACTIVE_PER_XCD)
-            min: MIN($GRBM_GUI_ACTIVE_PER_XCD)
-            max: MAX($GRBM_GUI_ACTIVE_PER_XCD)
-            unit: Cycle
-          Instructions per wavefront:
-            avg: AVG((SQ_INSTS / SQ_WAVES))
-            min: MIN((SQ_INSTS / SQ_WAVES))
-            max: MAX((SQ_INSTS / SQ_WAVES))
-            unit: Instr/wavefront
-          Wave Cycles:
-            avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom))
-            min: MIN(((4 * SQ_WAVE_CYCLES) / $denom))
-            max: MAX(((4 * SQ_WAVE_CYCLES) / $denom))
-            unit: (Cycles + $normUnit)
-          Dependency Wait Cycles:
-            avg: AVG(((4 * SQ_WAIT_ANY) / $denom))
-            min: MIN(((4 * SQ_WAIT_ANY) / $denom))
-            max: MAX(((4 * SQ_WAIT_ANY) / $denom))
-            unit: (Cycles + $normUnit)
-          Issue Wait Cycles:
-            avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom))
-            min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom))
-            max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom))
-            unit: (Cycles + $normUnit)
-          Active Cycles:
-            avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom))
-            min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom))
-            max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom))
-            unit: (Cycles + $normUnit)
-          Wavefront Occupancy:
-            avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
-            min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
-            max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
-            unit: Wavefronts
-            coll_level: SQ_LEVEL_WAVES
-        gfx942:
-          Kernel Time:
-            avg: AVG((End_Timestamp - Start_Timestamp))
-            min: MIN((End_Timestamp - Start_Timestamp))
-            max: MAX((End_Timestamp - Start_Timestamp))
-            unit: ns
-          Kernel Time (Cycles):
-            avg: AVG($GRBM_GUI_ACTIVE_PER_XCD)
-            min: MIN($GRBM_GUI_ACTIVE_PER_XCD)
-            max: MAX($GRBM_GUI_ACTIVE_PER_XCD)
-            unit: Cycle
-          Instructions per wavefront:
-            avg: AVG((SQ_INSTS / SQ_WAVES))
-            min: MIN((SQ_INSTS / SQ_WAVES))
-            max: MAX((SQ_INSTS / SQ_WAVES))
-            unit: Instr/wavefront
-          Wave Cycles:
-            avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom))
-            min: MIN(((4 * SQ_WAVE_CYCLES) / $denom))
-            max: MAX(((4 * SQ_WAVE_CYCLES) / $denom))
-            unit: (Cycles + $normUnit)
-          Dependency Wait Cycles:
-            avg: AVG(((4 * SQ_WAIT_ANY) / $denom))
-            min: MIN(((4 * SQ_WAIT_ANY) / $denom))
-            max: MAX(((4 * SQ_WAIT_ANY) / $denom))
-            unit: (Cycles + $normUnit)
-          Issue Wait Cycles:
-            avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom))
-            min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom))
-            max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom))
-            unit: (Cycles + $normUnit)
-          Active Cycles:
-            avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom))
-            min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom))
-            max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom))
-            unit: (Cycles + $normUnit)
-          Wavefront Occupancy:
-            avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
-            min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
-            max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
-            unit: Wavefronts
-            coll_level: SQ_LEVEL_WAVES
-        gfx950:
-          Kernel Time:
-            avg: AVG((End_Timestamp - Start_Timestamp))
-            min: MIN((End_Timestamp - Start_Timestamp))
-            max: MAX((End_Timestamp - Start_Timestamp))
-            unit: ns
-          Kernel Time (Cycles):
-            avg: AVG($GRBM_GUI_ACTIVE_PER_XCD)
-            min: MIN($GRBM_GUI_ACTIVE_PER_XCD)
-            max: MAX($GRBM_GUI_ACTIVE_PER_XCD)
-            unit: Cycle
-          Instructions per wavefront:
-            avg: AVG((SQ_INSTS / SQ_WAVES))
-            min: MIN((SQ_INSTS / SQ_WAVES))
-            max: MAX((SQ_INSTS / SQ_WAVES))
-            unit: Instr/wavefront
-          Wave Cycles:
-            avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom))
-            min: MIN(((4 * SQ_WAVE_CYCLES) / $denom))
-            max: MAX(((4 * SQ_WAVE_CYCLES) / $denom))
-            unit: (Cycles + $normUnit)
-          Dependency Wait Cycles:
-            avg: AVG(((4 * SQ_WAIT_ANY) / $denom))
-            min: MIN(((4 * SQ_WAIT_ANY) / $denom))
-            max: MAX(((4 * SQ_WAIT_ANY) / $denom))
-            unit: (Cycles + $normUnit)
-          Issue Wait Cycles:
-            avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom))
-            min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom))
-            max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom))
-            unit: (Cycles + $normUnit)
-          Active Cycles:
-            avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom))
-            min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom))
-            max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom))
-            unit: (Cycles + $normUnit)
-          Wavefront Occupancy:
-            avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
-            min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
-            max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
-            unit: Wavefronts
-            coll_level: SQ_LEVEL_WAVES
-        gfx908:
-          Kernel Time:
-            avg: AVG((End_Timestamp - Start_Timestamp))
-            min: MIN((End_Timestamp - Start_Timestamp))
-            max: MAX((End_Timestamp - Start_Timestamp))
-            unit: ns
-          Kernel Time (Cycles):
-            avg: AVG($GRBM_GUI_ACTIVE_PER_XCD)
-            min: MIN($GRBM_GUI_ACTIVE_PER_XCD)
-            max: MAX($GRBM_GUI_ACTIVE_PER_XCD)
-            unit: Cycle
-          Instructions per wavefront:
-            avg: AVG((SQ_INSTS / SQ_WAVES))
-            min: MIN((SQ_INSTS / SQ_WAVES))
-            max: MAX((SQ_INSTS / SQ_WAVES))
-            unit: Instr/wavefront
-          Wave Cycles:
-            avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom))
-            min: MIN(((4 * SQ_WAVE_CYCLES) / $denom))
-            max: MAX(((4 * SQ_WAVE_CYCLES) / $denom))
-            unit: (Cycles + $normUnit)
-          Dependency Wait Cycles:
-            avg: AVG(((4 * SQ_WAIT_ANY) / $denom))
-            min: MIN(((4 * SQ_WAIT_ANY) / $denom))
-            max: MAX(((4 * SQ_WAIT_ANY) / $denom))
-            unit: (Cycles + $normUnit)
-          Issue Wait Cycles:
-            avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom))
-            min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom))
-            max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom))
-            unit: (Cycles + $normUnit)
-          Active Cycles:
-            avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom))
-            min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom))
-            max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom))
-            unit: (Cycles + $normUnit)
-          Wavefront Occupancy:
-            avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
-            min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
-            max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD))
-            unit: Wavefronts
-            coll_level: SQ_LEVEL_WAVES
-  metrics_description:
-    Grid Size:
-      plain: The total number of work-items (or, threads) launched as a part of the
-        kernel dispatch. In HIP, this is equivalent to the total grid size multiplied
-        by the total workgroup (or, block) size.
-      rst: The total number of work-items (or, threads) launched as a part of the
-        kernel dispatch. In HIP, this is equivalent to the total grid size multiplied
-        by the total workgroup (or, block) size.
-      unit: Work-Items
-    Workgroup Size:
-      plain: The total number of work-items (or, threads) in each workgroup (or, block)
-        launched as part of the kernel dispatch. In HIP, this is equivalent to the
-        total block size.
-      rst: The total number of work-items (or, threads) in each workgroup (or, block)
-        launched as part of the kernel dispatch. In HIP, this is equivalent to the
-        total block size.
-      unit: Work-Items
-    Total Wavefronts:
-      plain: |-
-        The total number of wavefronts launched as part of the kernel dispatch.
-        On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront
-        size is always 64 work-items. Thus, the total number of wavefronts should
-        be equivalent to the ceiling of grid size divided by 64.
-      rst: |-
-        The total number of wavefronts launched as part of the kernel dispatch.
-        On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront
-        size is always 64 work-items. Thus, the total number of wavefronts should
-        be equivalent to the ceiling of grid size divided by 64.
-      unit: Wavefronts
-    Saved Wavefronts:
-      plain: The total number of wavefronts saved at a context-save.
-      rst: The total number of wavefronts saved at a context-save. See `cwsr_enable
-        <https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr>`_.
-      unit: Wavefronts
-    Restored Wavefronts:
-      plain: The total number of wavefronts restored from a context-save.
-      rst: The total number of wavefronts restored from a context-save. See `cwsr_enable
-        <https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr>`_.
-      unit: Wavefronts
-    VGPRs:
-      plain: |-
-        The number of architected vector general-purpose registers allocated
-        for the kernel, see VALU. Note: this may not exactly match the number of VGPRs
-        requested by the compiler due to allocation granularity.
-      rst: |-
-        The number of architected vector general-purpose registers allocated for the
-        kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the
-        number of VGPRs requested by the compiler due to allocation granularity.
-      unit: VGPRs
-    AGPRs:
-      plain: |-
-        The number of accumulation vector general-purpose registers allocated
-        for the kernel, see AGPRs. Note: this may not exactly match the number of
-        AGPRs requested by the compiler due to allocation granularity.
-      rst: |-
-        The number of accumulation vector general-purpose registers allocated
-        for the kernel, see :ref:`AGPRs <desc-agprs>`. Note: this may not exactly match
-        the number of AGPRs requested by the compiler due to allocation granularity.
-      unit: AGPRs
-    SGPRs:
-      plain: |-
-        The number of scalar general-purpose registers allocated for the kernel,
-        see SALU. Note: this may not exactly match the number of SGPRs requested by
-        the compiler due to allocation granularity.
-      rst: |-
-        The number of scalar general-purpose registers allocated for the kernel, see
-        :ref:`SALU <desc-salu>`. Note: this may not exactly match the number of
-        SGPRs requested by the compiler due to allocation granularity.
-      unit: SGPRs
-    LDS Allocation:
-      plain: |-
-        The number of bytes of LDS memory (or, shared memory) allocated for
-        this kernel. Note: This may also be larger than what was requested at compile
-        time due to both allocation granularity and dynamic per-dispatch LDS allocations.
-      rst: |-
-        The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory)
-        allocated for this kernel. Note: This may also be larger than what was requested
-        at compile time due to both allocation granularity and dynamic per-dispatch
-        LDS allocations.
-      unit: Bytes per workgroup
-    Scratch Allocation:
-      plain: The number of bytes of scratch memory requested per work-item for this
-        kernel. Scratch memory is used for stack memory on the accelerator, as well
-        as for register spills and restores.
-      rst: The number of bytes of :ref:`scratch memory <memory-spaces>` requested per
-        work-item for this kernel. Scratch memory is used for stack memory on the
-        accelerator, as well as for register spills and restores.
-      unit: Bytes per work-item
-    Kernel Time:
-      plain: The total duration of the executed kernel.
-      rst: The total duration of the executed kernel.
-      unit: Nanoseconds
-    Kernel Time (Cycles):
-      plain: The total duration of the executed kernel in cycles.
-      rst: The total duration of the executed kernel in cycles.
-      unit: Cycles
-    Instructions per wavefront:
-      plain: The average number of instructions (of all types) executed per wavefront.
-        This is averaged over all wavefronts in a kernel dispatch.
-      rst: The average number of instructions (of all types) executed per wavefront.
-        This is averaged over all wavefronts in a kernel dispatch.
-      unit: Instructions per wavefront
-    Wave Cycles:
-      plain: The number of cycles a wavefront in the kernel dispatch spent resident
-        on a compute unit per normalization unit. This is averaged over all wavefronts
-        in a kernel dispatch.
-      rst: |-
-        The number of cycles a wavefront in the kernel dispatch spent resident
-        on a compute unit per :ref:`normalization unit <normalization-units>`. This is
-        averaged over all wavefronts in a kernel dispatch. Note: this should not
-        be directly compared to the kernel cycles above.
-      unit: Cycles per normalization unit
-    Dependency Wait Cycles:
-      plain: The number of cycles a wavefront in the kernel dispatch spent resident
-        on a compute unit per normalization unit. This is averaged over all wavefronts
-        in a kernel dispatch.
-      rst: The number of cycles a wavefront in the kernel dispatch stalled waiting on
-        memory of any kind (e.g., instruction fetch, vector or scalar memory, etc.)
-        per :ref:`normalization unit <normalization-units>`. This counter is incremented
-        at every cycle by *all* wavefronts on a CU stalled at a memory operation. As
-        such, it is most useful to get a sense of how waves were spending their time,
-        rather than identification of a precise limiter because another wave could
-        be actively executing while a wave is stalled. The sum of this metric, Issue
-        Wait Cycles and Active Cycles should be equal to the total Wave Cycles metric.
-      unit: Cycles per normalization unit
-    Issue Wait Cycles:
-      plain: The number of cycles a wavefront in the kernel dispatch was unable to
-        issue an instruction for any reason (e.g., execution pipe back-pressure, arbitration
-        loss, etc.) per normalization unit. This counter is incremented at every cycle
-        by all wavefronts on a CU unable to issue an instruction. As such, it is most
-        useful to get a sense of how waves were spending their time, rather than identification
-        of a precise limiter because another wave could be actively executing while
-        a wave is issue stalled. The sum of this metric, Dependency Wait Cycles and
-        Active Cycles should be equal to the total Wave Cycles metric.
-      rst: The number of cycles a wavefront in the kernel dispatch was unable to issue
-        an instruction for any reason (e.g., execution pipe back-pressure, arbitration
-        loss, etc.) per :ref:`normalization unit <normalization-units>`. This counter
-        is incremented at every cycle by *all* wavefronts on a CU unable to issue
-        an instruction. As such, it is most useful to get a sense of how waves were spending
-        their time, rather than identification of a precise limiter because another
-        wave could be actively executing while a wave is issue stalled. The sum
-        of this metric, Dependency Wait Cycles and Active Cycles should be equal
-        to the total Wave Cycles metric.
-      unit: Cycles per normalization unit
-    Active Cycles:
-      plain: The average number of cycles a wavefront in the kernel dispatch was actively
-        executing instructions per normalization unit. This measurement is made on
-        a per-wavefront basis, and may include cycles that another wavefront spent
-        actively executing (on another execution unit, for example) or was stalled.
-        As such, it is most useful to get a sense of how waves were spending their
-        time, rather than identification of a precise limiter. The sum of this metric,
-        Issue Wait Cycles and Active Wait Cycles should be equal to the total Wave
-        Cycles metric.
-      rst: The average number of cycles a wavefront in the kernel dispatch was actively
-        executing instructions per :ref:`normalization unit <normalization-units>`.
-        This measurement is made on a per-wavefront basis, and may include cycles
-        that another wavefront spent actively executing (on another execution unit,
-        for example) or was stalled. As such, it is most useful to get a sense of
-        how waves were spending their time, rather than identification of a precise
-        limiter. The sum of this metric, Issue Wait Cycles and Active Wait Cycles
-        should be equal to the total Wave Cycles metric.
-      unit: Cycles per normalization unit
-    Wavefront Occupancy:
-      plain: |-
-        The time-averaged number of wavefronts resident on the accelerator over
-        the lifetime of the kernel. Note: this metric may be inaccurate for short-running
-        kernels (less than 1ms).
-      rst: |-
-        The time-averaged number of wavefronts resident on the accelerator over the
-        lifetime of the kernel. Note: this metric may be inaccurate for short-running
-        kernels (less than 1ms).
-      unit: Wavefronts
-- id: 1000
-  title: Compute Units - Instruction Mix
-  data source:
-  - metric_table:
-      id: 1001
-      title: Overall Instruction Mix
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      metric:
-        gfx90a:
-          VALU:
-            avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
-            min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
-            max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
-            unit: (instr + $normUnit)
-          VMEM:
-            avg: AVG(((SQ_INSTS_VMEM - SQ_INSTS_FLAT_LDS_ONLY) / $denom))
-            min: MIN(((SQ_INSTS_VMEM - SQ_INSTS_FLAT_LDS_ONLY) / $denom))
-            max: MAX(((SQ_INSTS_VMEM - SQ_INSTS_FLAT_LDS_ONLY) / $denom))
-            unit: (instr + $normUnit)
-          LDS:
-            avg: AVG((SQ_INSTS_LDS / $denom))
-            min: MIN((SQ_INSTS_LDS / $denom))
-            max: MAX((SQ_INSTS_LDS / $denom))
-            unit: (instr + $normUnit)
-          MFMA:
-            avg: AVG((SQ_INSTS_MFMA / $denom))
-            min: MIN((SQ_INSTS_MFMA / $denom))
-            max: MAX((SQ_INSTS_MFMA / $denom))
-            unit: (instr + $normUnit)
-          SALU:
-            avg: AVG((SQ_INSTS_SALU / $denom))
-            min: MIN((SQ_INSTS_SALU / $denom))
-            max: MAX((SQ_INSTS_SALU / $denom))
-            unit: (instr + $normUnit)
-          SMEM:
-            avg: AVG((SQ_INSTS_SMEM / $denom))
-            min: MIN((SQ_INSTS_SMEM / $denom))
-            max: MAX((SQ_INSTS_SMEM / $denom))
-            unit: (instr + $normUnit)
-          Branch:
-            avg: AVG((SQ_INSTS_BRANCH / $denom))
-            min: MIN((SQ_INSTS_BRANCH / $denom))
-            max: MAX((SQ_INSTS_BRANCH / $denom))
-            unit: (instr + $normUnit)
-        gfx941:
-          VALU:
-            avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
-            min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
-            max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
-            unit: (instr + $normUnit)
-          VMEM:
-            avg: AVG(((SQ_INSTS_VMEM) / $denom))
-            min: MIN(((SQ_INSTS_VMEM) / $denom))
-            max: MAX(((SQ_INSTS_VMEM) / $denom))
-            unit: (instr + $normUnit)
-          LDS:
-            avg: AVG((SQ_INSTS_LDS / $denom))
-            min: MIN((SQ_INSTS_LDS / $denom))
-            max: MAX((SQ_INSTS_LDS / $denom))
-            unit: (instr + $normUnit)
-          MFMA:
-            avg: AVG((SQ_INSTS_MFMA / $denom))
-            min: MIN((SQ_INSTS_MFMA / $denom))
-            max: MAX((SQ_INSTS_MFMA / $denom))
-            unit: (instr + $normUnit)
-          SALU:
-            avg: AVG((SQ_INSTS_SALU / $denom))
-            min: MIN((SQ_INSTS_SALU / $denom))
-            max: MAX((SQ_INSTS_SALU / $denom))
-            unit: (instr + $normUnit)
-          SMEM:
-            avg: AVG((SQ_INSTS_SMEM / $denom))
-            min: MIN((SQ_INSTS_SMEM / $denom))
-            max: MAX((SQ_INSTS_SMEM / $denom))
-            unit: (instr + $normUnit)
-          Branch:
-            avg: AVG((SQ_INSTS_BRANCH / $denom))
-            min: MIN((SQ_INSTS_BRANCH / $denom))
-            max: MAX((SQ_INSTS_BRANCH / $denom))
-            unit: (instr + $normUnit)
-        gfx940:
-          VALU:
-            avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
-            min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
-            max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
-            unit: (instr + $normUnit)
-          VMEM:
-            avg: AVG(((SQ_INSTS_VMEM) / $denom))
-            min: MIN(((SQ_INSTS_VMEM) / $denom))
-            max: MAX(((SQ_INSTS_VMEM) / $denom))
-            unit: (instr + $normUnit)
-          LDS:
-            avg: AVG((SQ_INSTS_LDS / $denom))
-            min: MIN((SQ_INSTS_LDS / $denom))
-            max: MAX((SQ_INSTS_LDS / $denom))
-            unit: (instr + $normUnit)
-          MFMA:
-            avg: AVG((SQ_INSTS_MFMA / $denom))
-            min: MIN((SQ_INSTS_MFMA / $denom))
-            max: MAX((SQ_INSTS_MFMA / $denom))
-            unit: (instr + $normUnit)
-          SALU:
-            avg: AVG((SQ_INSTS_SALU / $denom))
-            min: MIN((SQ_INSTS_SALU / $denom))
-            max: MAX((SQ_INSTS_SALU / $denom))
-            unit: (instr + $normUnit)
-          SMEM:
-            avg: AVG((SQ_INSTS_SMEM / $denom))
-            min: MIN((SQ_INSTS_SMEM / $denom))
-            max: MAX((SQ_INSTS_SMEM / $denom))
-            unit: (instr + $normUnit)
-          Branch:
-            avg: AVG((SQ_INSTS_BRANCH / $denom))
-            min: MIN((SQ_INSTS_BRANCH / $denom))
-            max: MAX((SQ_INSTS_BRANCH / $denom))
-            unit: (instr + $normUnit)
-        gfx942:
-          VALU:
-            avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
-            min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
-            max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
-            unit: (instr + $normUnit)
-          VMEM:
-            avg: AVG(((SQ_INSTS_VMEM) / $denom))
-            min: MIN(((SQ_INSTS_VMEM) / $denom))
-            max: MAX(((SQ_INSTS_VMEM) / $denom))
-            unit: (instr + $normUnit)
-          LDS:
-            avg: AVG((SQ_INSTS_LDS / $denom))
-            min: MIN((SQ_INSTS_LDS / $denom))
-            max: MAX((SQ_INSTS_LDS / $denom))
-            unit: (instr + $normUnit)
-          MFMA:
-            avg: AVG((SQ_INSTS_MFMA / $denom))
-            min: MIN((SQ_INSTS_MFMA / $denom))
-            max: MAX((SQ_INSTS_MFMA / $denom))
-            unit: (instr + $normUnit)
-          SALU:
-            avg: AVG((SQ_INSTS_SALU / $denom))
-            min: MIN((SQ_INSTS_SALU / $denom))
-            max: MAX((SQ_INSTS_SALU / $denom))
-            unit: (instr + $normUnit)
-          SMEM:
-            avg: AVG((SQ_INSTS_SMEM / $denom))
-            min: MIN((SQ_INSTS_SMEM / $denom))
-            max: MAX((SQ_INSTS_SMEM / $denom))
-            unit: (instr + $normUnit)
-          Branch:
-            avg: AVG((SQ_INSTS_BRANCH / $denom))
-            min: MIN((SQ_INSTS_BRANCH / $denom))
-            max: MAX((SQ_INSTS_BRANCH / $denom))
-            unit: (instr + $normUnit)
-        gfx950:
-          VALU:
-            avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
-            min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
-            max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom))
-            unit: (instr + $normUnit)
-          VMEM:
-            avg: AVG(((SQ_INSTS_VMEM) / $denom))
-            min: MIN(((SQ_INSTS_VMEM) / $denom))
-            max: MAX(((SQ_INSTS_VMEM) / $denom))
-            unit: (instr + $normUnit)
-          LDS:
-            avg: AVG((SQ_INSTS_LDS / $denom))
-            min: MIN((SQ_INSTS_LDS / $denom))
-            max: MAX((SQ_INSTS_LDS / $denom))
-            unit: (instr + $normUnit)
-          MFMA:
-            avg: AVG((SQ_INSTS_MFMA / $denom))
-            min: MIN((SQ_INSTS_MFMA / $denom))
-            max: MAX((SQ_INSTS_MFMA / $denom))
-            unit: (instr + $normUnit)
-          SALU:
-            avg: AVG((SQ_INSTS_SALU / $denom))
-            min: MIN((SQ_INSTS_SALU / $denom))
-            max: MAX((SQ_INSTS_SALU / $denom))
-            unit: (instr + $normUnit)
-          SMEM:
-            avg: AVG((SQ_INSTS_SMEM / $denom))
-            min: MIN((SQ_INSTS_SMEM / $denom))
-            max: MAX((SQ_INSTS_SMEM / $denom))
-            unit: (instr + $normUnit)
-          Branch:
-            avg: AVG((SQ_INSTS_BRANCH / $denom))
-            min: MIN((SQ_INSTS_BRANCH / $denom))
-            max: MAX((SQ_INSTS_BRANCH / $denom))
-            unit: (instr + $normUnit)
-        gfx908:
-          LDS:
-            avg: AVG((SQ_INSTS_LDS / $denom))
-            min: MIN((SQ_INSTS_LDS / $denom))
-            max: MAX((SQ_INSTS_LDS / $denom))
-            unit: (instr + $normUnit)
-          SALU:
-            avg: AVG((SQ_INSTS_SALU / $denom))
-            min: MIN((SQ_INSTS_SALU / $denom))
-            max: MAX((SQ_INSTS_SALU / $denom))
-            unit: (instr + $normUnit)
-          SMEM:
-            avg: AVG((SQ_INSTS_SMEM / $denom))
-            min: MIN((SQ_INSTS_SMEM / $denom))
-            max: MAX((SQ_INSTS_SMEM / $denom))
-            unit: (instr + $normUnit)
-          Branch:
-            avg: AVG((SQ_INSTS_BRANCH / $denom))
-            min: MIN((SQ_INSTS_BRANCH / $denom))
-            max: MAX((SQ_INSTS_BRANCH / $denom))
-            unit: (instr + $normUnit)
-  - metric_table:
-      id: 1002
-      title: VALU Arithmetic Instruction Mix
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      metric:
-        gfx90a:
-          INT32:
-            avg: AVG((SQ_INSTS_VALU_INT32 / $denom))
-            min: MIN((SQ_INSTS_VALU_INT32 / $denom))
-            max: MAX((SQ_INSTS_VALU_INT32 / $denom))
-            unit: (instr + $normUnit)
-          INT64:
-            avg: AVG((SQ_INSTS_VALU_INT64 / $denom))
-            min: MIN((SQ_INSTS_VALU_INT64 / $denom))
-            max: MAX((SQ_INSTS_VALU_INT64 / $denom))
-            unit: (instr + $normUnit)
-          F16-ADD:
-            avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom))
-            min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom))
-            max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom))
-            unit: (instr + $normUnit)
-          F16-MUL:
-            avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom))
-            min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom))
-            max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom))
-            unit: (instr + $normUnit)
-          F16-FMA:
-            avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom))
-            min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom))
-            max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom))
-            unit: (instr + $normUnit)
-          F16-Trans:
-            avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom))
-            min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom))
-            max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom))
-            unit: (instr + $normUnit)
-          F32-ADD:
-            avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom))
-            min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom))
-            max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom))
-            unit: (instr + $normUnit)
-          F32-MUL:
-            avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom))
-            min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom))
-            max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom))
-            unit: (instr + $normUnit)
-          F32-FMA:
-            avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom))
-            min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom))
-            max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom))
-            unit: (instr + $normUnit)
-          F32-Trans:
-            avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom))
-            min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom))
-            max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom))
-            unit: (instr + $normUnit)
-          F64-ADD:
-            avg: AVG((SQ_INSTS_VALU_ADD_F64 / $denom))
-            min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom))
-            max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom))
-            unit: (instr + $normUnit)
-          F64-MUL:
-            avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom))
-            min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom))
-            max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom))
-            unit: (instr + $normUnit)
-          F64-FMA:
-            avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom))
-            min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom))
-            max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom))
-            unit: (instr + $normUnit)
-          F64-Trans:
-            avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom))
-            min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom))
-            max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom))
-            unit: (instr + $normUnit)
-          Conversion:
-            avg: AVG((SQ_INSTS_VALU_CVT / $denom))
-            min: MIN((SQ_INSTS_VALU_CVT / $denom))
-            max: MAX((SQ_INSTS_VALU_CVT / $denom))
-            unit: (instr + $normUnit)
-        gfx941:
-          INT32:
-            avg: AVG((SQ_INSTS_VALU_INT32 / $denom))
-            min: MIN((SQ_INSTS_VALU_INT32 / $denom))
-            max: MAX((SQ_INSTS_VALU_INT32 / $denom))
-            unit: (instr + $normUnit)
-          INT64:
-            avg: AVG((SQ_INSTS_VALU_INT64 / $denom))
-            min: MIN((SQ_INSTS_VALU_INT64 / $denom))
-            max: MAX((SQ_INSTS_VALU_INT64 / $denom))
-            unit: (instr + $normUnit)
-          F16-ADD:
-            avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom))
-            min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom))
-            max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom))
-            unit: (instr + $normUnit)
-          F16-MUL:
-            avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom))
-            min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom))
-            max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom))
-            unit: (instr + $normUnit)
-          F16-FMA:
-            avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom))
-            min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom))
-            max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom))
-            unit: (instr + $normUnit)
-          F16-Trans:
-            avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom))
-            min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom))
-            max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom))
-            unit: (instr + $normUnit)
-          F32-ADD:
-            avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom))
-            min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom))
-            max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom))
-            unit: (instr + $normUnit)
-          F32-MUL:
-            avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom))
-            min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom))
-            max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom))
-            unit: (instr + $normUnit)
-          F32-FMA:
-            avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom))
-            min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom))
-            max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom))
-            unit: (instr + $normUnit)
-          F32-Trans:
-            avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom))
-            min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom))
-            max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom))
-            unit: (instr + $normUnit)
-          F64-ADD:
-            avg: AVG((SQ_INSTS_VALU_ADD_F64 / $denom))
-            min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom))
-            max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom))
-            unit: (instr + $normUnit)
-          F64-MUL:
-            avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom))
-            min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom))
-            max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom))
-            unit: (instr + $normUnit)
-          F64-FMA:
-            avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom))
-            min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom))
-            max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom))
-            unit: (instr + $normUnit)
-          F64-Trans:
-            avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom))
-            min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom))
-            max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom))
-            unit: (instr + $normUnit)
-          Conversion:
-            avg: AVG((SQ_INSTS_VALU_CVT / $denom))
-            min: MIN((SQ_INSTS_VALU_CVT / $denom))
-            max: MAX((SQ_INSTS_VALU_CVT / $denom))
-            unit: (instr + $normUnit)
-        gfx940:
-          INT32:
-            avg: AVG((SQ_INSTS_VALU_INT32 / $denom))
-            min: MIN((SQ_INSTS_VALU_INT32 / $denom))
-            max: MAX((SQ_INSTS_VALU_INT32 / $denom))
-            unit: (instr + $normUnit)
-          INT64:
-            avg: AVG((SQ_INSTS_VALU_INT64 / $denom))
-            min: MIN((SQ_INSTS_VALU_INT64 / $denom))
-            max: MAX((SQ_INSTS_VALU_INT64 / $denom))
-            unit: (instr + $normUnit)
-          F16-ADD:
-            avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom))
-            min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom))
-            max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom))
-            unit: (instr + $normUnit)
-          F16-MUL:
-            avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom))
-            min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom))
-            max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom))
-            unit: (instr + $normUnit)
-          F16-FMA:
-            avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom))
-            min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom))
-            max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom))
-            unit: (instr + $normUnit)
-          F16-Trans:
-            avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom))
-            min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom))
-            max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom))
-            unit: (instr + $normUnit)
-          F32-ADD:
-            avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom))
-            min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom))
-            max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom))
-            unit: (instr + $normUnit)
-          F32-MUL:
-            avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom))
-            min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom))
-            max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom))
-            unit: (instr + $normUnit)
-          F32-FMA:
-            avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom))
-            min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom))
-            max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom))
-            unit: (instr + $normUnit)
-          F32-Trans:
-            avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom))
-            min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom))
-            max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom))
-            unit: (instr + $normUnit)
-          F64-ADD:
-            avg: AVG((SQ_INSTS_VALU_ADD_F64 / $denom))
-            min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom))
-            max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom))
-            unit: (instr + $normUnit)
-          F64-MUL:
-            avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom))
-            min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom))
-            max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom))
-            unit: (instr + $normUnit)
-          F64-FMA:
-            avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom))
-            min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom))
-            max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom))
-            unit: (instr + $normUnit)
-          F64-Trans:
-            avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom))
-            min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom))
-            max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom))
-            unit: (instr + $normUnit)
-          Conversion:
-            avg: AVG((SQ_INSTS_VALU_CVT / $denom))
-            min: MIN((SQ_INSTS_VALU_CVT / $denom))
-            max: MAX((SQ_INSTS_VALU_CVT / $denom))
-            unit: (instr + $normUnit)
-        gfx942:
-          INT32:
-            avg: AVG((SQ_INSTS_VALU_INT32 / $denom))
-            min: MIN((SQ_INSTS_VALU_INT32 / $denom))
-            max: MAX((SQ_INSTS_VALU_INT32 / $denom))
-            unit: (instr + $normUnit)
-          INT64:
-            avg: AVG((SQ_INSTS_VALU_INT64 / $denom))
-            min: MIN((SQ_INSTS_VALU_INT64 / $denom))
-            max: MAX((SQ_INSTS_VALU_INT64 / $denom))
-            unit: (instr + $normUnit)
-          F16-ADD:
-            avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom))
-            min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom))
-            max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom))
-            unit: (instr + $normUnit)
-          F16-MUL:
-            avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom))
-            min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom))
-            max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom))
-            unit: (instr + $normUnit)
-          F16-FMA:
-            avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom))
-            min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom))
-            max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom))
-            unit: (instr + $normUnit)
-          F16-Trans:
-            avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom))
-            min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom))
-            max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom))
-            unit: (instr + $normUnit)
-          F32-ADD:
-            avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom))
-            min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom))
-            max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom))
-            unit: (instr + $normUnit)
-          F32-MUL:
-            avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom))
-            min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom))
-            max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom))
-            unit: (instr + $normUnit)
-          F32-FMA:
-            avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom))
-            min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom))
-            max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom))
-            unit: (instr + $normUnit)
-          F32-Trans:
-            avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom))
-            min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom))
-            max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom))
-            unit: (instr + $normUnit)
-          F64-ADD:
-            avg: AVG((SQ_INSTS_VALU_ADD_F64 / $denom))
-            min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom))
-            max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom))
-            unit: (instr + $normUnit)
-          F64-MUL:
-            avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom))
-            min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom))
-            max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom))
-            unit: (instr + $normUnit)
-          F64-FMA:
-            avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom))
-            min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom))
-            max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom))
-            unit: (instr + $normUnit)
-          F64-Trans:
-            avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom))
-            min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom))
-            max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom))
-            unit: (instr + $normUnit)
-          Conversion:
-            avg: AVG((SQ_INSTS_VALU_CVT / $denom))
-            min: MIN((SQ_INSTS_VALU_CVT / $denom))
-            max: MAX((SQ_INSTS_VALU_CVT / $denom))
-            unit: (instr + $normUnit)
-        gfx950:
-          INT32:
-            avg: AVG((SQ_INSTS_VALU_INT32 / $denom))
-            min: MIN((SQ_INSTS_VALU_INT32 / $denom))
-            max: MAX((SQ_INSTS_VALU_INT32 / $denom))
-            unit: (instr + $normUnit)
-          INT64:
-            avg: AVG((SQ_INSTS_VALU_INT64 / $denom))
-            min: MIN((SQ_INSTS_VALU_INT64 / $denom))
-            max: MAX((SQ_INSTS_VALU_INT64 / $denom))
-            unit: (instr + $normUnit)
-          F16-ADD:
-            avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom))
-            min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom))
-            max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom))
-            unit: (instr + $normUnit)
-          F16-MUL:
-            avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom))
-            min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom))
-            max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom))
-            unit: (instr + $normUnit)
-          F16-FMA:
-            avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom))
-            min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom))
-            max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom))
-            unit: (instr + $normUnit)
-          F16-Trans:
-            avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom))
-            min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom))
-            max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom))
-            unit: (instr + $normUnit)
-          F32-ADD:
-            avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom))
-            min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom))
-            max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom))
-            unit: (instr + $normUnit)
-          F32-MUL:
-            avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom))
-            min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom))
-            max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom))
-            unit: (instr + $normUnit)
-          F32-FMA:
-            avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom))
-            min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom))
-            max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom))
-            unit: (instr + $normUnit)
-          F32-Trans:
-            avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom))
-            min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom))
-            max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom))
-            unit: (instr + $normUnit)
-          F64-ADD:
-            avg: AVG((SQ_INSTS_VALU_ADD_F64 / $denom))
-            min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom))
-            max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom))
-            unit: (instr + $normUnit)
-          F64-MUL:
-            avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom))
-            min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom))
-            max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom))
-            unit: (instr + $normUnit)
-          F64-FMA:
-            avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom))
-            min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom))
-            max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom))
-            unit: (instr + $normUnit)
-          F64-Trans:
-            avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom))
-            min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom))
-            max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom))
-            unit: (instr + $normUnit)
-          Conversion:
-            avg: AVG((SQ_INSTS_VALU_CVT / $denom))
-            min: MIN((SQ_INSTS_VALU_CVT / $denom))
-            max: MAX((SQ_INSTS_VALU_CVT / $denom))
-            unit: (instr + $normUnit)
-        gfx908: {}
-  - metric_table:
-      id: 1003
-      title: VMEM Instruction Mix
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      metric:
-        gfx90a:
-          Global/Generic Instr:
-            avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Global/Generic Read:
-            avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Global/Generic Write:
-            avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Global/Generic Atomic:
-            avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Spill/Stack Instr:
-            avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Spill/Stack Read:
-            avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Spill/Stack Write:
-            avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Spill/Stack Atomic:
-            avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-        gfx941:
-          Global/Generic Instr:
-            avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Global/Generic Read:
-            avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Global/Generic Write:
-            avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Global/Generic Atomic:
-            avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Spill/Stack Instr:
-            avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Spill/Stack Read:
-            avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Spill/Stack Write:
-            avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Spill/Stack Atomic:
-            avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-        gfx940:
-          Global/Generic Instr:
-            avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Global/Generic Read:
-            avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Global/Generic Write:
-            avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Global/Generic Atomic:
-            avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Spill/Stack Instr:
-            avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Spill/Stack Read:
-            avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Spill/Stack Write:
-            avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Spill/Stack Atomic:
-            avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-        gfx942:
-          Global/Generic Instr:
-            avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Global/Generic Read:
-            avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Global/Generic Write:
-            avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Global/Generic Atomic:
-            avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Spill/Stack Instr:
-            avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Spill/Stack Read:
-            avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Spill/Stack Write:
-            avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Spill/Stack Atomic:
-            avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-        gfx950:
-          Global/Generic Instr:
-            avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Global/Generic Read:
-            avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Global/Generic Write:
-            avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Global/Generic Atomic:
-            avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Spill/Stack Instr:
-            avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Spill/Stack Coalesceable Instr:
-            avg: AVG((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Spill/Stack Read:
-            avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Spill/Stack Write:
-            avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Spill/Stack Atomic:
-            avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-        gfx908:
-          Global/Generic Instr:
-            avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Global/Generic Read:
-            avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Global/Generic Write:
-            avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Global/Generic Atomic:
-            avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Spill/Stack Instr:
-            avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Spill/Stack Read:
-            avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Spill/Stack Write:
-            avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-          Spill/Stack Atomic:
-            avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (instr + $normUnit)
-  - metric_table:
-      id: 1004
-      title: MFMA Arithmetic Instruction Mix
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      metric:
-        gfx90a:
-          MFMA-I8:
-            avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom))
-            min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom))
-            max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom))
-            unit: (instr + $normUnit)
-          MFMA-F16:
-            avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom))
-            min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom))
-            max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom))
-            unit: (instr + $normUnit)
-          MFMA-BF16:
-            avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom))
-            min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom))
-            max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom))
-            unit: (instr + $normUnit)
-          MFMA-F32:
-            avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom))
-            min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom))
-            max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom))
-            unit: (instr + $normUnit)
-          MFMA-F64:
-            avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom))
-            min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom))
-            max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom))
-            unit: (instr + $normUnit)
-        gfx941:
-          MFMA-I8:
-            avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom))
-            min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom))
-            max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom))
-            unit: (instr + $normUnit)
-          MFMA-F8:
-            avg: AVG((SQ_INSTS_VALU_MFMA_F8 / $denom))
-            min: MIN((SQ_INSTS_VALU_MFMA_F8 / $denom))
-            max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom))
-            unit: (instr + $normUnit)
-          MFMA-F16:
-            avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom))
-            min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom))
-            max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom))
-            unit: (instr + $normUnit)
-          MFMA-BF16:
-            avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom))
-            min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom))
-            max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom))
-            unit: (instr + $normUnit)
-          MFMA-F32:
-            avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom))
-            min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom))
-            max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom))
-            unit: (instr + $normUnit)
-          MFMA-F64:
-            avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom))
-            min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom))
-            max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom))
-            unit: (instr + $normUnit)
-        gfx940:
-          MFMA-I8:
-            avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom))
-            min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom))
-            max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom))
-            unit: (instr + $normUnit)
-          MFMA-F8:
-            avg: AVG((SQ_INSTS_VALU_MFMA_F8 / $denom))
-            min: MIN((SQ_INSTS_VALU_MFMA_F8 / $denom))
-            max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom))
-            unit: (instr + $normUnit)
-          MFMA-F16:
-            avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom))
-            min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom))
-            max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom))
-            unit: (instr + $normUnit)
-          MFMA-BF16:
-            avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom))
-            min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom))
-            max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom))
-            unit: (instr + $normUnit)
-          MFMA-F32:
-            avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom))
-            min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom))
-            max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom))
-            unit: (instr + $normUnit)
-          MFMA-F64:
-            avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom))
-            min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom))
-            max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom))
-            unit: (instr + $normUnit)
-        gfx942:
-          MFMA-I8:
-            avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom))
-            min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom))
-            max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom))
-            unit: (instr + $normUnit)
-          MFMA-F8:
-            avg: AVG((SQ_INSTS_VALU_MFMA_F8 / $denom))
-            min: MIN((SQ_INSTS_VALU_MFMA_F8 / $denom))
-            max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom))
-            unit: (instr + $normUnit)
-          MFMA-F16:
-            avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom))
-            min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom))
-            max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom))
-            unit: (instr + $normUnit)
-          MFMA-BF16:
-            avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom))
-            min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom))
-            max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom))
-            unit: (instr + $normUnit)
-          MFMA-F32:
-            avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom))
-            min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom))
-            max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom))
-            unit: (instr + $normUnit)
-          MFMA-F64:
-            avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom))
-            min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom))
-            max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom))
-            unit: (instr + $normUnit)
-        gfx950:
-          MFMA-I8:
-            avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom))
-            min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom))
-            max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom))
-            unit: (instr + $normUnit)
-          MFMA-F8:
-            avg: AVG((SQ_INSTS_VALU_MFMA_F8 / $denom))
-            min: MIN((SQ_INSTS_VALU_MFMA_F8 / $denom))
-            max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom))
-            unit: (instr + $normUnit)
-          MFMA-F16:
-            avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom))
-            min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom))
-            max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom))
-            unit: (instr + $normUnit)
-          MFMA-BF16:
-            avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom))
-            min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom))
-            max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom))
-            unit: (instr + $normUnit)
-          MFMA-F32:
-            avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom))
-            min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom))
-            max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom))
-            unit: (instr + $normUnit)
-          MFMA-F64:
-            avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom))
-            min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom))
-            max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom))
-            unit: (instr + $normUnit)
-          MFMA-F6F4:
-            avg: AVG((SQ_INSTS_VALU_MFMA_F6F4 / $denom))
-            min: MIN((SQ_INSTS_VALU_MFMA_F6F4 / $denom))
-            max: MAX((SQ_INSTS_VALU_MFMA_F6F4 / $denom))
-            unit: (instr + $normUnit)
-        gfx908: {}
-  metrics_description:
-    VALU:
-      plain: The total number of vector arithmetic logic unit (VALU) operations issued.
-        These are the workhorses of the compute unit, and are used to execute a wide
-        range of instruction types including floating point operations, non-uniform
-        address calculations, transcendental operations, integer operations, shifts,
-        conditional evaluation, etc.
-      rst: The total number of vector arithmetic logic unit (VALU) operations issued.
-        These are the workhorses of the :doc:`compute unit <compute-unit>`, and are
-        used to execute a wide range of instruction types including floating point
-        operations, non-uniform address calculations, transcendental operations,
-        integer operations, shifts, conditional evaluation, etc.
-      unit: Instructions
-    VMEM:
-      plain: The total number of vector memory operations issued. These include most
-        loads, stores and atomic operations and all accesses to generic, global, private
-        and texture memory.
-      rst: The total number of vector memory operations issued. These include most loads,
-        stores and atomic operations and all accesses to :ref:`generic, global, private
-        and texture <memory-spaces>` memory.
-      unit: Instructions
-    LDS:
-      plain: The total number of LDS (also known as shared memory) operations issued.
-        These include loads, stores, atomics, and HIP's __shfl operations.
-      rst: The total number of LDS (also known as shared memory) operations issued. These
-        include loads, stores, atomics, and HIP's ``__shfl`` operations.
-      unit: Instructions
-    MFMA:
-      plain: The total number of matrix fused multiply-add instructions issued.
-      rst: The total number of matrix fused multiply-add instructions issued.
-      unit: Instructions
-    SALU:
-      plain: The total number of scalar arithmetic logic unit (SALU) operations issued.
-        Typically these are used for address calculations, literal constants, and
-        other operations that are provably uniform across a wavefront. Although scalar
-        memory (SMEM) operations are issued by the SALU, they are counted separately
-        in this section.
-      rst: The total number of scalar arithmetic logic unit (SALU) operations issued.
-        Typically these are used for address calculations, literal constants, and
-        other operations that are provably uniform across a wavefront. Although scalar
-        memory (SMEM) operations are issued by the SALU, they are counted separately
-        in this section.
-      unit: Instructions
-    SMEM:
-      plain: The total number of scalar memory (SMEM) operations issued. These are
-        typically used for loading kernel arguments, base-pointers and loads from
-        HIP's __constant__ memory.
-      rst: The total number of scalar memory (SMEM) operations issued. These are typically
-        used for loading kernel arguments, base-pointers and loads from HIP's ``__constant__``
-        memory.
-      unit: Instructions
-    Branch:
-      plain: The total number of branch operations issued. These typically consist
-        of jump or branch operations and are used to implement control flow.
-      rst: The total number of branch operations issued. These typically consist of jump
-        or branch operations and are used to implement control flow.
-      unit: Instructions
-    INT32:
-      plain: The total number of instructions operating on 32-bit integer operands
-        issued to the VALU per normalization unit.
-      rst: The total number of instructions operating on 32-bit integer operands issued
-        to the VALU per :ref:`normalization unit <normalization-units>`.
-      unit: Instructions per normalization unit
-    INT64:
-      plain: The total number of instructions operating on 64-bit integer operands
-        issued to the VALU per normalization unit.
-      rst: The total number of instructions operating on 64-bit integer operands issued
-        to the VALU per :ref:`normalization unit <normalization-units>`.
-      unit: Instructions per normalization unit
-    F16-ADD:
-      plain: The total number of addition instructions operating on 16-bit floating-point
-        operands issued to the VALU per normalization unit.
-      rst: The total number of addition instructions operating on 16-bit floating-point
-        operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
-      unit: Instructions per normalization unit
-    F16-MUL:
-      plain: The total number of multiplication instructions operating on 16-bit floating-point
-        operands issued to the VALU per normalization unit.
-      rst: The total number of multiplication instructions operating on 16-bit floating-point
-        operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
-      unit: Instructions per normalization unit
-    F16-FMA:
-      plain: The total number of fused multiply-add instructions operating on 16-bit
-        floating-point operands issued to the VALU per normalization unit.
-      rst: The total number of fused multiply-add instructions operating on 16-bit floating-point
-        operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
-      unit: Instructions per normalization unit
-    F16-Trans:
-      plain: The total number of transcendental instructions (e.g., sqrt) operating
-        on 16-bit floating-point operands issued to the VALU per normalization unit.
-      rst: The total number of transcendental instructions (e.g., `sqrt`) operating on
-        16-bit floating-point operands issued to the VALU per :ref:`normalization
-        unit <normalization-units>`.
-      unit: Instructions per normalization unit
-    F32-ADD:
-      plain: The total number of addition instructions operating on 32-bit floating-point
-        operands issued to the VALU per normalization unit.
-      rst: The total number of addition instructions operating on 32-bit floating-point
-        operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
-      unit: Instructions per normalization unit
-    F32-MUL:
-      plain: The total number of multiplication instructions operating on 32-bit floating-point
-        operands issued to the VALU per normalization unit.
-      rst: The total number of multiplication instructions operating on 32-bit floating-point
-        operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
-      unit: Instructions per normalization unit
-    F32-FMA:
-      plain: The total number of fused multiply-add instructions operating on 32-bit
-        floating-point operands issued to the VALU per normalization unit.
-      rst: The total number of fused multiply-add instructions operating on 32-bit floating-point
-        operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
-      unit: Instructions per normalization unit
-    F32-Trans:
-      plain: The total number of transcendental instructions (such as sqrt) operating
-        on 32-bit floating-point operands issued to the VALU per normalization unit.
-      rst: The total number of transcendental instructions (such as ``sqrt``) operating
-        on 32-bit floating-point operands issued to the VALU per :ref:`normalization
-        unit <normalization-units>`.
-      unit: Instructions per normalization unit
-    F64-ADD:
-      plain: The total number of addition instructions operating on 64-bit floating-point
-        operands issued to the VALU per normalization unit.
-      rst: The total number of addition instructions operating on 64-bit floating-point
-        operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
-      unit: Instructions per normalization unit
-    F64-MUL:
-      plain: The total number of multiplication instructions operating on 64-bit floating-point
-        operands issued to the VALU per normalization unit.
-      rst: The total number of multiplication instructions operating on 64-bit floating-point
-        operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
-      unit: Instructions per normalization unit
-    F64-FMA:
-      plain: The total number of fused multiply-add instructions operating on 64-bit
-        floating-point operands issued to the VALU per normalization unit.
-      rst: The total number of fused multiply-add instructions operating on 64-bit floating-point
-        operands issued to the VALU per :ref:`normalization unit <normalization-units>`.
-      unit: Instructions per normalization unit
-    F64-Trans:
-      plain: The total number of transcendental instructions (such as sqrt) operating
-        on 64-bit floating-point operands issued to the VALU per normalization unit.
-      rst: The total number of transcendental instructions (such as `sqrt`) operating
-        on 64-bit floating-point operands issued to the VALU per :ref:`normalization
-        unit <normalization-units>`.
-      unit: Instructions per normalization unit
-    Conversion:
-      plain: |-
-        The total number of type conversion instructions (such as converting
-        data to or from F32\u2194F64) issued to the VALU per normalization unit.
-      rst: |-
-        The total number of type conversion instructions (such as converting data
-        to or from F32\u2194F64) issued to the VALU per :ref:`normalization unit
-        <normalization-units>`.
-      unit: Instructions per normalization unit
-    Global/Generic Instr:
-      plain: The total number of global & generic memory instructions executed on
-        all compute units on the accelerator, per normalization unit.
-      rst: The total number of global & generic memory instructions executed on all :doc:`compute
-        units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
-      unit: Instructions per normalization unit
-    Global/Generic Read:
-      plain: The total number of global & generic memory read instructions executed
-        on all compute units on the accelerator, per normalization unit.
-      rst: The total number of global & generic memory read instructions executed
-        on all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
-        unit <normalization-units>`.
-      unit: Instructions per normalization unit
-    Global/Generic Write:
-      plain: The total number of global & generic memory write instructions executed
-        on all compute units on the accelerator, per normalization unit.
-      rst: The total number of global & generic memory write instructions executed on
-        all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
-        unit <normalization-units>`.
-      unit: Instructions per normalization unit
-    Global/Generic Atomic:
-      plain: The total number of global & generic memory atomic (with and without
-        return) instructions executed on all compute units on the accelerator, per
-        normalization unit.
-      rst: The total number of global & generic memory atomic (with and without return)
-        instructions executed on all :doc:`compute units <compute-unit>` on the accelerator,
-        per :ref:`normalization unit <normalization-units>`.
-      unit: Instructions per normalization unit
-    Spill/Stack Instr:
-      plain: The total number of spill/stack memory instructions executed on all compute
-        units on the accelerator, per normalization unit.
-      rst: The total number of spill/stack memory instructions executed on all :doc:`compute
-        units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
-      unit: Instructions per normalization unit
-    Spill/Stack Read:
-      plain: The total number of spill/stack memory read instructions executed on
-        all compute units on the accelerator, per normalization unit.
-      rst: The total number of spill/stack memory read instructions executed on all :doc:`compute
-        units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
-      unit: Instructions per normalization unit
-    Spill/Stack Write:
-      plain: The total number of spill/stack memory write instructions executed on
-        all compute units on the accelerator, per normalization unit.
-      rst: The total number of spill/stack memory write instructions executed on all :doc:`compute
-        units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
-      unit: Instructions per normalization unit
-    Spill/Stack Atomic:
-      plain: The total number of spill/stack memory atomic (with and without return)
-        instructions executed on all compute units on the accelerator, per normalization
-        unit. Typically unused as these memory operations are typically used to implement
-        thread-local storage.
-      rst: The total number of spill/stack memory atomic (with and without return) instructions
-        executed on all :doc:`compute units <compute-unit>` on the accelerator, per
-        :ref:`normalization unit <normalization-units>`. Typically unused as these
-        memory operations are typically used to implement thread-local storage.
-      unit: Instructions per normalization unit
-    MFMA-I8:
-      plain: The total number of 8-bit integer MFMA instructions issued per normalization
-        unit.
-      rst: The total number of 8-bit integer :ref:`MFMA <desc-mfma>` instructions issued
-        per :ref:`normalization unit <normalization-units>`.
-      unit: Instructions per normalization unit
-    MFMA-F8:
-      plain: The total number of 8-bit floating point MFMA instructions issued per
-        normalization unit. This is supported in AMD Instinct MI300 series and later
-        only.
-      rst: The total number of 8-bit floating point :ref:`MFMA <desc-mfma>` instructions issued
-        per :ref:`normalization unit <normalization-units>`. This is supported in
-        AMD Instinct MI300 series and later only.
-      unit: Instructions per normalization unit
-    MFMA-F16:
-      plain: The total number of 16-bit floating point MFMA instructions issued per
-        normalization unit.
-      rst: The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` instructions
-        issued per :ref:`normalization unit <normalization-units>`.
-      unit: Instructions per normalization unit
-    MFMA-BF16:
-      plain: The total number of 16-bit brain floating point MFMA instructions issued
-        per normalization unit.
-      rst: The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>` instructions
-        issued per :ref:`normalization unit <normalization-units>`.
-      unit: Instructions per normalization unit
-    MFMA-F32:
-      plain: The total number of 32-bit floating-point MFMA instructions issued per
-        normalization unit.
-      rst: The total number of 32-bit floating-point :ref:`MFMA <desc-mfma>` instructions
-        issued per :ref:`normalization unit <normalization-units>`.
-      unit: Instructions per normalization unit
-    MFMA-F64:
-      plain: The total number of 64-bit floating-point MFMA instructions issued per
-        normalization unit.
-      rst: The total number of 64-bit floating-point :ref:`MFMA <desc-mfma>` instructions
-        issued per :ref:`normalization unit <normalization-units>`.
-      unit: Instructions per normalization unit
-- id: 1100
-  title: Compute Units - Compute Pipeline
-  data source:
-  - metric_table:
-      id: 1101
-      title: Compute Speed-of-Light
-      header:
-        metric: Metric
-        value: Avg
-        unit: Unit
-        peak: Peak
-        pop: Pct of Peak
-      metric:
-        gfx90a:
-          VALU FLOPs:
-            value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
-              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
-              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP
-            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
-            pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
-              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
-              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))))
-              / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
-          VALU IOPs:
-            value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
-              - Start_Timestamp)))
-            unit: GIOP
-            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
-            pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) /
-              (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu)
-              * 64) * 2) / 1000))
-          MFMA FLOPs (BF16):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP
-            peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000))
-          MFMA FLOPs (F16):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP
-            peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000))
-          MFMA FLOPs (F32):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP
-            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
-          MFMA FLOPs (F64):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP
-            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
-          MFMA IOPs (INT8):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GIOP
-            peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000))
-        gfx941:
-          VALU FLOPs:
-            value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
-              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
-              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP
-            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
-            pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
-              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
-              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))))
-              / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
-          VALU IOPs:
-            value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
-              - Start_Timestamp)))
-            unit: GIOP
-            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
-            pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) /
-              (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu)
-              * 64) * 2) / 1000))
-          MFMA FLOPs (F8):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP
-            peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
-          MFMA FLOPs (BF16):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP
-            peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
-          MFMA FLOPs (F16):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP
-            peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
-          MFMA FLOPs (F32):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP
-            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
-          MFMA FLOPs (F64):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP
-            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
-          MFMA IOPs (INT8):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GIOP
-            peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
-        gfx940:
-          VALU FLOPs:
-            value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
-              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
-              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP
-            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
-            pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
-              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
-              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))))
-              / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
-          VALU IOPs:
-            value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
-              - Start_Timestamp)))
-            unit: GIOP
-            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
-            pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) /
-              (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu)
-              * 64) * 2) / 1000))
-          MFMA FLOPs (F8):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP
-            peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
-          MFMA FLOPs (BF16):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP
-            peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
-          MFMA FLOPs (F16):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP
-            peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
-          MFMA FLOPs (F32):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP
-            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
-          MFMA FLOPs (F64):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP
-            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
-          MFMA IOPs (INT8):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GIOP
-            peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
-        gfx942:
-          VALU FLOPs:
-            value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
-              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
-              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP
-            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
-            pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
-              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
-              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))))
-              / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
-          VALU IOPs:
-            value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
-              - Start_Timestamp)))
-            unit: GIOP
-            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
-            pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) /
-              (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu)
-              * 64) * 2) / 1000))
-          MFMA FLOPs (F8):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP
-            peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
-          MFMA FLOPs (BF16):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP
-            peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
-          MFMA FLOPs (F16):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP
-            peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
-          MFMA FLOPs (F32):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP
-            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
-          MFMA FLOPs (F64):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP
-            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
-          MFMA IOPs (INT8):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GIOP
-            peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
-        gfx950:
-          VALU FLOPs:
-            value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
-              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
-              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP
-            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
-            pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
-              + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
-              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))))
-              / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
-          VALU IOPs:
-            value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
-              - Start_Timestamp)))
-            unit: GIOP
-            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
-            pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) /
-              (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu)
-              * 64) * 2) / 1000))
-          MFMA FLOPs (F8):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP
-            peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
-          MFMA FLOPs (BF16):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP
-            peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
-          MFMA FLOPs (F16):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP
-            peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
-          MFMA FLOPs (F32):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP
-            peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
-          MFMA FLOPs (F64):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP
-            peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000))
-          MFMA FLOPs (F6F4):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GFLOP
-            peak: ((($max_sclk * $cu_per_gpu) * 16834) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 16834) / 1000))
-          MFMA IOPs (INT8):
-            value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
-            unit: GIOP
-            peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
-            pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
-        gfx908: {}
-  - metric_table:
-      id: 1102
-      title: Pipeline Statistics
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      metric:
-        gfx90a:
-          IPC:
-            avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
-            min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES))
-            max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES))
-            unit: Instr/cycle
-          IPC (Issued):
-            avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
-              / SQ_ACTIVE_INST_ANY))
-            min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
-              / SQ_ACTIVE_INST_ANY))
-            max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
-              / SQ_ACTIVE_INST_ANY))
-            unit: Instr/cycle
-          SALU Utilization:
-            avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            unit: pct
-          VALU Utilization:
-            avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            unit: pct
-          VMEM Utilization:
-            avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
-              / $cu_per_gpu))
-            min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
-              / $cu_per_gpu))
-            max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
-              / $cu_per_gpu))
-            unit: pct
-          Branch Utilization:
-            avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            unit: pct
-          VALU Active Threads:
-            avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
-              != 0) else None))
-            min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
-              != 0) else None))
-            max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
-              != 0) else None))
-            unit: Threads
-          MFMA Utilization:
-            avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
-            min: MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
-            max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
-            unit: pct
-          MFMA Instruction Cycles:
-            avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
-              != 0) else None))
-            min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
-              != 0) else None))
-            max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
-              != 0) else None))
-            unit: cycles/instr
-          VMEM Latency:
-            avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
-              else None))
-            min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
-              else None))
-            max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
-              else None))
-            unit: Cycles
-            coll_level: SQ_INST_LEVEL_VMEM
-          SMEM Latency:
-            avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
-              else None))
-            min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
-              else None))
-            max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
-              else None))
-            unit: Cycles
-            coll_level: SQ_INST_LEVEL_SMEM
-        gfx941:
-          IPC:
-            avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
-            min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES))
-            max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES))
-            unit: Instr/cycle
-          IPC (Issued):
-            avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
-              / SQ_ACTIVE_INST_ANY))
-            min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
-              / SQ_ACTIVE_INST_ANY))
-            max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
-              / SQ_ACTIVE_INST_ANY))
-            unit: Instr/cycle
-          SALU Utilization:
-            avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            unit: pct
-          VALU Utilization:
-            avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            unit: pct
-          VMEM Utilization:
-            avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
-              / $cu_per_gpu))
-            min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
-              / $cu_per_gpu))
-            max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
-              / $cu_per_gpu))
-            unit: pct
-          Branch Utilization:
-            avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            unit: pct
-          VALU Active Threads:
-            avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
-              != 0) else None))
-            min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
-              != 0) else None))
-            max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
-              != 0) else None))
-            unit: Threads
-          MFMA Utilization:
-            avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
-            min: MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
-            max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
-            unit: pct
-          MFMA Instruction Cycles:
-            avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
-              != 0) else None))
-            min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
-              != 0) else None))
-            max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
-              != 0) else None))
-            unit: cycles/instr
-          VMEM Latency:
-            avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
-              else None))
-            min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
-              else None))
-            max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
-              else None))
-            unit: Cycles
-            coll_level: SQ_INST_LEVEL_VMEM
-          SMEM Latency:
-            avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
-              else None))
-            min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
-              else None))
-            max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
-              else None))
-            unit: Cycles
-            coll_level: SQ_INST_LEVEL_SMEM
-        gfx940:
-          IPC:
-            avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
-            min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES))
-            max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES))
-            unit: Instr/cycle
-          IPC (Issued):
-            avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
-              / SQ_ACTIVE_INST_ANY))
-            min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
-              / SQ_ACTIVE_INST_ANY))
-            max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
-              / SQ_ACTIVE_INST_ANY))
-            unit: Instr/cycle
-          SALU Utilization:
-            avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            unit: pct
-          VALU Utilization:
-            avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            unit: pct
-          VMEM Utilization:
-            avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
-              / $cu_per_gpu))
-            min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
-              / $cu_per_gpu))
-            max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
-              / $cu_per_gpu))
-            unit: pct
-          Branch Utilization:
-            avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            unit: pct
-          VALU Active Threads:
-            avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
-              != 0) else None))
-            min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
-              != 0) else None))
-            max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
-              != 0) else None))
-            unit: Threads
-          MFMA Utilization:
-            avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
-            min: MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
-            max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
-            unit: pct
-          MFMA Instruction Cycles:
-            avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
-              != 0) else None))
-            min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
-              != 0) else None))
-            max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
-              != 0) else None))
-            unit: cycles/instr
-          VMEM Latency:
-            avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
-              else None))
-            min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
-              else None))
-            max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
-              else None))
-            unit: Cycles
-            coll_level: SQ_INST_LEVEL_VMEM
-          SMEM Latency:
-            avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
-              else None))
-            min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
-              else None))
-            max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
-              else None))
-            unit: Cycles
-            coll_level: SQ_INST_LEVEL_SMEM
-        gfx942:
-          IPC:
-            avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
-            min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES))
-            max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES))
-            unit: Instr/cycle
-          IPC (Issued):
-            avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
-              / SQ_ACTIVE_INST_ANY))
-            min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
-              / SQ_ACTIVE_INST_ANY))
-            max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
-              / SQ_ACTIVE_INST_ANY))
-            unit: Instr/cycle
-          SALU Utilization:
-            avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            unit: pct
-          VALU Utilization:
-            avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            unit: pct
-          VMEM Utilization:
-            avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
-              / $cu_per_gpu))
-            min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
-              / $cu_per_gpu))
-            max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
-              / $cu_per_gpu))
-            unit: pct
-          Branch Utilization:
-            avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            unit: pct
-          VALU Active Threads:
-            avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
-              != 0) else None))
-            min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
-              != 0) else None))
-            max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
-              != 0) else None))
-            unit: Threads
-          MFMA Utilization:
-            avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
-            min: MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
-            max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
-            unit: pct
-          MFMA Instruction Cycles:
-            avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
-              != 0) else None))
-            min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
-              != 0) else None))
-            max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
-              != 0) else None))
-            unit: cycles/instr
-          VMEM Latency:
-            avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
-              else None))
-            min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
-              else None))
-            max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
-              else None))
-            unit: Cycles
-            coll_level: SQ_INST_LEVEL_VMEM
-          SMEM Latency:
-            avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
-              else None))
-            min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
-              else None))
-            max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
-              else None))
-            unit: Cycles
-            coll_level: SQ_INST_LEVEL_SMEM
-        gfx950:
-          IPC:
-            avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
-            min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES))
-            max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES))
-            unit: Instr/cycle
-          IPC (Issued):
-            avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
-              / SQ_ACTIVE_INST_ANY))
-            min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
-              / SQ_ACTIVE_INST_ANY))
-            max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
-              / SQ_ACTIVE_INST_ANY))
-            unit: Instr/cycle
-          SALU Utilization:
-            avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            unit: pct
-          VALU Utilization:
-            avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            unit: pct
-          VALU Co-Issue Efficiency:
-            avg: AVG((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2))
-            min: MIN((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2))
-            max: MAX((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2))
-            unit: pct
-          VMEM Utilization:
-            avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
-              / $cu_per_gpu))
-            min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
-              / $cu_per_gpu))
-            max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
-              / $cu_per_gpu))
-            unit: pct
-          Branch Utilization:
-            avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            unit: pct
-          VALU Active Threads:
-            avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
-              != 0) else None))
-            min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
-              != 0) else None))
-            max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
-              != 0) else None))
-            unit: Threads
-          MFMA Utilization:
-            avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
-            min: MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
-            max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD)))
-            unit: pct
-          MFMA Instruction Cycles:
-            avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
-              != 0) else None))
-            min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
-              != 0) else None))
-            max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA
-              != 0) else None))
-            unit: cycles/instr
-          VMEM Latency:
-            avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
-              else None))
-            min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
-              else None))
-            max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0)
-              else None))
-            unit: Cycles
-            coll_level: SQ_INST_LEVEL_VMEM
-          SMEM Latency:
-            avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
-              else None))
-            min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
-              else None))
-            max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0)
-              else None))
-            unit: Cycles
-            coll_level: SQ_INST_LEVEL_SMEM
-        gfx908:
-          IPC:
-            avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
-            min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES))
-            max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES))
-            unit: Instr/cycle
-          IPC (Issued):
-            avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
-              / SQ_ACTIVE_INST_ANY))
-            min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
-              / SQ_ACTIVE_INST_ANY))
-            max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM))
-              + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS)
-              / SQ_ACTIVE_INST_ANY))
-            unit: Instr/cycle
-          SALU Utilization:
-            avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            unit: pct
-          VALU Utilization:
-            avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
-            unit: pct
-          VALU Active Threads:
-            avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
-              != 0) else None))
-            min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
-              != 0) else None))
-            max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU
-              != 0) else None))
-            unit: Threads
-  - metric_table:
-      id: 1103
-      title: Arithmetic Operations
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      metric:
-        gfx90a:
-          FLOPs (Total):
-            avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
-              + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
-              SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16)))
-              + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
-              + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
-              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
-              / $denom))
-            min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
-              + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
-              SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16)))
-              + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
-              + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
-              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
-              / $denom))
-            max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
-              + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
-              SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16)))
-              + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
-              + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
-              + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
-              / $denom))
-            unit: (OPs + $normUnit)
-          IOPs (Total):
-            avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
-              * 512)) / $denom)
-            min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
-              * 512)) / $denom)
-            max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
-              * 512)) / $denom)
-            unit: (OPs + $normUnit)
-          F16 OPs:
-            avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
-              + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
-              (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
-            min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
-              + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
-              (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
-            max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
-              + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
-              (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
-            unit: (OPs + $normUnit)
-          BF16 OPs:
-            avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
-            min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
-            max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
-            unit: (OPs + $normUnit)
-          F32 OPs:
-            avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
-              + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
-              / $denom))
-            min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
-              + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
-              / $denom))
-            max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
-              + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
-              / $denom))
-            unit: (OPs + $normUnit)
-          F64 OPs:
-            avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
-              / $denom))
-            min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
-              / $denom))
-            max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
-              / $denom))
-            unit: (OPs + $normUnit)
-          INT8 OPs:
-            avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
-            min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
-            max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
-            unit: (OPs + $normUnit)
-        gfx941:
-          FLOPs (Total):
-            avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
-              + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
-              SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
-              (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
-              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
-              + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
-              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom))
-            min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
-              + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
-              SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
-              (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
-              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
-              + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
-              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom))
-            max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
-              + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
-              SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
-              (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
-              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
-              + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
-              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom))
-            unit: (OPs + $normUnit)
-          IOPs (Total):
-            avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
-              * 512)) / $denom)
-            min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
-              * 512)) / $denom)
-            max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
-              * 512)) / $denom)
-            unit: (OPs + $normUnit)
-          F8 OPs:
-            avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
-            min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
-            max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
-            unit: (OPs + $normUnit)
-          F16 OPs:
-            avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
-              + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
-              (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
-            min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
-              + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
-              (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
-            max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
-              + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
-              (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
-            unit: (OPs + $normUnit)
-          BF16 OPs:
-            avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
-            min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
-            max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
-            unit: (OPs + $normUnit)
-          F32 OPs:
-            avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
-              + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
-              / $denom))
-            min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
-              + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
-              / $denom))
-            max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
-              + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
-              / $denom))
-            unit: (OPs + $normUnit)
-          F64 OPs:
-            avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
-              / $denom))
-            min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
-              / $denom))
-            max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
-              / $denom))
-            unit: (OPs + $normUnit)
-          INT8 OPs:
-            avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
-            min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
-            max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
-            unit: (OPs + $normUnit)
-        gfx940:
-          FLOPs (Total):
-            avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
-              + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
-              SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
-              (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
-              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
-              + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
-              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom))
-            min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
-              + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
-              SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
-              (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
-              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
-              + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
-              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom))
-            max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
-              + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
-              SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
-              (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
-              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
-              + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
-              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom))
-            unit: (OPs + $normUnit)
-          IOPs (Total):
-            avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
-              * 512)) / $denom)
-            min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
-              * 512)) / $denom)
-            max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
-              * 512)) / $denom)
-            unit: (OPs + $normUnit)
-          F8 OPs:
-            avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
-            min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
-            max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
-            unit: (OPs + $normUnit)
-          F16 OPs:
-            avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
-              + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
-              (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
-            min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
-              + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
-              (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
-            max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
-              + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
-              (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
-            unit: (OPs + $normUnit)
-          BF16 OPs:
-            avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
-            min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
-            max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
-            unit: (OPs + $normUnit)
-          F32 OPs:
-            avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
-              + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
-              / $denom))
-            min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
-              + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
-              / $denom))
-            max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
-              + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
-              / $denom))
-            unit: (OPs + $normUnit)
-          F64 OPs:
-            avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
-              / $denom))
-            min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
-              / $denom))
-            max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
-              / $denom))
-            unit: (OPs + $normUnit)
-          INT8 OPs:
-            avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
-            min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
-            max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
-            unit: (OPs + $normUnit)
-        gfx942:
-          FLOPs (Total):
-            avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
-              + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
-              SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
-              (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
-              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
-              + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
-              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom))
-            min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
-              + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
-              SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
-              (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
-              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
-              + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
-              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom))
-            max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
-              + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
-              SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
-              (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
-              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
-              + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
-              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom))
-            unit: (OPs + $normUnit)
-          IOPs (Total):
-            avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
-              * 512)) / $denom)
-            min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
-              * 512)) / $denom)
-            max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
-              * 512)) / $denom)
-            unit: (OPs + $normUnit)
-          F8 OPs:
-            avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
-            min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
-            max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
-            unit: (OPs + $normUnit)
-          F16 OPs:
-            avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
-              + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
-              (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
-            min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
-              + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
-              (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
-            max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
-              + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
-              (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
-            unit: (OPs + $normUnit)
-          BF16 OPs:
-            avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
-            min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
-            max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
-            unit: (OPs + $normUnit)
-          F32 OPs:
-            avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
-              + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
-              / $denom))
-            min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
-              + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
-              / $denom))
-            max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
-              + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
-              / $denom))
-            unit: (OPs + $normUnit)
-          F64 OPs:
-            avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
-              / $denom))
-            min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
-              / $denom))
-            max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
-              / $denom))
-            unit: (OPs + $normUnit)
-          INT8 OPs:
-            avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
-            min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
-            max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
-            unit: (OPs + $normUnit)
-        gfx950:
-          FLOPs (Total):
-            avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
-              + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
-              SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
-              (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
-              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
-              + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
-              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4))
-              / $denom))
-            min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
-              + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
-              SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
-              (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
-              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
-              + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
-              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4))
-              / $denom))
-            max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
-              + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 *
-              SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) +
-              (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
-              + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32
-              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64
-              + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64
-              * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4))
-              / $denom))
-            unit: (OPs + $normUnit)
-          IOPs (Total):
-            avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
-              * 512)) / $denom)
-            min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
-              * 512)) / $denom)
-            max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8
-              * 512)) / $denom)
-            unit: (OPs + $normUnit)
-          F8 OPs:
-            avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
-            min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
-            max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom))
-            unit: (OPs + $normUnit)
-          F16 OPs:
-            avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
-              + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
-              (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
-            min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
-              + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
-              (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
-            max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16))
-              + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) +
-              (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom))
-            unit: (OPs + $normUnit)
-          BF16 OPs:
-            avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
-            min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
-            max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom))
-            unit: (OPs + $normUnit)
-          F32 OPs:
-            avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
-              + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
-              / $denom))
-            min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
-              + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
-              / $denom))
-            max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32)
-              + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32))
-              / $denom))
-            unit: (OPs + $normUnit)
-          F64 OPs:
-            avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
-              / $denom))
-            min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
-              / $denom))
-            max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
-              + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64))
-              / $denom))
-            unit: (OPs + $normUnit)
-          F6F4 OPs:
-            avg: AVG((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom)
-            min: MIN((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom)
-            max: MAX((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom)
-            unit: (OPs + $normUnit)
-          INT8 OPs:
-            avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
-            min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
-            max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
-            unit: (OPs + $normUnit)
-        gfx908: {}
-  metrics_description:
-    VALU FLOPs:
-      plain: |-
-        The total floating-point operations executed per second on the VALU.
-        This is also presented as a percent of the peak theoretical FLOPs achievable
-        on the specific accelerator. Note: this does not include any floating-point
-        operations from MFMA instructions.
-      rst: |-
-        The total floating-point operations executed per second on the :ref:`VALU
-        <desc-valu>`. This is also presented as a percent of the peak theoretical
-        FLOPs achievable on the specific accelerator. Note: this does not include
-        any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.
-      unit: GFLOPs
-    VALU IOPs:
-      plain: |-
-        The total integer operations executed per second on the VALU. This is
-        also presented as a percent of the peak theoretical IOPs achievable on the
-        specific accelerator. Note: this does not include any integer operations from
-        MFMA instructions.
-      rst: |-
-        The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
-        This is also presented as a percent of the peak theoretical IOPs achievable
-        on the specific accelerator. Note: this does not include any integer operations
-        from :ref:`MFMA <desc-mfma>` instructions.
-      unit: GIOPs
-    MFMA FLOPs (BF16):
-      plain: |-
-        The total number of 16-bit brain floating point MFMA operations executed
-        per second. Note: this does not include any 16-bit brain floating point operations
-        from VALU instructions. This is also presented as a percent of the peak theoretical
-        BF16 MFMA operations achievable on the specific accelerator.
-      rst: |-
-        The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>` operations
-        executed per second. Note: this does not include any 16-bit brain floating
-        point operations from :ref:`VALU <desc-valu>` instructions. This is also
-        presented as a percent of the peak theoretical BF16 MFMA operations achievable
-        on the specific accelerator.
-      unit: GFLOPs
-    MFMA FLOPs (F16):
-      plain: |-
-        The total number of 16-bit floating point MFMA operations executed per
-        second. Note: this does not include any 16-bit floating point operations from
-        VALU instructions. This is also presented as a percent of the peak theoretical
-        F16 MFMA operations achievable on the specific accelerator.
-      rst: |-
-        The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
-        executed per second. Note: this does not include any 16-bit floating point
-        operations from :ref:`VALU <desc-valu>` instructions. This is also presented
-        as a percent of the peak theoretical F16 MFMA operations achievable on the
-        specific accelerator.
-      unit: GFLOPs
-    MFMA FLOPs (F32):
-      plain: |-
-        The total number of 32-bit floating point MFMA operations executed per
-        second. Note: this does not include any 32-bit floating point operations from
-        VALU instructions. This is also presented as a percent of the peak theoretical
-        F32 MFMA operations achievable on the specific accelerator.
-      rst: |-
-        The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
-        executed per second. Note: this does not include any 32-bit floating point
-        operations from :ref:`VALU <desc-valu>` instructions. This is also presented
-        as a percent of the peak theoretical F32 MFMA operations achievable on the
-        specific accelerator.
-      unit: GFLOPs
-    MFMA FLOPs (F64):
-      plain: |-
-        The total number of 64-bit floating point MFMA operations executed per
-        second. Note: this does not include any 64-bit floating point operations from
-        VALU instructions. This is also presented as a percent of the peak theoretical
-        F64 MFMA operations achievable on the specific accelerator.
-      rst: |-
-        The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
-        executed per second. Note: this does not include any 64-bit floating point
-        operations from :ref:`VALU <desc-valu>` instructions. This is also presented
-        as a percent of the peak theoretical F64 MFMA operations achievable on the
-        specific accelerator. The total number of 64-bit floating point :ref:`MFMA
-        <desc-mfma>` operations executed per second. Note: this does not include
-        any 64-bit floating point operations from :ref:`VALU <desc-valu>` instructions.
-        This is also presented as a percent of the peak theoretical F64 MFMA operations
-        achievable on the specific accelerator.
-      unit: GFLOPs
-    MFMA IOPs (INT8):
-      plain: |-
-        The total number of 8-bit integer MFMA operations executed per second.
-        Note: this does not include any 8-bit integer operations from VALU instructions.
-        This is also presented as a percent of the peak theoretical INT8 MFMA operations
-        achievable on the specific accelerator.
-      rst: |-
-        The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
-        per second. Note: this does not include any 8-bit integer operations from
-        :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
-        of the peak theoretical INT8 MFMA operations achievable on the specific accelerator.
-      unit: GFLOPs
-    IPC:
-      plain: The ratio of the total number of instructions executed on the CU over
-        the total active CU cycles.
-      rst: The ratio of the total number of instructions executed on the :doc:`CU
-        <compute-unit>` over the :ref:`total active CU cycles <total-active-cu-cycles>`.
-      unit: Instructions per cycle
-    IPC (Issued):
-      plain: The ratio of the total number of (non-internal) instructions issued over
-        the number of cycles where the scheduler was actively working on issuing instructions.
-      rst: The ratio of the total number of (non-:ref:`internal <ipc-internal-instructions>`)
-        instructions issued over the number of cycles where the :ref:`scheduler <desc-scheduler>`
-        was actively working on issuing instructions. Refer to the :ref:`Issued
-        IPC <issued-ipc>` example for further detail.
-      unit: Instructions per cycle
-    SALU Utilization:
-      plain: Indicates what percent of the kernel's duration the SALU was busy executing
-        instructions. Computed as the ratio of the total number of cycles spent by
-        the scheduler issuing SALU / SMEM instructions over the total CU cycles.
-      rst: Indicates what percent of the kernel's duration the :ref:`SALU <desc-salu>`
-        was busy executing instructions. Computed as the ratio of the total number
-        of cycles spent by the :ref:`scheduler <desc-scheduler>` issuing SALU / :ref:`SMEM
-        <desc-smem>` instructions over the :ref:`total CU cycles <total-cu-cycles>`.
-      unit: Percent
-    VALU Utilization:
-      plain: Indicates what percent of the kernel's duration the VALU was busy executing
-        instructions. Does not include VMEM operations. Computed as the ratio of the
-        total number of cycles spent by the scheduler issuing VALU instructions over
-        the total CU cycles.
-      rst: Indicates what percent of the kernel's duration the :ref:`VALU <desc-valu>`
-        was busy executing instructions. Does not include :ref:`VMEM <desc-vmem>`
-        operations. Computed as the ratio of the total number of cycles spent by
-        the :ref:`scheduler <desc-scheduler>` issuing VALU instructions over the
-        :ref:`total CU cycles <total-cu-cycles>`.
-      unit: Percent
-    VMEM Utilization:
-      plain: Indicates what percent of the kernel's duration the VMEM unit was busy
-        executing instructions, including both global/generic and spill/scratch operations
-        (see the VMEM instruction count metrics for more detail). Does not include
-        VALU operations. Computed as the ratio of the total number of cycles spent
-        by the scheduler issuing VMEM instructions over the total CU cycles.
-      rst: Indicates what percent of the kernel's duration the :ref:`VMEM <desc-vmem>`
-        unit was busy executing instructions, including both global/generic and spill/scratch
-        operations (see the :ref:`VMEM instruction count metrics <ta-instruction-counts>`
-        for more detail). Does not include :ref:`VALU <desc-valu>` operations. Computed as
-        the ratio of the total number of cycles spent by the :ref:`scheduler <desc-scheduler>`
-        issuing VMEM instructions over the :ref:`total CU cycles <total-cu-cycles>`.
-      unit: Percent
-    Branch Utilization:
-      plain: Indicates what percent of the kernel's duration the branch unit was busy
-        executing instructions. Computed as the ratio of the total number of cycles
-        spent by the scheduler issuing branch instructions over the total CU cycles.
-      rst: Indicates what percent of the kernel's duration the :ref:`branch <desc-branch>`
-        unit was busy executing instructions. Computed as the ratio of the total
-        number of cycles spent by the :ref:`scheduler <desc-scheduler>` issuing branch
-        instructions over the :ref:`total CU cycles <total-cu-cycles>`.
-      unit: Percent
-    VALU Active Threads:
-      plain: Indicates the average level of divergence within a wavefront over the
-        lifetime of the kernel. The number of work-items that were active in a wavefront
-        during execution of each VALU instruction, time-averaged over all VALU instructions
-        run on all wavefronts in the kernel
-      rst: Indicates the average level of :ref:`divergence <desc-divergence>` within a
-        wavefront over the lifetime of the kernel. The number of work-items that
-        were active in a wavefront during execution of each :ref:`VALU <desc-valu>`
-        instruction, time-averaged over all VALU instructions run on all wavefronts
-        in the kernel.
-      unit: Work-items
-    MFMA Utilization:
-      plain: Indicates what percent of the kernel's duration the MFMA unit was busy
-        executing instructions. Computed as the ratio of the total number of cycles
-        spent by the MFMA was busy over the total CU cycles.
-      rst: Indicates what percent of the kernel's duration the :ref:`MFMA <desc-mfma>`
-        unit was busy executing instructions. Computed as the ratio of the total
-        number of cycles spent by the :ref:`MFMA <desc-salu>` was busy over the :ref:`total
-        CU cycles <total-cu-cycles>`.
-      unit: Percent
-    MFMA Instruction Cycles:
-      plain: The average duration of MFMA instructions in this kernel in cycles. Computed
-        as the ratio of the total number of cycles the MFMA unit was busy over the
-        total number of MFMA instructions.
-      rst: The average duration of :ref:`MFMA <desc-mfma>` instructions in this kernel
-        in cycles. Computed as the ratio of the total number of cycles the MFMA unit
-        was busy over the total number of MFMA instructions. Compare to, for example,
-        the `AMD Matrix Instruction Calculator <https://github.com/RadeonOpenCompute/amd_matrix_instruction_calculator>`_.
-      unit: Cycles per instruction
-    VMEM Latency:
-      plain: The average number of round-trip cycles (that is, from issue to data
-        return / acknowledgment) required for a VMEM instruction to complete.
-      rst: The average number of round-trip cycles (that is, from issue to data return
-        / acknowledgment) required for a VMEM instruction to complete.
-      unit: Cycles
-    SMEM Latency:
-      plain: The average number of round-trip cycles (that is, from issue to data
-        return / acknowledgment) required for a SMEM instruction to complete.
-      rst: The average number of round-trip cycles (that is, from issue to data return
-        / acknowledgment) required for a SMEM instruction to complete.
-      unit: Cycles
-    FLOPs (Total):
-      plain: The total number of floating-point operations executed on either the
-        VALU or MFMA units, per normalization unit.
-      rst: The total number of floating-point operations executed on either the :ref:`VALU
-        <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
-        <normalization-units>`.
-      unit: FLOP per normalization unit
-    IOPs (Total):
-      plain: The total number of integer operations executed on either the VALU or
-        MFMA units, per normalization unit.
-      rst: The total number of integer operations executed on either the :ref:`VALU
-        <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
-        <normalization-units>`.
-      unit: IOP per normalization unit
-    F16 OPs:
-      plain: The total number of 16-bit floating-point operations executed on either
-        the VALU or MFMA units, per normalization unit.
-      rst: The total number of 16-bit floating-point operations executed on either
-        the :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
-        unit <normalization-units>`.
-      unit: FLOP per normalization unit
-    BF16 OPs:
-      plain: The total number of 16-bit brain floating-point operations executed on
-        either the VALU or MFMA units, per normalization unit.
-      rst: |-
-        The total number of 16-bit brain floating-point operations executed on
-        either the :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
-        unit <normalization-units>`. Note: on current CDNA accelerators, the VALU
-        has no native BF16 instructions.
-      unit: FLOP per normalization unit
-    F32 OPs:
-      plain: The total number of 32-bit floating-point operations executed on either
-        the VALU or MFMA units, per normalization unit.
-      rst: The total number of 32-bit floating-point operations executed on either the
-        :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
-        unit <normalization-units>`.
-      unit: FLOP per normalization unit
-    F64 OPs:
-      plain: The total number of 64-bit floating-point operations executed on either
-        the VALU or MFMA units, per normalization unit.
-      rst: The total number of 64-bit floating-point operations executed on either the
-        :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
-        unit <normalization-units>`.
-      unit: FLOP per normalization unit
-    INT8 OPs:
-      plain: The total number of 8-bit integer operations executed on either the VALU
-        or MFMA units, per normalization unit.
-      rst: |-
-        The total number of 8-bit integer operations executed on either the :ref:`VALU
-        <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
-        <normalization-units>`. Note: on current CDNA accelerators, the VALU has
-        no native INT8 instructions.
-      unit: IOP per normalization unit
-- id: 1200
-  title: Local Data Share (LDS)
-  data source:
-  - metric_table:
-      id: 1201
-      title: LDS Speed-of-Light
-      header:
-        metric: Metric
-        value: Avg
-        unit: Unit
-      metric:
-        gfx90a:
-          Utilization:
-            value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: Pct of Peak
-          Access Rate:
-            value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: Pct of Peak
-          Theoretical Bandwidth Utilization:
-            value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
-              0.00128)))
-            unit: Pct of Peak
-          Bank Conflict Rate:
-            value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
-            unit: Pct of Peak
-        gfx941:
-          Utilization:
-            value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: Pct of Peak
-          Access Rate:
-            value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: Pct of Peak
-          Theoretical Bandwidth Utilization:
-            value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
-              0.00128)))
-            unit: Pct of Peak
-          Bank Conflict Rate:
-            value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
-            unit: Pct of Peak
-        gfx940:
-          Utilization:
-            value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: Pct of Peak
-          Access Rate:
-            value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: Pct of Peak
-          Theoretical Bandwidth Utilization:
-            value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
-              0.00128)))
-            unit: Pct of Peak
-          Bank Conflict Rate:
-            value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
-            unit: Pct of Peak
-        gfx942:
-          Utilization:
-            value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: Pct of Peak
-          Access Rate:
-            value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: Pct of Peak
-          Theoretical Bandwidth Utilization:
-            value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
-              0.00128)))
-            unit: Pct of Peak
-          Bank Conflict Rate:
-            value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
-            unit: Pct of Peak
-        gfx950:
-          Utilization:
-            value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: Pct of Peak
-          Access Rate:
-            value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: Pct of Peak
-          Theoretical Bandwidth Utilization:
-            value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
-              0.00128)))
-            unit: Pct of Peak
-          Bank Conflict Rate:
-            value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
-            unit: Pct of Peak
-        gfx908:
-          Utilization:
-            value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: Pct of Peak
-          Access Rate:
-            value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: Pct of Peak
-          Theoretical Bandwidth Utilization:
-            value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) *
-              0.00128)))
-            unit: Pct of Peak
-          Bank Conflict Rate:
-            value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
-            unit: Pct of Peak
-      comparable: false
-      cli_style: simple_bar
-      tui_style: simple_bar
-  - metric_table:
-      id: 1202
-      title: LDS Statistics
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      metric:
-        gfx90a:
-          LDS Instructions:
-            avg: AVG((SQ_INSTS_LDS / $denom))
-            min: MIN((SQ_INSTS_LDS / $denom))
-            max: MAX((SQ_INSTS_LDS / $denom))
-            unit: (Instr + $normUnit)
-          Theoretical Bandwidth:
-            avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)))
-            min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)))
-            max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-          LDS Latency:
-            avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
-              else None))
-            min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
-              else None))
-            max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
-              else None))
-            unit: Cycles
-            coll_level: SQ_INST_LEVEL_LDS
-          Bank Conflicts/Access:
-            avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
-            min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
-            max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
-            unit: Conflicts/Access
-          Index Accesses:
-            avg: AVG((SQ_LDS_IDX_ACTIVE / $denom))
-            min: MIN((SQ_LDS_IDX_ACTIVE / $denom))
-            max: MAX((SQ_LDS_IDX_ACTIVE / $denom))
-            unit: (Cycles + $normUnit)
-          Atomic Return Cycles:
-            avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom))
-            min: MIN((SQ_LDS_ATOMIC_RETURN / $denom))
-            max: MAX((SQ_LDS_ATOMIC_RETURN / $denom))
-            unit: (Cycles + $normUnit)
-          Bank Conflict:
-            avg: AVG((SQ_LDS_BANK_CONFLICT / $denom))
-            min: MIN((SQ_LDS_BANK_CONFLICT / $denom))
-            max: MAX((SQ_LDS_BANK_CONFLICT / $denom))
-            unit: (Cycles + $normUnit)
-          Addr Conflict:
-            avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom))
-            min: MIN((SQ_LDS_ADDR_CONFLICT / $denom))
-            max: MAX((SQ_LDS_ADDR_CONFLICT / $denom))
-            unit: (Cycles + $normUnit)
-          Unaligned Stall:
-            avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom))
-            min: MIN((SQ_LDS_UNALIGNED_STALL / $denom))
-            max: MAX((SQ_LDS_UNALIGNED_STALL / $denom))
-            unit: (Cycles + $normUnit)
-          Mem Violations:
-            avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom))
-            min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom))
-            max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom))
-            unit: (Accesses + $normUnit)
-        gfx941:
-          LDS Instructions:
-            avg: AVG((SQ_INSTS_LDS / $denom))
-            min: MIN((SQ_INSTS_LDS / $denom))
-            max: MAX((SQ_INSTS_LDS / $denom))
-            unit: (Instr + $normUnit)
-          Theoretical Bandwidth:
-            avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)))
-            min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)))
-            max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-          LDS Latency:
-            avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
-              else None))
-            min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
-              else None))
-            max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
-              else None))
-            unit: Cycles
-            coll_level: SQ_INST_LEVEL_LDS
-          Bank Conflicts/Access:
-            avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
-            min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
-            max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
-            unit: Conflicts/Access
-          Index Accesses:
-            avg: AVG((SQ_LDS_IDX_ACTIVE / $denom))
-            min: MIN((SQ_LDS_IDX_ACTIVE / $denom))
-            max: MAX((SQ_LDS_IDX_ACTIVE / $denom))
-            unit: (Cycles + $normUnit)
-          Atomic Return Cycles:
-            avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom))
-            min: MIN((SQ_LDS_ATOMIC_RETURN / $denom))
-            max: MAX((SQ_LDS_ATOMIC_RETURN / $denom))
-            unit: (Cycles + $normUnit)
-          Bank Conflict:
-            avg: AVG((SQ_LDS_BANK_CONFLICT / $denom))
-            min: MIN((SQ_LDS_BANK_CONFLICT / $denom))
-            max: MAX((SQ_LDS_BANK_CONFLICT / $denom))
-            unit: (Cycles + $normUnit)
-          Addr Conflict:
-            avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom))
-            min: MIN((SQ_LDS_ADDR_CONFLICT / $denom))
-            max: MAX((SQ_LDS_ADDR_CONFLICT / $denom))
-            unit: (Cycles + $normUnit)
-          Unaligned Stall:
-            avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom))
-            min: MIN((SQ_LDS_UNALIGNED_STALL / $denom))
-            max: MAX((SQ_LDS_UNALIGNED_STALL / $denom))
-            unit: (Cycles + $normUnit)
-          Mem Violations:
-            avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom))
-            min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom))
-            max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom))
-            unit: (Accesses + $normUnit)
-        gfx940:
-          LDS Instructions:
-            avg: AVG((SQ_INSTS_LDS / $denom))
-            min: MIN((SQ_INSTS_LDS / $denom))
-            max: MAX((SQ_INSTS_LDS / $denom))
-            unit: (Instr + $normUnit)
-          Theoretical Bandwidth:
-            avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)))
-            min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)))
-            max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-          LDS Latency:
-            avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
-              else None))
-            min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
-              else None))
-            max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
-              else None))
-            unit: Cycles
-            coll_level: SQ_INST_LEVEL_LDS
-          Bank Conflicts/Access:
-            avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
-            min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
-            max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
-            unit: Conflicts/Access
-          Index Accesses:
-            avg: AVG((SQ_LDS_IDX_ACTIVE / $denom))
-            min: MIN((SQ_LDS_IDX_ACTIVE / $denom))
-            max: MAX((SQ_LDS_IDX_ACTIVE / $denom))
-            unit: (Cycles + $normUnit)
-          Atomic Return Cycles:
-            avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom))
-            min: MIN((SQ_LDS_ATOMIC_RETURN / $denom))
-            max: MAX((SQ_LDS_ATOMIC_RETURN / $denom))
-            unit: (Cycles + $normUnit)
-          Bank Conflict:
-            avg: AVG((SQ_LDS_BANK_CONFLICT / $denom))
-            min: MIN((SQ_LDS_BANK_CONFLICT / $denom))
-            max: MAX((SQ_LDS_BANK_CONFLICT / $denom))
-            unit: (Cycles + $normUnit)
-          Addr Conflict:
-            avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom))
-            min: MIN((SQ_LDS_ADDR_CONFLICT / $denom))
-            max: MAX((SQ_LDS_ADDR_CONFLICT / $denom))
-            unit: (Cycles + $normUnit)
-          Unaligned Stall:
-            avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom))
-            min: MIN((SQ_LDS_UNALIGNED_STALL / $denom))
-            max: MAX((SQ_LDS_UNALIGNED_STALL / $denom))
-            unit: (Cycles + $normUnit)
-          Mem Violations:
-            avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom))
-            min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom))
-            max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom))
-            unit: (Accesses + $normUnit)
-        gfx942:
-          LDS Instructions:
-            avg: AVG((SQ_INSTS_LDS / $denom))
-            min: MIN((SQ_INSTS_LDS / $denom))
-            max: MAX((SQ_INSTS_LDS / $denom))
-            unit: (Instr + $normUnit)
-          Theoretical Bandwidth:
-            avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)))
-            min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)))
-            max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-          LDS Latency:
-            avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
-              else None))
-            min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
-              else None))
-            max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
-              else None))
-            unit: Cycles
-            coll_level: SQ_INST_LEVEL_LDS
-          Bank Conflicts/Access:
-            avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
-            min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
-            max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
-            unit: Conflicts/Access
-          Index Accesses:
-            avg: AVG((SQ_LDS_IDX_ACTIVE / $denom))
-            min: MIN((SQ_LDS_IDX_ACTIVE / $denom))
-            max: MAX((SQ_LDS_IDX_ACTIVE / $denom))
-            unit: (Cycles + $normUnit)
-          Atomic Return Cycles:
-            avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom))
-            min: MIN((SQ_LDS_ATOMIC_RETURN / $denom))
-            max: MAX((SQ_LDS_ATOMIC_RETURN / $denom))
-            unit: (Cycles + $normUnit)
-          Bank Conflict:
-            avg: AVG((SQ_LDS_BANK_CONFLICT / $denom))
-            min: MIN((SQ_LDS_BANK_CONFLICT / $denom))
-            max: MAX((SQ_LDS_BANK_CONFLICT / $denom))
-            unit: (Cycles + $normUnit)
-          Addr Conflict:
-            avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom))
-            min: MIN((SQ_LDS_ADDR_CONFLICT / $denom))
-            max: MAX((SQ_LDS_ADDR_CONFLICT / $denom))
-            unit: (Cycles + $normUnit)
-          Unaligned Stall:
-            avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom))
-            min: MIN((SQ_LDS_UNALIGNED_STALL / $denom))
-            max: MAX((SQ_LDS_UNALIGNED_STALL / $denom))
-            unit: (Cycles + $normUnit)
-          Mem Violations:
-            avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom))
-            min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom))
-            max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom))
-            unit: (Accesses + $normUnit)
-        gfx950:
-          LDS Instructions:
-            avg: AVG((SQ_INSTS_LDS / $denom))
-            min: MIN((SQ_INSTS_LDS / $denom))
-            max: MAX((SQ_INSTS_LDS / $denom))
-            unit: (Instr + $normUnit)
-          LDS LOAD:
-            avg: AVG((SQ_INSTS_LDS_LOAD / $denom))
-            min: MIN((SQ_INSTS_LDS_LOAD / $denom))
-            max: MAX((SQ_INSTS_LDS_LOAD / $denom))
-            unit: (instr + $normUnit)
-          LDS STORE:
-            avg: AVG((SQ_INSTS_LDS_STORE / $denom))
-            min: MIN((SQ_INSTS_LDS_STORE / $denom))
-            max: MAX((SQ_INSTS_LDS_STORE / $denom))
-            unit: (instr + $normUnit)
-          LDS ATOMIC:
-            avg: AVG((SQ_INSTS_LDS_ATOMIC / $denom))
-            min: MIN((SQ_INSTS_LDS_ATOMIC / $denom))
-            max: MAX((SQ_INSTS_LDS_ATOMIC / $denom))
-            unit: (instr + $normUnit)
-          LDS LOAD Bandwidth:
-            avg: AVG(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp))
-            min: MIN(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp))
-            max: MAX(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp))
-            units: Gbps
-          LDS STORE Bandwidth:
-            avg: AVG(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp))
-            min: MIN(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp))
-            max: MAX(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp))
-            units: Gbps
-          LDS ATOMIC Bandwidth:
-            avg: AVG(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp))
-            min: MIN(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp))
-            max: MAX(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp))
-            units: Gbps
-          Theoretical Bandwidth:
-            avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)))
-            min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)))
-            max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-          LDS Latency:
-            avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
-              else None))
-            min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
-              else None))
-            max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
-              else None))
-            unit: Cycles
-            coll_level: SQ_INST_LEVEL_LDS
-          Bank Conflicts/Access:
-            avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
-            min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
-            max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
-            unit: Conflicts/Access
-          Index Accesses:
-            avg: AVG((SQ_LDS_IDX_ACTIVE / $denom))
-            min: MIN((SQ_LDS_IDX_ACTIVE / $denom))
-            max: MAX((SQ_LDS_IDX_ACTIVE / $denom))
-            unit: (Cycles + $normUnit)
-          Atomic Return Cycles:
-            avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom))
-            min: MIN((SQ_LDS_ATOMIC_RETURN / $denom))
-            max: MAX((SQ_LDS_ATOMIC_RETURN / $denom))
-            unit: (Cycles + $normUnit)
-          Bank Conflict:
-            avg: AVG((SQ_LDS_BANK_CONFLICT / $denom))
-            min: MIN((SQ_LDS_BANK_CONFLICT / $denom))
-            max: MAX((SQ_LDS_BANK_CONFLICT / $denom))
-            unit: (Cycles + $normUnit)
-          Addr Conflict:
-            avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom))
-            min: MIN((SQ_LDS_ADDR_CONFLICT / $denom))
-            max: MAX((SQ_LDS_ADDR_CONFLICT / $denom))
-            unit: (Cycles + $normUnit)
-          Unaligned Stall:
-            avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom))
-            min: MIN((SQ_LDS_UNALIGNED_STALL / $denom))
-            max: MAX((SQ_LDS_UNALIGNED_STALL / $denom))
-            unit: (Cycles + $normUnit)
-          Mem Violations:
-            avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom))
-            min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom))
-            max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom))
-            unit: (Accesses + $normUnit)
-          LDS Command FIFO Full Rate:
-            avg: AVG((SQ_LDS_CMD_FIFO_FULL / $denom))
-            min: MIN((SQ_LDS_CMD_FIFO_FULL / $denom))
-            max: MAX((SQ_LDS_CMD_FIFO_FULL / $denom))
-            unit: (Cycles + $normUnit)
-          LDS Data FIFO Full Rate:
-            avg: AVG((SQ_LDS_DATA_FIFO_FULL / $denom))
-            min: MIN((SQ_LDS_DATA_FIFO_FULL / $denom))
-            max: MAX((SQ_LDS_DATA_FIFO_FULL / $denom))
-            unit: (Cycles + $normUnit)
-        gfx908:
-          LDS Instructions:
-            avg: AVG((SQ_INSTS_LDS / $denom))
-            min: MIN((SQ_INSTS_LDS / $denom))
-            max: MAX((SQ_INSTS_LDS / $denom))
-            unit: (Instr + $normUnit)
-          Theoretical Bandwidth:
-            avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)))
-            min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)))
-            max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
-              / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-          LDS Latency:
-            avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
-              else None))
-            min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
-              else None))
-            max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0)
-              else None))
-            unit: Cycles
-            coll_level: SQ_INST_LEVEL_LDS
-          Bank Conflicts/Access:
-            avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
-            min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
-            max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT))
-              if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))
-            unit: Conflicts/Access
-          Index Accesses:
-            avg: AVG((SQ_LDS_IDX_ACTIVE / $denom))
-            min: MIN((SQ_LDS_IDX_ACTIVE / $denom))
-            max: MAX((SQ_LDS_IDX_ACTIVE / $denom))
-            unit: (Cycles + $normUnit)
-          Atomic Return Cycles:
-            avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom))
-            min: MIN((SQ_LDS_ATOMIC_RETURN / $denom))
-            max: MAX((SQ_LDS_ATOMIC_RETURN / $denom))
-            unit: (Cycles + $normUnit)
-          Bank Conflict:
-            avg: AVG((SQ_LDS_BANK_CONFLICT / $denom))
-            min: MIN((SQ_LDS_BANK_CONFLICT / $denom))
-            max: MAX((SQ_LDS_BANK_CONFLICT / $denom))
-            unit: (Cycles + $normUnit)
-          Addr Conflict:
-            avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom))
-            min: MIN((SQ_LDS_ADDR_CONFLICT / $denom))
-            max: MAX((SQ_LDS_ADDR_CONFLICT / $denom))
-            unit: (Cycles + $normUnit)
-          Unaligned Stall:
-            avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom))
-            min: MIN((SQ_LDS_UNALIGNED_STALL / $denom))
-            max: MAX((SQ_LDS_UNALIGNED_STALL / $denom))
-            unit: (Cycles + $normUnit)
-          Mem Violations:
-            avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom))
-            min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom))
-            max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom))
-            unit: (Accesses + $normUnit)
-  metrics_description:
-    Utilization:
-      plain: Indicates what percent of the kernel's duration the LDS was actively
-        executing instructions (including, but not limited to, load, store, atomic
-        and HIP's __shfl operations). Calculated as the ratio of the total number
-        of cycles LDS was active over the total CU cycles.
-      rst: Indicates what percent of the kernel's duration the :ref:`LDS <desc-lds>` was
-        actively executing instructions (including, but not limited to, load, store,
-        atomic and HIP's ``__shfl`` operations). Calculated as the ratio of the
-        total number of cycles LDS was active over the :ref:`total CU cycles <total-cu-cycles>`.
-      unit: Percent
-    Access Rate:
-      plain: Indicates the percentage of SIMDs in the VALU actively issuing LDS instructions,
-        averaged over the lifetime of the kernel. Calculated as the ratio of the total
-        number of cycles spent by the scheduler issuing LDS instructions over the
-        total CU cycles.
-      rst: Indicates the percentage of SIMDs in the :ref:`VALU <desc-valu>` [#lds-workload]_
-        actively issuing LDS instructions, averaged over the lifetime of the kernel.
-        Calculated as the ratio of the total number of cycles spent by the :ref:`scheduler
-        <desc-scheduler>` issuing :ref:`LDS <desc-lds>` instructions over the :ref:`total
-        CU cycles <total-cu-cycles>`.
-      unit: Percent
-    Theoretical Bandwidth Utilization:
-      plain: Indicates the maximum amount of bytes that could have been loaded from,
-        stored to, or atomically updated in the LDS divided as percentage of theoretical peak.
-        Does not take into account the execution mask of the wavefront when the instruction
-        was executed.
-      rst: Indicates the maximum amount of bytes that could have been loaded from, stored
-        to, or atomically updated in the LDS divided as percentage of theoretical peak.
-        Does *not* take into account the execution mask of the wavefront when the
-        instruction was executed. See the :ref:`LDS bandwidth example <lds-bandwidth>`
-        for more detail.
-      unit: Percent
-    Theoretical Bandwidth:
-      plain: Indicates the maximum amount of bytes that could have been loaded from,
-        stored to, or atomically updated in the LDS divided by total duration. Does not
-        take into account the execution mask of the wavefront when the instruction
-        was executed.
-      rst: Indicates the maximum amount of bytes that could have been loaded from, stored
-        to, or atomically updated in the LDS divided by total duration.
-        Does *not* take into account the execution mask of the wavefront when the
-        instruction was executed. See the :ref:`LDS bandwidth example <lds-bandwidth>`
-        for more detail.
-      unit: Gbps
-    Bank Conflict Rate:
-      plain: Indicates the percentage of active LDS cycles that were spent servicing
-        bank conflicts. Calculated as the ratio of LDS cycles spent servicing bank
-        conflicts over the number of LDS cycles that would have been required to move
-        the same amount of data in an uncontended access.
-      rst: Indicates the percentage of active LDS cycles that were spent servicing bank
-        conflicts. Calculated as the ratio of LDS cycles spent servicing bank conflicts
-        over the number of LDS cycles that would have been required to move the same
-        amount of data in an uncontended access. [#lds-bank-conflict]_
-      unit: Percent
-    LDS Instructions:
-      plain: The total number of LDS instructions (including, but not limited to,
-        read/write/atomics and HIP's __shfl instructions) executed per normalization
-        unit.
-      rst: The total number of LDS instructions (including, but not limited to, read/write/atomics
-        and HIP's ``__shfl`` instructions) executed per :ref:`normalization unit
-        <normalization-units>`.
-      unit: Instructions per normalization unit
-    LDS Latency:
-      plain: The average number of round-trip cycles (i.e., from issue to data-return
-        acknowledgment) required for an LDS instruction to complete.
-      rst: The average number of round-trip cycles (i.e., from issue to data-return
-        acknowledgment) required for an LDS instruction to complete.
-      unit: Cycles
-    Bank Conflicts/Access:
-      plain: The ratio of the number of cycles spent in the LDS scheduler due to bank
-        conflicts (as determined by the conflict resolution hardware) to the base
-        number of cycles that would be spent in the LDS scheduler in a completely
-        uncontended case. This is the unnormalized form of the Bank Conflict Rate.
-      rst: The ratio of the number of cycles spent in the :ref:`LDS scheduler <desc-lds>`
-        due to bank conflicts (as determined by the conflict resolution hardware)
-        to the base number of cycles that would be spent in the LDS scheduler in
-        a completely uncontended case. This is the unnormalized form of the Bank
-        Conflict Rate.
-      unit: Conflicts per Access
-    Index Accesses:
-      plain: The total number of cycles spent in the LDS scheduler over all operations
-        per normalization unit.
-      rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` over
-        all operations per :ref:`normalization unit <normalization-units>`.
-      unit: Cycles per normalization unit
-    Atomic Return Cycles:
-      plain: The total number of cycles spent on LDS atomics with return per normalization
-        unit.
-      rst: The total number of cycles spent on LDS atomics with return per :ref:`normalization
-        unit <normalization-units>`.
-      unit: Cycles per normalization unit
-    Bank Conflict:
-      plain: The total number of cycles spent in the LDS scheduler due to bank conflicts
-        (as determined by the conflict resolution hardware) per normalization unit.
-      rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due
-        to bank conflicts (as determined by the conflict resolution hardware) per
-        :ref:`normalization unit <normalization-units>`.
-      unit: Cycles per normalization unit
-    Addr Conflict:
-      plain: The total number of cycles spent in the LDS scheduler due to address
-        conflicts (as determined by the conflict resolution hardware) per normalization
-        unit.
-      rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due
-        to address conflicts (as determined by the conflict resolution hardware)
-        per :ref:`normalization unit <normalization-units>`.
-      unit: Cycles per normalization unit
-    Unaligned Stall:
-      plain: The total number of cycles spent in the LDS scheduler due to stalls from
-        non-dword aligned addresses per normalization unit.
-      rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due
-        to stalls from non-dword aligned addresses per :ref:`normalization unit <normalization-units>`.
-      unit: Cycles per normalization unit
-    Mem Violations:
-      plain: |-
-        The total number of out-of-bounds accesses made to the LDS, per normalization
-        unit. This is unused and expected to be zero in most configurations for
-        modern CDNA\u2122 accelerators.
-      rst: |-
-        The total number of out-of-bounds accesses made to the LDS, per :ref:`normalization
-        unit <normalization-units>`. This is unused and expected to be zero in
-        most configurations for modern CDNA\u2122 accelerators.
-      unit: Accesses per normalization unit
-- id: 1300
-  title: Instruction Cache
-  data source:
-  - metric_table:
-      id: 1301
-      title: L1I Speed-of-Light
-      header:
-        metric: Metric
-        value: Avg
-        unit: Unit
-      metric:
-        gfx90a:
-          Bandwidth Utilization:
-            value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
-              (End_Timestamp - Start_Timestamp))))
-            unit: Pct of Peak
-          Cache Hit Rate:
-            value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
-              + SQC_ICACHE_MISSES_DUPLICATE)))
-            unit: Pct of Peak
-          L1I-L2 Bandwidth Utilization:
-            value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu)
-              * (End_Timestamp - Start_Timestamp))))
-            unit: Pct of Peak
-        gfx941:
-          Bandwidth Utilization:
-            value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
-              (End_Timestamp - Start_Timestamp))))
-            unit: Pct of Peak
-          Cache Hit Rate:
-            value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
-              + SQC_ICACHE_MISSES_DUPLICATE)))
-            unit: Pct of Peak
-          L1I-L2 Bandwidth Utilization:
-            value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu)
-              * (End_Timestamp - Start_Timestamp))))
-            unit: Pct of Peak
-        gfx940:
-          Bandwidth Utilization:
-            value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
-              (End_Timestamp - Start_Timestamp))))
-            unit: Pct of Peak
-          Cache Hit Rate:
-            value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
-              + SQC_ICACHE_MISSES_DUPLICATE)))
-            unit: Pct of Peak
-          L1I-L2 Bandwidth Utilization:
-            value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu)
-              * (End_Timestamp - Start_Timestamp))))
-            unit: Pct of Peak
-        gfx942:
-          Bandwidth Utilization:
-            value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
-              (End_Timestamp - Start_Timestamp))))
-            unit: Pct of Peak
-          Cache Hit Rate:
-            value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
-              + SQC_ICACHE_MISSES_DUPLICATE)))
-            unit: Pct of Peak
-          L1I-L2 Bandwidth Utilization:
-            value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu)
-              * (End_Timestamp - Start_Timestamp))))
-            unit: Pct of Peak
-        gfx950:
-          Bandwidth Utilization:
-            value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
-              (End_Timestamp - Start_Timestamp))))
-            unit: Pct of Peak
-          Cache Hit Rate:
-            value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
-              + SQC_ICACHE_MISSES_DUPLICATE)))
-            unit: Pct of Peak
-          L1I-L2 Bandwidth Utilization:
-            value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu)
-              * (End_Timestamp - Start_Timestamp))))
-            unit: Pct of Peak
-        gfx908:
-          Bandwidth Utilization:
-            value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
-              (End_Timestamp - Start_Timestamp))))
-            unit: Pct of Peak
-          Cache Hit Rate:
-            value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
-              + SQC_ICACHE_MISSES_DUPLICATE)))
-            unit: Pct of Peak
-          L1I-L2 Bandwidth Utilization:
-            value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu)
-              * (End_Timestamp - Start_Timestamp))))
-            unit: Pct of Peak
-      comparable: false
-      cli_style: simple_bar
-      tui_style: simple_bar
-  - metric_table:
-      id: 1302
-      title: L1I cache accesses
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      metric:
-        gfx90a:
-          Req:
-            avg: AVG((SQC_ICACHE_REQ / $denom))
-            min: MIN((SQC_ICACHE_REQ / $denom))
-            max: MAX((SQC_ICACHE_REQ / $denom))
-            unit: (Req + $normUnit)
-          Hits:
-            avg: AVG((SQC_ICACHE_HITS / $denom))
-            min: MIN((SQC_ICACHE_HITS / $denom))
-            max: MAX((SQC_ICACHE_HITS / $denom))
-            unit: (Hits + $normUnit)
-          Misses - Non Duplicated:
-            avg: AVG((SQC_ICACHE_MISSES / $denom))
-            min: MIN((SQC_ICACHE_MISSES / $denom))
-            max: MAX((SQC_ICACHE_MISSES / $denom))
-            unit: (Misses + $normUnit)
-          Misses - Duplicated:
-            avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom))
-            min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom))
-            max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom))
-            unit: (Misses + $normUnit)
-          Cache Hit Rate:
-            avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
-              + SQC_ICACHE_MISSES_DUPLICATE)))
-            min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
-              + SQC_ICACHE_MISSES_DUPLICATE)))
-            max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
-              + SQC_ICACHE_MISSES_DUPLICATE)))
-            unit: pct
-          Instruction Fetch Latency:
-            avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
-            min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
-            max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
-            unit: Cycles
-            coll_level: SQ_IFETCH_LEVEL
-        gfx941:
-          Req:
-            avg: AVG((SQC_ICACHE_REQ / $denom))
-            min: MIN((SQC_ICACHE_REQ / $denom))
-            max: MAX((SQC_ICACHE_REQ / $denom))
-            unit: (Req + $normUnit)
-          Hits:
-            avg: AVG((SQC_ICACHE_HITS / $denom))
-            min: MIN((SQC_ICACHE_HITS / $denom))
-            max: MAX((SQC_ICACHE_HITS / $denom))
-            unit: (Hits + $normUnit)
-          Misses - Non Duplicated:
-            avg: AVG((SQC_ICACHE_MISSES / $denom))
-            min: MIN((SQC_ICACHE_MISSES / $denom))
-            max: MAX((SQC_ICACHE_MISSES / $denom))
-            unit: (Misses + $normUnit)
-          Misses - Duplicated:
-            avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom))
-            min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom))
-            max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom))
-            unit: (Misses + $normUnit)
-          Cache Hit Rate:
-            avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
-              + SQC_ICACHE_MISSES_DUPLICATE)))
-            min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
-              + SQC_ICACHE_MISSES_DUPLICATE)))
-            max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
-              + SQC_ICACHE_MISSES_DUPLICATE)))
-            unit: pct
-          Instruction Fetch Latency:
-            avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
-            min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
-            max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
-            unit: Cycles
-            coll_level: SQ_IFETCH_LEVEL
-        gfx940:
-          Req:
-            avg: AVG((SQC_ICACHE_REQ / $denom))
-            min: MIN((SQC_ICACHE_REQ / $denom))
-            max: MAX((SQC_ICACHE_REQ / $denom))
-            unit: (Req + $normUnit)
-          Hits:
-            avg: AVG((SQC_ICACHE_HITS / $denom))
-            min: MIN((SQC_ICACHE_HITS / $denom))
-            max: MAX((SQC_ICACHE_HITS / $denom))
-            unit: (Hits + $normUnit)
-          Misses - Non Duplicated:
-            avg: AVG((SQC_ICACHE_MISSES / $denom))
-            min: MIN((SQC_ICACHE_MISSES / $denom))
-            max: MAX((SQC_ICACHE_MISSES / $denom))
-            unit: (Misses + $normUnit)
-          Misses - Duplicated:
-            avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom))
-            min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom))
-            max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom))
-            unit: (Misses + $normUnit)
-          Cache Hit Rate:
-            avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
-              + SQC_ICACHE_MISSES_DUPLICATE)))
-            min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
-              + SQC_ICACHE_MISSES_DUPLICATE)))
-            max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
-              + SQC_ICACHE_MISSES_DUPLICATE)))
-            unit: pct
-          Instruction Fetch Latency:
-            avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
-            min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
-            max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
-            unit: Cycles
-            coll_level: SQ_IFETCH_LEVEL
-        gfx942:
-          Req:
-            avg: AVG((SQC_ICACHE_REQ / $denom))
-            min: MIN((SQC_ICACHE_REQ / $denom))
-            max: MAX((SQC_ICACHE_REQ / $denom))
-            unit: (Req + $normUnit)
-          Hits:
-            avg: AVG((SQC_ICACHE_HITS / $denom))
-            min: MIN((SQC_ICACHE_HITS / $denom))
-            max: MAX((SQC_ICACHE_HITS / $denom))
-            unit: (Hits + $normUnit)
-          Misses - Non Duplicated:
-            avg: AVG((SQC_ICACHE_MISSES / $denom))
-            min: MIN((SQC_ICACHE_MISSES / $denom))
-            max: MAX((SQC_ICACHE_MISSES / $denom))
-            unit: (Misses + $normUnit)
-          Misses - Duplicated:
-            avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom))
-            min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom))
-            max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom))
-            unit: (Misses + $normUnit)
-          Cache Hit Rate:
-            avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
-              + SQC_ICACHE_MISSES_DUPLICATE)))
-            min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
-              + SQC_ICACHE_MISSES_DUPLICATE)))
-            max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
-              + SQC_ICACHE_MISSES_DUPLICATE)))
-            unit: pct
-          Instruction Fetch Latency:
-            avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
-            min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
-            max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
-            unit: Cycles
-            coll_level: SQ_IFETCH_LEVEL
-        gfx950:
-          Req:
-            avg: AVG((SQC_ICACHE_REQ / $denom))
-            min: MIN((SQC_ICACHE_REQ / $denom))
-            max: MAX((SQC_ICACHE_REQ / $denom))
-            unit: (Req + $normUnit)
-          Hits:
-            avg: AVG((SQC_ICACHE_HITS / $denom))
-            min: MIN((SQC_ICACHE_HITS / $denom))
-            max: MAX((SQC_ICACHE_HITS / $denom))
-            unit: (Hits + $normUnit)
-          Misses - Non Duplicated:
-            avg: AVG((SQC_ICACHE_MISSES / $denom))
-            min: MIN((SQC_ICACHE_MISSES / $denom))
-            max: MAX((SQC_ICACHE_MISSES / $denom))
-            unit: (Misses + $normUnit)
-          Misses - Duplicated:
-            avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom))
-            min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom))
-            max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom))
-            unit: (Misses + $normUnit)
-          Cache Hit Rate:
-            avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
-              + SQC_ICACHE_MISSES_DUPLICATE)))
-            min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
-              + SQC_ICACHE_MISSES_DUPLICATE)))
-            max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
-              + SQC_ICACHE_MISSES_DUPLICATE)))
-            unit: pct
-          Instruction Fetch Latency:
-            avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
-            min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
-            max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
-            unit: Cycles
-            coll_level: SQ_IFETCH_LEVEL
-        gfx908:
-          Req:
-            avg: AVG((SQC_ICACHE_REQ / $denom))
-            min: MIN((SQC_ICACHE_REQ / $denom))
-            max: MAX((SQC_ICACHE_REQ / $denom))
-            unit: (Req + $normUnit)
-          Hits:
-            avg: AVG((SQC_ICACHE_HITS / $denom))
-            min: MIN((SQC_ICACHE_HITS / $denom))
-            max: MAX((SQC_ICACHE_HITS / $denom))
-            unit: (Hits + $normUnit)
-          Misses - Non Duplicated:
-            avg: AVG((SQC_ICACHE_MISSES / $denom))
-            min: MIN((SQC_ICACHE_MISSES / $denom))
-            max: MAX((SQC_ICACHE_MISSES / $denom))
-            unit: (Misses + $normUnit)
-          Misses - Duplicated:
-            avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom))
-            min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom))
-            max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom))
-            unit: (Misses + $normUnit)
-          Cache Hit Rate:
-            avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
-              + SQC_ICACHE_MISSES_DUPLICATE)))
-            min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
-              + SQC_ICACHE_MISSES_DUPLICATE)))
-            max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES)
-              + SQC_ICACHE_MISSES_DUPLICATE)))
-            unit: pct
-          Instruction Fetch Latency:
-            avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
-            min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
-            max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH))
-            unit: Cycles
-            coll_level: SQ_IFETCH_LEVEL
-  - metric_table:
-      id: 1303
-      title: L1I <-> L2 interface
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      metric:
-        gfx90a:
-          L1I-L2 Bandwidth:
-            avg: AVG(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
-            min: MIN(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
-            max: MAX(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-        gfx941:
-          L1I-L2 Bandwidth:
-            avg: AVG(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
-            min: MIN(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
-            max: MAX(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-        gfx940:
-          L1I-L2 Bandwidth:
-            avg: AVG(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
-            min: MIN(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
-            max: MAX(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-        gfx942:
-          L1I-L2 Bandwidth:
-            avg: AVG(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
-            min: MIN(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
-            max: MAX(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-        gfx950:
-          L1I-L2 Bandwidth:
-            avg: AVG(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
-            min: MIN(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
-            max: MAX(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-        gfx908:
-          L1I-L2 Bandwidth:
-            avg: AVG(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
-            min: MIN(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
-            max: MAX(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-  metrics_description:
-    Bandwidth Utilization:
-      plain: The number of bytes looked up in the L1I cache, as a percent of the peak
-        theoretical bandwidth. Calculated as the ratio of L1I requests over the total
-        L1I cycles.
-      rst: The number of bytes looked up in the L1I cache, as a percent of the peak theoretical
-        bandwidth. Calculated as the ratio of L1I requests over the :ref:`total L1I
-        cycles <total-l1i-cycles>`.
-      unit: Percent
-    Cache Hit Rate:
-      plain: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded
-        line the cache. Calculated as the ratio of the number of L1I requests that
-        hit over the number of all L1I requests.
-      rst: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded line
-        the cache. Calculated as the ratio of the number of L1I requests that hit
-        over the number of all L1I requests.
-      unit: Percent
-    L1I-L2 Bandwidth Utilization:
-      plain: |-
-        The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth
-        achieved. Calculated as the ratio of the total number of requests from the
-        L1I to the L2 cache over the total L1I-L2 interface cycles.
-      rst: |-
-        The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth
-        achieved. Calculated as the ratio of the total number of requests from
-        the L1I to the L2 cache over the :ref:`total L1I-L2 interface cycles <total-l1i-cycles>`.
-      unit: Percent
-    L1I-L2 Bandwidth:
-      plain: Total number of bytes transferred across L1I - L2 interface divided by total duration.
-      rst: Total number of bytes transferred across L1I - L2 interface divided by total duration.
-      unit: Gbps
-    Req:
-      plain: The total number of requests made to the L1I per normalization-unit
-      rst: The total number of requests made to the L1I per normalization-unit
-      unit: Requests per normalization unit
-    Hits:
-      plain: The total number of L1I requests that hit on a previously loaded cache
-        line, per normalization-unit.
-      rst: The total number of L1I requests that hit on a previously loaded cache line,
-        per :ref:`normalization-unit <normalization-units>`.
-      unit: Requests per normalization unit
-    Misses - Non Duplicated:
-      plain: The total number of L1I requests that missed on a cache line that were
-        not already pending due to another request, per normalization-unit.
-      rst: The total number of L1I requests that missed on a cache line that *were
-        not* already pending due to another request, per :ref:`normalization-unit
-        <normalization-units>`. See note in :ref:`desc-l1i-sol` for more detail.
-      unit: Requests per normalization unit
-    Misses - Duplicated:
-      plain: The total number of L1I requests that missed on a cache line that were
-        already pending due to another request, per normalization-unit.
-      rst: The total number of L1I requests that missed on a cache line that *were* already
-        pending due to another request, per :ref:`normalization-unit <normalization-units>`.
-        See note in :ref:`desc-l1i-sol` for more detail.
-      unit: Requests per normalization unit
-    Instruction Fetch Latency:
-      plain: The average number of cycles spent to fetch instructions to a CU.
-      rst: The average number of cycles spent to fetch instructions to a :doc:`CU
-        <compute-unit>`.
-      unit: Cycles
-- id: 1400
-  title: Scalar L1 Data Cache
-  data source:
-  - metric_table:
-      id: 1401
-      title: Scalar L1D Speed-of-Light
-      header:
-        metric: Metric
-        value: Avg
-        unit: Unit
-      metric:
-        gfx90a:
-          Bandwidth Utilization:
-            value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
-              (End_Timestamp - Start_Timestamp))))
-            unit: Pct of Peak
-          Cache Hit Rate:
-            value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES
-              + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES
-              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
-            unit: Pct of Peak
-          sL1D-L2 BW Utilization:
-            value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
-              * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp)))
-            unit: Pct of Peak
-        gfx941:
-          Bandwidth Utilization:
-            value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
-              (End_Timestamp - Start_Timestamp))))
-            unit: Pct of Peak
-          Cache Hit Rate:
-            value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES
-              + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES
-              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
-            unit: Pct of Peak
-          sL1D-L2 BW Utilization:
-            value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
-              * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp)))
-            unit: Pct of Peak
-        gfx940:
-          Bandwidth Utilization:
-            value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
-              (End_Timestamp - Start_Timestamp))))
-            unit: Pct of Peak
-          Cache Hit Rate:
-            value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES
-              + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES
-              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
-            unit: Pct of Peak
-          sL1D-L2 BW Utilization:
-            value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
-              * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp)))
-            unit: Pct of Peak
-        gfx942:
-          Bandwidth Utilization:
-            value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
-              (End_Timestamp - Start_Timestamp))))
-            unit: Pct of Peak
-          Cache Hit Rate:
-            value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES
-              + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES
-              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
-            unit: Pct of Peak
-          sL1D-L2 BW Utilization:
-            value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
-              * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp)))
-            unit: Pct of Peak
-        gfx950:
-          Bandwidth Utilization:
-            value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
-              (End_Timestamp - Start_Timestamp))))
-            unit: Pct of Peak
-          Cache Hit Rate:
-            value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES
-              + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES
-              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
-            unit: Pct of Peak
-          sL1D-L2 BW Utilization:
-            value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
-              * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp)))
-            unit: Pct of Peak
-        gfx908:
-          Bandwidth Utilization:
-            value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) *
-              (End_Timestamp - Start_Timestamp))))
-            unit: Pct of Peak
-          Cache Hit Rate:
-            value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES
-              + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES
-              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
-            unit: Pct of Peak
-          sL1D-L2 BW Utilization:
-            value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
-              * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp)))
-            unit: Pct of Peak
-      comparable: false
-      cli_style: simple_bar
-      tui_style: simple_bar
-  - metric_table:
-      id: 1402
-      title: Scalar L1D cache accesses
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      metric:
-        gfx90a:
-          Req:
-            avg: AVG((SQC_DCACHE_REQ / $denom))
-            min: MIN((SQC_DCACHE_REQ / $denom))
-            max: MAX((SQC_DCACHE_REQ / $denom))
-            unit: (Req + $normUnit)
-          Hits:
-            avg: AVG((SQC_DCACHE_HITS / $denom))
-            min: MIN((SQC_DCACHE_HITS / $denom))
-            max: MAX((SQC_DCACHE_HITS / $denom))
-            unit: (Req + $normUnit)
-          Misses - Non Duplicated:
-            avg: AVG((SQC_DCACHE_MISSES / $denom))
-            min: MIN((SQC_DCACHE_MISSES / $denom))
-            max: MAX((SQC_DCACHE_MISSES / $denom))
-            unit: (Req + $normUnit)
-          Misses- Duplicated:
-            avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom))
-            min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom))
-            max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom))
-            unit: (Req + $normUnit)
-          Cache Hit Rate:
-            avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
-            min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
-            max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
-            unit: pct
-          Read Req (Total):
-            avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
-              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
-            min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
-              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
-            max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
-              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
-            unit: (Req + $normUnit)
-          Atomic Req:
-            avg: AVG((SQC_DCACHE_ATOMIC / $denom))
-            min: MIN((SQC_DCACHE_ATOMIC / $denom))
-            max: MAX((SQC_DCACHE_ATOMIC / $denom))
-            unit: (Req + $normUnit)
-          Read Req (1 DWord):
-            avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom))
-            min: MIN((SQC_DCACHE_REQ_READ_1 / $denom))
-            max: MAX((SQC_DCACHE_REQ_READ_1 / $denom))
-            unit: (Req + $normUnit)
-          Read Req (2 DWord):
-            avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom))
-            min: MIN((SQC_DCACHE_REQ_READ_2 / $denom))
-            max: MAX((SQC_DCACHE_REQ_READ_2 / $denom))
-            unit: (Req + $normUnit)
-          Read Req (4 DWord):
-            avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom))
-            min: MIN((SQC_DCACHE_REQ_READ_4 / $denom))
-            max: MAX((SQC_DCACHE_REQ_READ_4 / $denom))
-            unit: (Req + $normUnit)
-          Read Req (8 DWord):
-            avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom))
-            min: MIN((SQC_DCACHE_REQ_READ_8 / $denom))
-            max: MAX((SQC_DCACHE_REQ_READ_8 / $denom))
-            unit: (Req + $normUnit)
-          Read Req (16 DWord):
-            avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom))
-            min: MIN((SQC_DCACHE_REQ_READ_16 / $denom))
-            max: MAX((SQC_DCACHE_REQ_READ_16 / $denom))
-            unit: (Req + $normUnit)
-        gfx941:
-          Req:
-            avg: AVG((SQC_DCACHE_REQ / $denom))
-            min: MIN((SQC_DCACHE_REQ / $denom))
-            max: MAX((SQC_DCACHE_REQ / $denom))
-            unit: (Req + $normUnit)
-          Hits:
-            avg: AVG((SQC_DCACHE_HITS / $denom))
-            min: MIN((SQC_DCACHE_HITS / $denom))
-            max: MAX((SQC_DCACHE_HITS / $denom))
-            unit: (Req + $normUnit)
-          Misses - Non Duplicated:
-            avg: AVG((SQC_DCACHE_MISSES / $denom))
-            min: MIN((SQC_DCACHE_MISSES / $denom))
-            max: MAX((SQC_DCACHE_MISSES / $denom))
-            unit: (Req + $normUnit)
-          Misses- Duplicated:
-            avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom))
-            min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom))
-            max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom))
-            unit: (Req + $normUnit)
-          Cache Hit Rate:
-            avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
-            min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
-            max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
-            unit: pct
-          Read Req (Total):
-            avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
-              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
-            min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
-              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
-            max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
-              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
-            unit: (Req + $normUnit)
-          Atomic Req:
-            avg: AVG((SQC_DCACHE_ATOMIC / $denom))
-            min: MIN((SQC_DCACHE_ATOMIC / $denom))
-            max: MAX((SQC_DCACHE_ATOMIC / $denom))
-            unit: (Req + $normUnit)
-          Read Req (1 DWord):
-            avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom))
-            min: MIN((SQC_DCACHE_REQ_READ_1 / $denom))
-            max: MAX((SQC_DCACHE_REQ_READ_1 / $denom))
-            unit: (Req + $normUnit)
-          Read Req (2 DWord):
-            avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom))
-            min: MIN((SQC_DCACHE_REQ_READ_2 / $denom))
-            max: MAX((SQC_DCACHE_REQ_READ_2 / $denom))
-            unit: (Req + $normUnit)
-          Read Req (4 DWord):
-            avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom))
-            min: MIN((SQC_DCACHE_REQ_READ_4 / $denom))
-            max: MAX((SQC_DCACHE_REQ_READ_4 / $denom))
-            unit: (Req + $normUnit)
-          Read Req (8 DWord):
-            avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom))
-            min: MIN((SQC_DCACHE_REQ_READ_8 / $denom))
-            max: MAX((SQC_DCACHE_REQ_READ_8 / $denom))
-            unit: (Req + $normUnit)
-          Read Req (16 DWord):
-            avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom))
-            min: MIN((SQC_DCACHE_REQ_READ_16 / $denom))
-            max: MAX((SQC_DCACHE_REQ_READ_16 / $denom))
-            unit: (Req + $normUnit)
-        gfx940:
-          Req:
-            avg: AVG((SQC_DCACHE_REQ / $denom))
-            min: MIN((SQC_DCACHE_REQ / $denom))
-            max: MAX((SQC_DCACHE_REQ / $denom))
-            unit: (Req + $normUnit)
-          Hits:
-            avg: AVG((SQC_DCACHE_HITS / $denom))
-            min: MIN((SQC_DCACHE_HITS / $denom))
-            max: MAX((SQC_DCACHE_HITS / $denom))
-            unit: (Req + $normUnit)
-          Misses - Non Duplicated:
-            avg: AVG((SQC_DCACHE_MISSES / $denom))
-            min: MIN((SQC_DCACHE_MISSES / $denom))
-            max: MAX((SQC_DCACHE_MISSES / $denom))
-            unit: (Req + $normUnit)
-          Misses- Duplicated:
-            avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom))
-            min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom))
-            max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom))
-            unit: (Req + $normUnit)
-          Cache Hit Rate:
-            avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
-            min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
-            max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
-            unit: pct
-          Read Req (Total):
-            avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
-              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
-            min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
-              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
-            max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
-              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
-            unit: (Req + $normUnit)
-          Atomic Req:
-            avg: AVG((SQC_DCACHE_ATOMIC / $denom))
-            min: MIN((SQC_DCACHE_ATOMIC / $denom))
-            max: MAX((SQC_DCACHE_ATOMIC / $denom))
-            unit: (Req + $normUnit)
-          Read Req (1 DWord):
-            avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom))
-            min: MIN((SQC_DCACHE_REQ_READ_1 / $denom))
-            max: MAX((SQC_DCACHE_REQ_READ_1 / $denom))
-            unit: (Req + $normUnit)
-          Read Req (2 DWord):
-            avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom))
-            min: MIN((SQC_DCACHE_REQ_READ_2 / $denom))
-            max: MAX((SQC_DCACHE_REQ_READ_2 / $denom))
-            unit: (Req + $normUnit)
-          Read Req (4 DWord):
-            avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom))
-            min: MIN((SQC_DCACHE_REQ_READ_4 / $denom))
-            max: MAX((SQC_DCACHE_REQ_READ_4 / $denom))
-            unit: (Req + $normUnit)
-          Read Req (8 DWord):
-            avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom))
-            min: MIN((SQC_DCACHE_REQ_READ_8 / $denom))
-            max: MAX((SQC_DCACHE_REQ_READ_8 / $denom))
-            unit: (Req + $normUnit)
-          Read Req (16 DWord):
-            avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom))
-            min: MIN((SQC_DCACHE_REQ_READ_16 / $denom))
-            max: MAX((SQC_DCACHE_REQ_READ_16 / $denom))
-            unit: (Req + $normUnit)
-        gfx942:
-          Req:
-            avg: AVG((SQC_DCACHE_REQ / $denom))
-            min: MIN((SQC_DCACHE_REQ / $denom))
-            max: MAX((SQC_DCACHE_REQ / $denom))
-            unit: (Req + $normUnit)
-          Hits:
-            avg: AVG((SQC_DCACHE_HITS / $denom))
-            min: MIN((SQC_DCACHE_HITS / $denom))
-            max: MAX((SQC_DCACHE_HITS / $denom))
-            unit: (Req + $normUnit)
-          Misses - Non Duplicated:
-            avg: AVG((SQC_DCACHE_MISSES / $denom))
-            min: MIN((SQC_DCACHE_MISSES / $denom))
-            max: MAX((SQC_DCACHE_MISSES / $denom))
-            unit: (Req + $normUnit)
-          Misses- Duplicated:
-            avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom))
-            min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom))
-            max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom))
-            unit: (Req + $normUnit)
-          Cache Hit Rate:
-            avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
-            min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
-            max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
-            unit: pct
-          Read Req (Total):
-            avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
-              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
-            min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
-              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
-            max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
-              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
-            unit: (Req + $normUnit)
-          Atomic Req:
-            avg: AVG((SQC_DCACHE_ATOMIC / $denom))
-            min: MIN((SQC_DCACHE_ATOMIC / $denom))
-            max: MAX((SQC_DCACHE_ATOMIC / $denom))
-            unit: (Req + $normUnit)
-          Read Req (1 DWord):
-            avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom))
-            min: MIN((SQC_DCACHE_REQ_READ_1 / $denom))
-            max: MAX((SQC_DCACHE_REQ_READ_1 / $denom))
-            unit: (Req + $normUnit)
-          Read Req (2 DWord):
-            avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom))
-            min: MIN((SQC_DCACHE_REQ_READ_2 / $denom))
-            max: MAX((SQC_DCACHE_REQ_READ_2 / $denom))
-            unit: (Req + $normUnit)
-          Read Req (4 DWord):
-            avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom))
-            min: MIN((SQC_DCACHE_REQ_READ_4 / $denom))
-            max: MAX((SQC_DCACHE_REQ_READ_4 / $denom))
-            unit: (Req + $normUnit)
-          Read Req (8 DWord):
-            avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom))
-            min: MIN((SQC_DCACHE_REQ_READ_8 / $denom))
-            max: MAX((SQC_DCACHE_REQ_READ_8 / $denom))
-            unit: (Req + $normUnit)
-          Read Req (16 DWord):
-            avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom))
-            min: MIN((SQC_DCACHE_REQ_READ_16 / $denom))
-            max: MAX((SQC_DCACHE_REQ_READ_16 / $denom))
-            unit: (Req + $normUnit)
-        gfx950:
-          Req:
-            avg: AVG((SQC_DCACHE_REQ / $denom))
-            min: MIN((SQC_DCACHE_REQ / $denom))
-            max: MAX((SQC_DCACHE_REQ / $denom))
-            unit: (Req + $normUnit)
-          Hits:
-            avg: AVG((SQC_DCACHE_HITS / $denom))
-            min: MIN((SQC_DCACHE_HITS / $denom))
-            max: MAX((SQC_DCACHE_HITS / $denom))
-            unit: (Req + $normUnit)
-          Misses - Non Duplicated:
-            avg: AVG((SQC_DCACHE_MISSES / $denom))
-            min: MIN((SQC_DCACHE_MISSES / $denom))
-            max: MAX((SQC_DCACHE_MISSES / $denom))
-            unit: (Req + $normUnit)
-          Misses- Duplicated:
-            avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom))
-            min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom))
-            max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom))
-            unit: (Req + $normUnit)
-          Cache Hit Rate:
-            avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
-            min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
-            max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
-            unit: pct
-          Read Req (Total):
-            avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
-              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
-            min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
-              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
-            max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
-              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
-            unit: (Req + $normUnit)
-          Atomic Req:
-            avg: AVG((SQC_DCACHE_ATOMIC / $denom))
-            min: MIN((SQC_DCACHE_ATOMIC / $denom))
-            max: MAX((SQC_DCACHE_ATOMIC / $denom))
-            unit: (Req + $normUnit)
-          Read Req (1 DWord):
-            avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom))
-            min: MIN((SQC_DCACHE_REQ_READ_1 / $denom))
-            max: MAX((SQC_DCACHE_REQ_READ_1 / $denom))
-            unit: (Req + $normUnit)
-          Read Req (2 DWord):
-            avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom))
-            min: MIN((SQC_DCACHE_REQ_READ_2 / $denom))
-            max: MAX((SQC_DCACHE_REQ_READ_2 / $denom))
-            unit: (Req + $normUnit)
-          Read Req (4 DWord):
-            avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom))
-            min: MIN((SQC_DCACHE_REQ_READ_4 / $denom))
-            max: MAX((SQC_DCACHE_REQ_READ_4 / $denom))
-            unit: (Req + $normUnit)
-          Read Req (8 DWord):
-            avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom))
-            min: MIN((SQC_DCACHE_REQ_READ_8 / $denom))
-            max: MAX((SQC_DCACHE_REQ_READ_8 / $denom))
-            unit: (Req + $normUnit)
-          Read Req (16 DWord):
-            avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom))
-            min: MIN((SQC_DCACHE_REQ_READ_16 / $denom))
-            max: MAX((SQC_DCACHE_REQ_READ_16 / $denom))
-            unit: (Req + $normUnit)
-        gfx908:
-          Req:
-            avg: AVG((SQC_DCACHE_REQ / $denom))
-            min: MIN((SQC_DCACHE_REQ / $denom))
-            max: MAX((SQC_DCACHE_REQ / $denom))
-            unit: (Req + $normUnit)
-          Hits:
-            avg: AVG((SQC_DCACHE_HITS / $denom))
-            min: MIN((SQC_DCACHE_HITS / $denom))
-            max: MAX((SQC_DCACHE_HITS / $denom))
-            unit: (Req + $normUnit)
-          Misses - Non Duplicated:
-            avg: AVG((SQC_DCACHE_MISSES / $denom))
-            min: MIN((SQC_DCACHE_MISSES / $denom))
-            max: MAX((SQC_DCACHE_MISSES / $denom))
-            unit: (Req + $normUnit)
-          Misses- Duplicated:
-            avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom))
-            min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom))
-            max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom))
-            unit: (Req + $normUnit)
-          Cache Hit Rate:
-            avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
-            min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
-            max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES)
-              + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None))
-            unit: pct
-          Read Req (Total):
-            avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
-              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
-            min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
-              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
-            max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4)
-              + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom))
-            unit: (Req + $normUnit)
-          Atomic Req:
-            avg: AVG((SQC_DCACHE_ATOMIC / $denom))
-            min: MIN((SQC_DCACHE_ATOMIC / $denom))
-            max: MAX((SQC_DCACHE_ATOMIC / $denom))
-            unit: (Req + $normUnit)
-          Read Req (1 DWord):
-            avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom))
-            min: MIN((SQC_DCACHE_REQ_READ_1 / $denom))
-            max: MAX((SQC_DCACHE_REQ_READ_1 / $denom))
-            unit: (Req + $normUnit)
-          Read Req (2 DWord):
-            avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom))
-            min: MIN((SQC_DCACHE_REQ_READ_2 / $denom))
-            max: MAX((SQC_DCACHE_REQ_READ_2 / $denom))
-            unit: (Req + $normUnit)
-          Read Req (4 DWord):
-            avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom))
-            min: MIN((SQC_DCACHE_REQ_READ_4 / $denom))
-            max: MAX((SQC_DCACHE_REQ_READ_4 / $denom))
-            unit: (Req + $normUnit)
-          Read Req (8 DWord):
-            avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom))
-            min: MIN((SQC_DCACHE_REQ_READ_8 / $denom))
-            max: MAX((SQC_DCACHE_REQ_READ_8 / $denom))
-            unit: (Req + $normUnit)
-          Read Req (16 DWord):
-            avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom))
-            min: MIN((SQC_DCACHE_REQ_READ_16 / $denom))
-            max: MAX((SQC_DCACHE_REQ_READ_16 / $denom))
-            unit: (Req + $normUnit)
-  - metric_table:
-      id: 1403
-      title: Scalar L1D Cache - L2 Interface
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      metric:
-        gfx90a:
-          sL1D-L2 BW:
-            avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-          Read Req:
-            avg: AVG((SQC_TC_DATA_READ_REQ / $denom))
-            min: MIN((SQC_TC_DATA_READ_REQ / $denom))
-            max: MAX((SQC_TC_DATA_READ_REQ / $denom))
-            unit: (Req + $normUnit)
-          Write Req:
-            avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom))
-            min: MIN((SQC_TC_DATA_WRITE_REQ / $denom))
-            max: MAX((SQC_TC_DATA_WRITE_REQ / $denom))
-            unit: (Req + $normUnit)
-          Atomic Req:
-            avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom))
-            min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom))
-            max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom))
-            unit: (Req + $normUnit)
-          Stall Cycles:
-            avg: AVG((SQC_TC_STALL / $denom))
-            min: MIN((SQC_TC_STALL / $denom))
-            max: MAX((SQC_TC_STALL / $denom))
-            unit: (Cycles + $normUnit)
-        gfx941:
-          sL1D-L2 BW:
-            avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-          Read Req:
-            avg: AVG((SQC_TC_DATA_READ_REQ / $denom))
-            min: MIN((SQC_TC_DATA_READ_REQ / $denom))
-            max: MAX((SQC_TC_DATA_READ_REQ / $denom))
-            unit: (Req + $normUnit)
-          Write Req:
-            avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom))
-            min: MIN((SQC_TC_DATA_WRITE_REQ / $denom))
-            max: MAX((SQC_TC_DATA_WRITE_REQ / $denom))
-            unit: (Req + $normUnit)
-          Atomic Req:
-            avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom))
-            min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom))
-            max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom))
-            unit: (Req + $normUnit)
-          Stall Cycles:
-            avg: AVG((SQC_TC_STALL / $denom))
-            min: MIN((SQC_TC_STALL / $denom))
-            max: MAX((SQC_TC_STALL / $denom))
-            unit: (Cycles + $normUnit)
-        gfx940:
-          sL1D-L2 BW:
-            avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-          Read Req:
-            avg: AVG((SQC_TC_DATA_READ_REQ / $denom))
-            min: MIN((SQC_TC_DATA_READ_REQ / $denom))
-            max: MAX((SQC_TC_DATA_READ_REQ / $denom))
-            unit: (Req + $normUnit)
-          Write Req:
-            avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom))
-            min: MIN((SQC_TC_DATA_WRITE_REQ / $denom))
-            max: MAX((SQC_TC_DATA_WRITE_REQ / $denom))
-            unit: (Req + $normUnit)
-          Atomic Req:
-            avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom))
-            min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom))
-            max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom))
-            unit: (Req + $normUnit)
-          Stall Cycles:
-            avg: AVG((SQC_TC_STALL / $denom))
-            min: MIN((SQC_TC_STALL / $denom))
-            max: MAX((SQC_TC_STALL / $denom))
-            unit: (Cycles + $normUnit)
-        gfx942:
-          sL1D-L2 BW:
-            avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-          Read Req:
-            avg: AVG((SQC_TC_DATA_READ_REQ / $denom))
-            min: MIN((SQC_TC_DATA_READ_REQ / $denom))
-            max: MAX((SQC_TC_DATA_READ_REQ / $denom))
-            unit: (Req + $normUnit)
-          Write Req:
-            avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom))
-            min: MIN((SQC_TC_DATA_WRITE_REQ / $denom))
-            max: MAX((SQC_TC_DATA_WRITE_REQ / $denom))
-            unit: (Req + $normUnit)
-          Atomic Req:
-            avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom))
-            min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom))
-            max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom))
-            unit: (Req + $normUnit)
-          Stall Cycles:
-            avg: AVG((SQC_TC_STALL / $denom))
-            min: MIN((SQC_TC_STALL / $denom))
-            max: MAX((SQC_TC_STALL / $denom))
-            unit: (Cycles + $normUnit)
-        gfx950:
-          sL1D-L2 BW:
-            avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-          Read Req:
-            avg: AVG((SQC_TC_DATA_READ_REQ / $denom))
-            min: MIN((SQC_TC_DATA_READ_REQ / $denom))
-            max: MAX((SQC_TC_DATA_READ_REQ / $denom))
-            unit: (Req + $normUnit)
-          Write Req:
-            avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom))
-            min: MIN((SQC_TC_DATA_WRITE_REQ / $denom))
-            max: MAX((SQC_TC_DATA_WRITE_REQ / $denom))
-            unit: (Req + $normUnit)
-          Atomic Req:
-            avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom))
-            min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom))
-            max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom))
-            unit: (Req + $normUnit)
-          Stall Cycles:
-            avg: AVG((SQC_TC_STALL / $denom))
-            min: MIN((SQC_TC_STALL / $denom))
-            max: MAX((SQC_TC_STALL / $denom))
-            unit: (Cycles + $normUnit)
-        gfx908:
-          sL1D-L2 BW:
-            avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-          Read Req:
-            avg: AVG((SQC_TC_DATA_READ_REQ / $denom))
-            min: MIN((SQC_TC_DATA_READ_REQ / $denom))
-            max: MAX((SQC_TC_DATA_READ_REQ / $denom))
-            unit: (Req + $normUnit)
-          Write Req:
-            avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom))
-            min: MIN((SQC_TC_DATA_WRITE_REQ / $denom))
-            max: MAX((SQC_TC_DATA_WRITE_REQ / $denom))
-            unit: (Req + $normUnit)
-          Atomic Req:
-            avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom))
-            min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom))
-            max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom))
-            unit: (Req + $normUnit)
-          Stall Cycles:
-            avg: AVG((SQC_TC_STALL / $denom))
-            min: MIN((SQC_TC_STALL / $denom))
-            max: MAX((SQC_TC_STALL / $denom))
-            unit: (Cycles + $normUnit)
-  metrics_description:
-    Bandwidth Utilization:
-      plain: The number of bytes looked up in the sL1D cache, as a percent of the
-        peak theoretical bandwidth. Calculated as the ratio of sL1D requests over
-        the total sL1D cycles.
-      rst: The number of bytes looked up in the sL1D cache, as a percent of the peak theoretical
-        bandwidth. Calculated as the ratio of sL1D requests over the :ref:`total
-        sL1D cycles <total-sl1d-cycles>`.
-      unit: Percent
-    Cache Hit Rate:
-      plain: Indicates the percent of sL1D requests that hit on a previously loaded
-        line the cache. The ratio of the number of sL1D requests that hit over the
-        number of all sL1D requests.
-      rst: Indicates the percent of sL1D requests that hit on a previously loaded line
-        the cache. The ratio of the number of sL1D requests that hit [#sl1d-cache]_
-        over the number of all sL1D requests.
-      unit: Percent
-    sL1D-L2 BW Utilization:
-      plain: The percentage of the peak theoretical sL1D - L2 interface bandwidth acheived.
-        Calculated as total number of bytes read from, written to, or atomically updated
-        across the sL1D - L2 interface.
-      rst: The percentage of the peak theoretical sL1D - L2 interface bandwidth acheived.
-        Calculated as total number of bytes read from, written to, or atomically updated
-        across the sL1D - L2 interface.
-      unit: Percent
-    sL1D-L2 BW:
-      plain: |-
-        The total number of bytes read from, written to, or atomically updated
-        across the sL1D\u2194L2 interface, divided by total duration. Note that sL1D
-        writes and atomics are typically unused on current CDNA accelerators, so
-        in the majority of cases this can be interpreted as an sL1D\u2192L2 read
-        bandwidth.
-      rst: |-
-        The total number of bytes read from, written to, or atomically updated
-        across the sL1D\u2194:doc:`L2 <l2-cache>` interface, divided by total duration.
-        Note that sL1D writes and atomics are typically
-        unused on current CDNA accelerators, so in the majority of cases this can
-        be interpreted as an sL1D\u2192L2 read bandwidth.
-      unit: Gbps
-    Req:
-      plain: The total number of requests, of any size or type, made to the sL1D per
-        normalization unit.
-      rst: The total number of requests, of any size or type, made to the sL1D per :ref:`normalization
-        unit <normalization-units>`.
-      unit: Requests per normalization unit
-    Hits:
-      plain: The total number of sL1D requests that hit on a previously loaded cache
-        line, per normalization unit.
-      rst: The total number of sL1D requests that hit on a previously loaded cache line,
-        per :ref:`normalization unit <normalization-units>`.
-      unit: Requests per normalization unit
-    Misses - Non Duplicated:
-      plain: |-
-        The total number of sL1D requests that missed on a cache line that was
-        not already pending due to another request, per normalization unit.
-      rst: The total number of sL1D requests that missed on a cache line that *was not*
-        already pending due to another request, per :ref:`normalization unit <normalization-units>`.
-        See :ref:`desc-sl1d-sol` for more detail.
-      unit: Requests per normalization unit
-    Misses- Duplicated:
-      plain: The total number of sL1D requests that missed on a cache line that was
-        already pending due to another request, per normalization unit.
-      rst: The total number of sL1D requests that missed on a cache line that *was* already
-        pending due to another request, per :ref:`normalization unit <normalization-units>`.
-        See :ref:`desc-sl1d-sol` for more detail.
-      unit: Requests per normalization unit
-    Read Req (Total):
-      plain: The total number of sL1D read requests of any size, per normalization
-        unit.
-      rst: The total number of sL1D read requests of any size, per :ref:`normalization
-        unit <normalization-units>`.
-      unit: Requests per normalization unit
-    Atomic Req:
-      plain: The total number of atomic requests from sL1D to the L2, per normalization
-        unit. Typically unused on current CDNA accelerators.
-      rst: The total number of atomic requests from sL1D to the :doc:`L2 <l2-cache>`,
-        per :ref:`normalization unit <normalization-units>`. Typically unused on current
-        CDNA accelerators.
-      unit: Requests per normalization unit
-    Read Req (1 DWord):
-      plain: The total number of sL1D read requests made for a single dword of data
-        (4B), per normalization unit.
-      rst: The total number of sL1D read requests made for a single dword of data (4B),
-        per :ref:`normalization unit <normalization-units>`.
-      unit: Requests per normalization unit
-    Read Req (2 DWord):
-      plain: The total number of sL1D read requests made for a two dwords of data
-        (8B), per normalization unit.
-      rst: The total number of sL1D read requests made for a two dwords of data (8B),
-        per :ref:`normalization unit <normalization-units>`.
-      unit: Requests per normalization unit
-    Read Req (4 DWord):
-      plain: The total number of sL1D read requests made for a four dwords of data
-        (16B), per normalization unit.
-      rst: The total number of sL1D read requests made for a four dwords of data (16B),
-        per :ref:`normalization unit <normalization-units>`.
-      unit: Requests per normalization unit
-    Read Req (8 DWord):
-      plain: The total number of sL1D read requests made for a eight dwords of data
-        (32B), per normalization unit.
-      rst: The total number of sL1D read requests made for a eight dwords of data (32B),
-        per :ref:`normalization unit <normalization-units>`.
-      unit: Requests per normalization unit
-    Read Req (16 DWord):
-      plain: The total number of sL1D read requests made for a sixteen dwords of data
-        (64B), per normalization unit.
-      rst: The total number of sL1D read requests made for a sixteen dwords of data (64B),
-        per :ref:`normalization unit <normalization-units>`.
-      unit: Requests per normalization unit
-    Read Req:
-      plain: The total number of read requests from sL1D to the L2 per normalization
-        unit.
-      rst: The total number of read requests from sL1D to the :doc:`L2 <l2-cache>`, per
-        :ref:`normalization unit <normalization-units>`.
-      unit: Requests per normalization unit
-    Write Req:
-      plain: The total number of write requests from sL1D to the L2, per normalization
-        unit. Typically unused on current CDNA accelerators.
-      rst: The total number of write requests from sL1D to the :doc:`L2 <l2-cache>`, per
-        :ref:`normalization unit <normalization-units>`. Typically unused on current
-        CDNA accelerators.
-      unit: Requests per normalization unit
-    Stall Cycles:
-      plain: |-
-        The total number of cycles the sL1D\u2194L2 interface was stalled, per
-        normalization unit.
-      rst: |-
-        The total number of cycles the sL1D\u2194 :doc:`L2 <l2-cache>` interface
-        was stalled, per :ref:`normalization unit <normalization-units>`.
-      unit: Cycles per normalization unit
-- id: 1500
-  title: Address Processing Unit and Data Return Path (TA/TD)
-  data source:
-  - metric_table:
-      id: 1501
-      title: Busy and stall metrics
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      metric:
-        gfx90a:
-          Address Processing Unit Busy:
-            avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: pct
-          Address Stall:
-            avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            unit: pct
-          Data Stall:
-            avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            unit: pct
-          "Data-Processor \u2192 Address Stall":
-            avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            unit: pct
-          "Sequencer \u2192 TA Address Stall":
-            avg: AVG((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
-            min: MIN((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
-            max: MAX((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
-            unit: (Cycles + $normUnit)
-          "Sequencer \u2192 TA Command Stall":
-            avg: AVG((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
-            min: MIN((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
-            max: MAX((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
-            unit: (Cycles + $normUnit)
-          "Sequencer \u2192 TA Data Stall":
-            avg: AVG((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
-            min: MIN((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
-            max: MAX((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
-            unit: (Cycles + $normUnit)
-        gfx941:
-          Address Processing Unit Busy:
-            avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: pct
-          Address Stall:
-            avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            unit: pct
-          Data Stall:
-            avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            unit: pct
-          "Data-Processor \u2192 Address Stall":
-            avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            unit: pct
-          "Sequencer \u2192 TA Address Stall":
-            avg: AVG((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
-            min: MIN((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
-            max: MAX((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
-            unit: (Cycles + $normUnit)
-          "Sequencer \u2192 TA Command Stall":
-            avg: AVG((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
-            min: MIN((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
-            max: MAX((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
-            unit: (Cycles + $normUnit)
-          "Sequencer \u2192 TA Data Stall":
-            avg: AVG((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
-            min: MIN((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
-            max: MAX((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
-            unit: (Cycles + $normUnit)
-        gfx940:
-          Address Processing Unit Busy:
-            avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: pct
-          Address Stall:
-            avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            unit: pct
-          Data Stall:
-            avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            unit: pct
-          "Data-Processor \u2192 Address Stall":
-            avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            unit: pct
-          "Sequencer \u2192 TA Address Stall":
-            avg: AVG((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
-            min: MIN((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
-            max: MAX((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
-            unit: (Cycles + $normUnit)
-          "Sequencer \u2192 TA Command Stall":
-            avg: AVG((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
-            min: MIN((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
-            max: MAX((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
-            unit: (Cycles + $normUnit)
-          "Sequencer \u2192 TA Data Stall":
-            avg: AVG((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
-            min: MIN((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
-            max: MAX((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
-            unit: (Cycles + $normUnit)
-        gfx942:
-          Address Processing Unit Busy:
-            avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: pct
-          Address Stall:
-            avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            unit: pct
-          Data Stall:
-            avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            unit: pct
-          "Data-Processor \u2192 Address Stall":
-            avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            unit: pct
-          "Sequencer \u2192 TA Address Stall":
-            avg: AVG((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
-            min: MIN((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
-            max: MAX((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
-            unit: (Cycles + $normUnit)
-          "Sequencer \u2192 TA Command Stall":
-            avg: AVG((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
-            min: MIN((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
-            max: MAX((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
-            unit: (Cycles + $normUnit)
-          "Sequencer \u2192 TA Data Stall":
-            avg: AVG((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
-            min: MIN((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
-            max: MAX((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
-            unit: (Cycles + $normUnit)
-        gfx950:
-          Address Processing Unit Busy:
-            avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: pct
-          Address Stall:
-            avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            unit: pct
-          Data Stall:
-            avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            unit: pct
-          "Data-Processor \u2192 Address Stall":
-            avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            unit: pct
-          "Sequencer \u2192 TA Address Stall":
-            avg: AVG((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
-            min: MIN((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
-            max: MAX((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom))
-            unit: (Cycles + $normUnit)
-          "Sequencer \u2192 TA Command Stall":
-            avg: AVG((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
-            min: MIN((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
-            max: MAX((SQ_VMEM_TA_CMD_FIFO_FULL / $denom))
-            unit: (Cycles + $normUnit)
-          "Sequencer \u2192 TA Data Stall":
-            avg: AVG((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
-            min: MIN((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
-            max: MAX((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom))
-            unit: (Cycles + $normUnit)
-        gfx908:
-          Address Processing Unit Busy:
-            avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: pct
-          Address Stall:
-            avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            unit: pct
-          Data Stall:
-            avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            unit: pct
-          "Data-Processor \u2192 Address Stall":
-            avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD
-              * $cu_per_gpu)))
-            unit: pct
-  - metric_table:
-      id: 1502
-      title: Instruction counts
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      metric:
-        gfx90a:
-          Total Instructions:
-            avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Global/Generic Instructions:
-            avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Global/Generic Read Instructions:
-            avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Global/Generic Write Instructions:
-            avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Global/Generic Atomic Instructions:
-            avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Spill/Stack Instructions:
-            avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Spill/Stack Read Instructions:
-            avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Spill/Stack Write Instructions:
-            avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Spill/Stack Atomic Instructions:
-            avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            max: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-        gfx941:
-          Total Instructions:
-            avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Global/Generic Instructions:
-            avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Global/Generic Read Instructions:
-            avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Global/Generic Write Instructions:
-            avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Global/Generic Atomic Instructions:
-            avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Spill/Stack Instructions:
-            avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Spill/Stack Read Instructions:
-            avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Spill/Stack Write Instructions:
-            avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Spill/Stack Atomic Instructions:
-            avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-        gfx940:
-          Total Instructions:
-            avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Global/Generic Instructions:
-            avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Global/Generic Read Instructions:
-            avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Global/Generic Write Instructions:
-            avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Global/Generic Atomic Instructions:
-            avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Spill/Stack Instructions:
-            avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Spill/Stack Read Instructions:
-            avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Spill/Stack Write Instructions:
-            avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Spill/Stack Atomic Instructions:
-            avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-        gfx942:
-          Total Instructions:
-            avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Global/Generic Instructions:
-            avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Global/Generic Read Instructions:
-            avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Global/Generic Write Instructions:
-            avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Global/Generic Atomic Instructions:
-            avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Spill/Stack Instructions:
-            avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Spill/Stack Read Instructions:
-            avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Spill/Stack Write Instructions:
-            avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Spill/Stack Atomic Instructions:
-            avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-        gfx950:
-          Total Instructions:
-            avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Global/Generic Instructions:
-            avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Global/Generic Read Instructions:
-            avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Global/Generic Read Instructions for LDS:
-            avg: AVG((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Global/Generic Write Instructions:
-            avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Global/Generic Atomic Instructions:
-            avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Spill/Stack Instructions:
-            avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Spill/Stack Read Instructions:
-            avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Spill/Stack Read Instructions for LDS:
-            avg: AVG((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Spill/Stack Write Instructions:
-            avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Spill/Stack Atomic Instructions:
-            avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-        gfx908:
-          Total Instructions:
-            avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Global/Generic Instructions:
-            avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Global/Generic Read Instructions:
-            avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Global/Generic Write Instructions:
-            avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Global/Generic Atomic Instructions:
-            avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Spill/Stack Instructions:
-            avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Spill/Stack Read Instructions:
-            avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Spill/Stack Write Instructions:
-            avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Spill/Stack Atomic Instructions:
-            avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom))
-            unit: (Instructions + $normUnit)
-  - metric_table:
-      id: 1503
-      title: Spill and stack metrics
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      metric:
-        gfx90a:
-          Spill/Stack Total Cycles:
-            avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
-            min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
-            max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
-            unit: (Cycles + $normUnit)
-          Spill/Stack Coalesced Read:
-            avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
-            min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
-            max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
-            unit: (Cycles + $normUnit)
-          Spill/Stack Coalesced Write:
-            avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
-            min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
-            max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
-            unit: (Cycles + $normUnit)
-        gfx941:
-          Spill/Stack Total Cycles:
-            avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
-            min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
-            max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
-            unit: (Cycles + $normUnit)
-          Spill/Stack Coalesced Read:
-            avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
-            min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
-            max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
-            unit: (Cycles + $normUnit)
-          Spill/Stack Coalesced Write:
-            avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
-            min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
-            max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
-            unit: (Cycles + $normUnit)
-        gfx940:
-          Spill/Stack Total Cycles:
-            avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
-            min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
-            max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
-            unit: (Cycles + $normUnit)
-          Spill/Stack Coalesced Read:
-            avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
-            min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
-            max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
-            unit: (Cycles + $normUnit)
-          Spill/Stack Coalesced Write:
-            avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
-            min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
-            max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
-            unit: (Cycles + $normUnit)
-        gfx942:
-          Spill/Stack Total Cycles:
-            avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
-            min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
-            max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
-            unit: (Cycles + $normUnit)
-          Spill/Stack Coalesced Read:
-            avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
-            min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
-            max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
-            unit: (Cycles + $normUnit)
-          Spill/Stack Coalesced Write:
-            avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
-            min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
-            max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
-            unit: (Cycles + $normUnit)
-        gfx950:
-          Spill/Stack Total Cycles:
-            avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
-            min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
-            max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
-            unit: (Cycles + $normUnit)
-          Spill/Stack Coalesced Read:
-            avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
-            min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
-            max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
-            unit: (Cycles + $normUnit)
-          Spill/Stack Coalesced Write:
-            avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
-            min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
-            max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
-            unit: (Cycles + $normUnit)
-        gfx908:
-          Spill/Stack Total Cycles:
-            avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
-            min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
-            max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom))
-            unit: (Cycles + $normUnit)
-          Spill/Stack Coalesced Read:
-            avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
-            min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
-            max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom))
-            unit: (Cycles + $normUnit)
-          Spill/Stack Coalesced Write:
-            avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
-            min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
-            max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom))
-            unit: (Cycles + $normUnit)
-  - metric_table:
-      id: 1504
-      title: Vector L1 data-return path or Texture Data (TD)
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      metric:
-        gfx90a:
-          Data-Return Busy:
-            avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: pct
-          "Cache RAM \u2192 Data-Return Stall":
-            avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: pct
-          "Workgroup manager \u2192 Data-Return Stall":
-            avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: pct
-          Coalescable Instructions:
-            avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom))
-            min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom))
-            max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Read Instructions:
-            avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
-              / $denom))
-            min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
-              / $denom))
-            max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
-              / $denom))
-            unit: (Instructions + $normUnit)
-          Write Instructions:
-            avg: AVG((TD_STORE_WAVEFRONT_sum / $denom))
-            min: MIN((TD_STORE_WAVEFRONT_sum / $denom))
-            max: MAX((TD_STORE_WAVEFRONT_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Atomic Instructions:
-            avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom))
-            min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom))
-            max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom))
-            unit: (Instructions + $normUnit)
-        gfx941:
-          Data-Return Busy:
-            avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: pct
-          "Cache RAM \u2192 Data-Return Stall":
-            avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: pct
-          "Workgroup manager \u2192 Data-Return Stall":
-            avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: pct
-          Coalescable Instructions:
-            avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom))
-            min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom))
-            max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Read Instructions:
-            avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
-              / $denom))
-            min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
-              / $denom))
-            max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
-              / $denom))
-            unit: (Instructions + $normUnit)
-          Write Instructions:
-            avg: AVG((TD_STORE_WAVEFRONT_sum / $denom))
-            min: MIN((TD_STORE_WAVEFRONT_sum / $denom))
-            max: MAX((TD_STORE_WAVEFRONT_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Atomic Instructions:
-            avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom))
-            min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom))
-            max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom))
-            unit: (Instructions + $normUnit)
-        gfx940:
-          Data-Return Busy:
-            avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: pct
-          "Cache RAM \u2192 Data-Return Stall":
-            avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: pct
-          "Workgroup manager \u2192 Data-Return Stall":
-            avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: pct
-          Coalescable Instructions:
-            avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom))
-            min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom))
-            max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Read Instructions:
-            avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
-              / $denom))
-            min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
-              / $denom))
-            max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
-              / $denom))
-            unit: (Instructions + $normUnit)
-          Write Instructions:
-            avg: AVG((TD_STORE_WAVEFRONT_sum / $denom))
-            min: MIN((TD_STORE_WAVEFRONT_sum / $denom))
-            max: MAX((TD_STORE_WAVEFRONT_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Atomic Instructions:
-            avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom))
-            min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom))
-            max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom))
-            unit: (Instructions + $normUnit)
-        gfx942:
-          Data-Return Busy:
-            avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: pct
-          "Cache RAM \u2192 Data-Return Stall":
-            avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: pct
-          "Workgroup manager \u2192 Data-Return Stall":
-            avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: pct
-          Coalescable Instructions:
-            avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom))
-            min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom))
-            max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Read Instructions:
-            avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
-              / $denom))
-            min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
-              / $denom))
-            max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
-              / $denom))
-            unit: (Instructions + $normUnit)
-          Write Instructions:
-            avg: AVG((TD_STORE_WAVEFRONT_sum / $denom))
-            min: MIN((TD_STORE_WAVEFRONT_sum / $denom))
-            max: MAX((TD_STORE_WAVEFRONT_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Atomic Instructions:
-            avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom))
-            min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom))
-            max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom))
-            unit: (Instructions + $normUnit)
-        gfx950:
-          Data-Return Busy:
-            avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: pct
-          "Cache RAM \u2192 Data-Return Stall":
-            avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: pct
-          "Workgroup manager \u2192 Data-Return Stall":
-            avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: pct
-          Coalescable Instructions:
-            avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom))
-            min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom))
-            max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Read Instructions:
-            avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
-              / $denom))
-            min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
-              / $denom))
-            max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
-              / $denom))
-            unit: (Instructions + $normUnit)
-          Write Instructions:
-            avg: AVG((TD_STORE_WAVEFRONT_sum / $denom))
-            min: MIN((TD_STORE_WAVEFRONT_sum / $denom))
-            max: MAX((TD_STORE_WAVEFRONT_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Atomic Instructions:
-            avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom))
-            min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom))
-            max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Write Ack Instructions:
-            avg: AVG((TD_WRITE_ACKT_WAVEFRONT_sum / $denom))
-            min: MIN((TD_WRITE_ACKT_WAVEFRONT_sum / $denom))
-            max: MAX((TD_WRITE_ACKT_WAVEFRONT_sum / $denom))
-            unit: (Instructions + $normUnit)
-        gfx908:
-          Data-Return Busy:
-            avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: pct
-          "Cache RAM \u2192 Data-Return Stall":
-            avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
-            unit: pct
-          Coalescable Instructions:
-            avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom))
-            min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom))
-            max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Read Instructions:
-            avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
-              / $denom))
-            min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
-              / $denom))
-            max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum)
-              / $denom))
-            unit: (Instructions + $normUnit)
-          Write Instructions:
-            avg: AVG((TD_STORE_WAVEFRONT_sum / $denom))
-            min: MIN((TD_STORE_WAVEFRONT_sum / $denom))
-            max: MAX((TD_STORE_WAVEFRONT_sum / $denom))
-            unit: (Instructions + $normUnit)
-          Atomic Instructions:
-            avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom))
-            min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom))
-            max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom))
-            unit: (Instructions + $normUnit)
-  metrics_description:
-    Address Processing Unit Busy:
-      plain: Percent of the total CU cycles the address processor was busy
-      rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor
-        was busy
-      unit: Percent
-    Address Stall:
-      plain: Percent of the total CU cycles the address processor was stalled from
-        sending address requests further into the vL1D pipeline.
-      rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor
-        was stalled from sending address requests further into the vL1D pipeline
-      unit: Percent
-    Data Stall:
-      plain: Percent of the total CU cycles the address processor was stalled from
-        sending write/atomic data further into the vL1D pipeline.
-      rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor
-        was stalled from sending write/atomic data further into the vL1D pipeline
-      unit: Percent
-    "Data-Processor \u2192 Address Stall":
-      plain: Percent of total CU cycles the address processor was stalled waiting
-        to send command data to the data processor.
-      rst: Percent of :ref:`total CU cycles <total-cu-cycles>` the address processor was
-        stalled waiting to send command data to the :ref:`data processor <desc-td>`
-      unit: Percent
-    Total Instructions:
-      plain: The total number of memory instructions executed by the address processer
-        over all compute units on the accelerator, per normalization unit.
-      rst: The total number of memory instructions executed by the address processer
-        over all compute units on the accelerator, per normalization unit.
-      unit: Instructions per normalization unit
-    Global/Generic Instructions:
-      plain: The total number of global & generic memory instructions executed on
-        all compute units on the accelerator, per normalization unit.
-      rst: The total number of global & generic memory instructions executed on all :doc:`compute
-        units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
-      unit: Instructions per normalization unit
-    Global/Generic Read Instructions:
-      plain: The total number of global & generic memory read instructions executed
-        on all compute units on the accelerator, per normalization unit.
-      rst: The total number of global & generic memory read instructions executed
-        on all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
-        unit <normalization-units>`.
-      unit: Instructions per normalization unit
-    Global/Generic Write Instructions:
-      plain: The total number of global & generic memory write instructions executed
-        on all compute units on the accelerator, per normalization unit.
-      rst: The total number of global & generic memory write instructions executed on
-        all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization
-        unit <normalization-units>`.
-      unit: Instructions per normalization unit
-    Global/Generic Atomic Instructions:
-      plain: The total number of global & generic memory atomic (with and without
-        return) instructions executed on all compute units on the accelerator, per
-        normalization unit.
-      rst: The total number of global & generic memory atomic (with and without return)
-        instructions executed on all :doc:`compute units <compute-unit>` on the accelerator,
-        per :ref:`normalization unit <normalization-units>`.
-      unit: Instructions per normalization unit
-    Spill/Stack Instructions:
-      plain: The total number of spill/stack memory instructions executed on all compute
-        units on the accelerator, per normalization unit.
-      rst: The total number of spill/stack memory instructions executed on all :doc:`compute
-        units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
-      unit: Instructions per normalization unit
-    Spill/Stack Read Instructions:
-      plain: The total number of spill/stack memory read instructions executed on
-        all compute units on the accelerator, per normalization unit.
-      rst: The total number of spill/stack memory read instructions executed on all :doc:`compute
-        units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
-      unit: Instructions per normalization unit
-    Spill/Stack Write Instructions:
-      plain: The total number of spill/stack memory write instructions executed on
-        all compute units on the accelerator, per normalization unit.
-      rst: The total number of spill/stack memory write instructions executed on all :doc:`compute
-        units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
-      unit: Instructions per normalization unit
-    Spill/Stack Atomic Instructions:
-      plain: The total number of spill/stack memory atomic (with and without return)
-        instructions executed on all compute units on the accelerator, per normalization
-        unit. Typically unused as these memory operations are typically used to implement
-        thread-local storage.
-      rst: The total number of spill/stack memory atomic (with and without return) instructions
-        executed on all :doc:`compute units <compute-unit>` on the accelerator, per
-        :ref:`normalization unit <normalization-units>`. Typically unused as these
-        memory operations are typically used to implement thread-local storage.
-      unit: Instructions per normalization unit
-    Spill/Stack Total Cycles:
-      plain: The number of cycles the address processing unit spent working on spill/stack
-        instructions, per normalization unit.
-      rst: The number of cycles the address processing unit spent working on spill/stack
-        instructions, per :ref:`normalization unit <normalization-units>`.
-      unit: Cycles per normalization unit
-    Spill/Stack Coalesced Read:
-      plain: The number of cycles the address processing unit spent working on coalesced
-        spill/stack read instructions, per normalization unit.
-      rst: The number of cycles the address processing unit spent working on coalesced
-        spill/stack read instructions, per :ref:`normalization unit <normalization-units>`.
-      unit: Cycles per normalization unit
-    Spill/Stack Coalesced Write:
-      plain: The number of cycles the address processing unit spent working on coalesced
-        spill/stack write instructions, per normalization unit.
-      rst: The number of cycles the address processing unit spent working on coalesced
-        spill/stack write instructions, per :ref:`normalization unit <normalization-units>`.
-      unit: Cycles per normalization unit
-    Data-Return Busy:
-      plain: Percent of the total CU cycles the data-return unit was busy processing
-        or waiting on data to return to the CU.
-      rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return unit
-        was busy processing or waiting on data to return to the :doc:`CU <compute-unit>`.
-      unit: Percent
-    "Cache RAM \u2192 Data-Return Stall":
-      plain: Percent of the total CU cycles the data-return unit was stalled on data
-        to be returned from the vL1D Cache RAM.
-      rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return unit
-        was stalled on data to be returned from the :ref:`vL1D Cache RAM <desc-tc>`.
-      unit: Percent
-    "Workgroup manager \u2192 Data-Return Stall":
-      plain: Percent of the total CU cycles the data-return unit was stalled by the
-        workgroup manager due to initialization of registers as a part of launching
-        new workgroups.
-      rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return unit
-        was stalled by the :ref:`workgroup manager <desc-spi>` due to initialization
-        of registers as a part of launching new workgroups.
-      unit: Percent
-    Coalescable Instructions:
-      plain: The number of instructions submitted to the data-return unit by the address
-        processor that were found to be coalescable, per normalization unit.
-      rst: The number of instructions submitted to the :ref:`data-return unit <desc-td>`
-        by the :ref:`address processor <desc-ta>` that were found to be coalescable,
-        per :ref:`normalization unit <normalization-units>`.
-      unit: Instructions per normalization unit
-    Read Instructions:
-      plain: The number of read instructions submitted to the data-return unit by
-        the address processor summed over all compute units on the accelerator, per
-        normalization unit. This is expected to be the sum of global/generic and spill/stack
-        reads in the address processor.
-      rst: The number of read instructions submitted to the :ref:`data-return unit
-        <desc-td>` by the :ref:`address processor <desc-ta>` summed over all :doc:`compute
-        units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
-        This is expected to be the sum of global/generic and spill/stack reads in
-        the :ref:`address processor <desc-ta>`.
-      unit: Instructions per normalization unit
-    Write Instructions:
-      plain: The number of store instructions submitted to the data-return unit by
-        the address processor summed over all compute units on the accelerator, per
-        normalization unit. This is expected to be the sum of global/generic and spill/stack
-        stores in the address processor.
-      rst: The number of store instructions submitted to the :ref:`data-return unit
-        <desc-td>` by the :ref:`address processor <desc-ta>` summed over all :doc:`compute
-        units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
-        This is expected to be the sum of global/generic and spill/stack stores counted
-        by the :ref:`vL1D cache-front-end <ta-instruction-counts>`.
-      unit: Instructions per normalization unit
-    Atomic Instructions:
-      plain: The number of atomic instructions submitted to the data-return unit by
-        the address processor summed over all compute units on the accelerator, per
-        normalization unit. This is expected to be the sum of global/generic and spill/stack
-        atomics in the address processor.
-      rst: The number of atomic instructions submitted to the :ref:`data-return unit
-        <desc-td>` by the :ref:`address processor <desc-ta>` summed over all :doc:`compute
-        units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`.
-        This is expected to be the sum of global/generic and spill/stack atomics
-        in the :ref:`address processor <desc-ta>`.
-      unit: Instructions per normalization unit
-    Write Ack Instructions:
-      plain: The total number of write acknowledgements submitted by data-return
-        unit to SQ, summed over all compute units on the accelerator, per normalization
-        unit.
-      rst: The total number of write acknowledgements submitted by :ref:`data-return unit <desc-td>`
-        to SQ, summed over all compute units on the accelerator, per normalization unit.
-      unit: Instructions per normalization unit
-- id: 1600
-  title: Vector L1 Data Cache
-  data source:
-  - metric_table:
-      id: 1601
-      title: vL1D Speed-of-Light
-      header:
-        metric: Metric
-        value: Avg
-        unit: Unit
-      metric:
-        gfx90a:
-          Hit rate:
-            value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-            unit: Pct of Peak
-          Bandwidth Utilization:
-            value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk / 1000) * 64) * $cu_per_gpu))
-            unit: Pct of Peak
-          Utilization:
-            value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
-              != 0) else None))
-            unit: Pct of Peak
-          Coalescing:
-            value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum
-              * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None))
-            unit: Pct of Peak
-        gfx941:
-          Hit rate:
-            value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-            unit: Pct of Peak
-          Bandwidth Utilization:
-            value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu))
-            unit: Pct of Peak
-          Utilization:
-            value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
-              != 0) else None))
-            unit: Pct of Peak
-          Coalescing:
-            value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum
-              * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None))
-            unit: Pct of Peak
-        gfx940:
-          Hit rate:
-            value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-            unit: Pct of Peak
-          Bandwidth Utilization:
-            value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu))
-            unit: Pct of Peak
-          Utilization:
-            value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
-              != 0) else None))
-            unit: Pct of Peak
-          Coalescing:
-            value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum
-              * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None))
-            unit: Pct of Peak
-        gfx942:
-          Hit rate:
-            value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-            unit: Pct of Peak
-          Bandwidth Utilization:
-            value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu))
-            unit: Pct of Peak
-          Utilization:
-            value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
-              != 0) else None))
-            unit: Pct of Peak
-          Coalescing:
-            value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum
-              * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None))
-            unit: Pct of Peak
-        gfx950:
-          Hit rate:
-            value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-            unit: Pct of Peak
-          Bandwidth Utilization:
-            value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu))
-            unit: Pct of Peak
-          Utilization:
-            value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
-              != 0) else None))
-            unit: Pct of Peak
-          Coalescing:
-            value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum
-              * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None))
-            unit: Pct of Peak
-        gfx908:
-          Hit rate:
-            value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-            unit: Pct of Peak
-          Bandwidth Utilization:
-            value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp
-              - Start_Timestamp)))) / ((($max_sclk / 1000) * 64) * $cu_per_gpu))
-            unit: Pct of Peak
-          Utilization:
-            value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
-              != 0) else None))
-            unit: Pct of Peak
-          Coalescing:
-            value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum
-              * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None))
-            unit: Pct of Peak
-      comparable: false
-      cli_style: simple_bar
-      tui_style: simple_bar
-  - metric_table:
-      id: 1602
-      title: vL1D cache stall metrics
-      header:
-        metric: Metric
-        expr: Expression
-      metric:
-        gfx90a:
-          Stalled on L2 Data:
-            expr: (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
-              != 0) else None)
-          Stalled on L2 Req:
-            expr: (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
-              != 0) else None)
-          Tag RAM Stall (Read):
-            expr: (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
-              if (TCP_GATE_EN1_sum != 0) else None)
-          Tag RAM Stall (Write):
-            expr: (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
-              if (TCP_GATE_EN1_sum != 0) else None)
-          Tag RAM Stall (Atomic):
-            expr: (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
-              if (TCP_GATE_EN1_sum != 0) else None)
-        gfx941:
-          Stalled on L2 Data:
-            expr: (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
-              != 0) else None)
-          Stalled on L2 Req:
-            expr: (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
-              != 0) else None)
-          Tag RAM Stall (Read):
-            expr: (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
-              if (TCP_GATE_EN1_sum != 0) else None)
-          Tag RAM Stall (Write):
-            expr: (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
-              if (TCP_GATE_EN1_sum != 0) else None)
-          Tag RAM Stall (Atomic):
-            expr: (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
-              if (TCP_GATE_EN1_sum != 0) else None)
-        gfx940:
-          Stalled on L2 Data:
-            expr: (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
-              != 0) else None)
-          Stalled on L2 Req:
-            expr: (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
-              != 0) else None)
-          Tag RAM Stall (Read):
-            expr: (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
-              if (TCP_GATE_EN1_sum != 0) else None)
-          Tag RAM Stall (Write):
-            expr: (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
-              if (TCP_GATE_EN1_sum != 0) else None)
-          Tag RAM Stall (Atomic):
-            expr: (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
-              if (TCP_GATE_EN1_sum != 0) else None)
-        gfx942:
-          Stalled on L2 Data:
-            expr: (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
-              != 0) else None)
-          Stalled on L2 Req:
-            expr: (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
-              != 0) else None)
-          Tag RAM Stall (Read):
-            expr: (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
-              if (TCP_GATE_EN1_sum != 0) else None)
-          Tag RAM Stall (Write):
-            expr: (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
-              if (TCP_GATE_EN1_sum != 0) else None)
-          Tag RAM Stall (Atomic):
-            expr: (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
-              if (TCP_GATE_EN1_sum != 0) else None)
-        gfx950:
-          Stalled on L2 Data:
-            expr: (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
-              != 0) else None)
-          Stalled on L2 Req:
-            expr: (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
-              != 0) else None)
-          Stalled on Address:
-            expr: (((100 * TCP_TCP_TA_ADDR_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if
-              (TCP_GATE_EN1_sum != 0) else None)
-          Stalled on Data:
-            expr: (((100 * TCP_TCP_TA_DATA_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if
-              (TCP_GATE_EN1_sum != 0) else None)
-          Stalled on Latency FIFO:
-            expr: (((100 * TCP_LFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
-              != 0) else None)
-          Stalled on Request FIFO:
-            expr: (((100 * TCP_RFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
-              != 0) else None)
-          Stalled on Read Return:
-            expr: (((100 * TCP_TCR_RDRET_STALL_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
-              != 0) else None)
-          Tag RAM Stall (Read):
-            expr: (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
-              if (TCP_GATE_EN1_sum != 0) else None)
-          Tag RAM Stall (Write):
-            expr: (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
-              if (TCP_GATE_EN1_sum != 0) else None)
-          Tag RAM Stall (Atomic):
-            expr: (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
-              if (TCP_GATE_EN1_sum != 0) else None)
-        gfx908:
-          Stalled on L2 Data:
-            expr: (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
-              != 0) else None)
-          Stalled on L2 Req:
-            expr: (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
-              != 0) else None)
-          Tag RAM Stall (Read):
-            expr: (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
-              if (TCP_GATE_EN1_sum != 0) else None)
-          Tag RAM Stall (Write):
-            expr: (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
-              if (TCP_GATE_EN1_sum != 0) else None)
-          Tag RAM Stall (Atomic):
-            expr: (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum)
-              if (TCP_GATE_EN1_sum != 0) else None)
-      cli_style: simple_box
-      tui_style: simple_box
-  - metric_table:
-      id: 1603
-      title: vL1D cache access metrics
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      metric:
-        gfx90a:
-          Total Req:
-            avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom))
-            min: MIN((TCP_TOTAL_ACCESSES_sum / $denom))
-            max: MAX((TCP_TOTAL_ACCESSES_sum / $denom))
-            unit: (Req + $normUnit)
-          Read Req:
-            avg: AVG((TCP_TOTAL_READ_sum / $denom))
-            min: MIN((TCP_TOTAL_READ_sum / $denom))
-            max: MAX((TCP_TOTAL_READ_sum / $denom))
-            unit: (Req + $normUnit)
-          Write Req:
-            avg: AVG((TCP_TOTAL_WRITE_sum / $denom))
-            min: MIN((TCP_TOTAL_WRITE_sum / $denom))
-            max: MAX((TCP_TOTAL_WRITE_sum / $denom))
-            unit: (Req + $normUnit)
-          Atomic Req:
-            avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
-              / $denom))
-            min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
-              / $denom))
-            max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
-              / $denom))
-            unit: (Req + $normUnit)
-          Cache BW:
-            avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp)))
-            min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp)))
-            max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-          Cache Hit Rate:
-            avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-            min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-            max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-            unit: pct
-          Cache Accesses:
-            avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
-            min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
-            max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
-            unit: (Req + $normUnit)
-          Cache Hits:
-            avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / $denom))
-            min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / $denom))
-            max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / $denom))
-            unit: (Req + $normUnit)
-          Invalidations:
-            avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
-            min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
-            max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
-            unit: (Req + $normUnit)
-          L1-L2 BW:
-            avg: AVG(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
-              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp)))
-            min: MIN(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
-              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp)))
-            max: MAX(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
-              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-          L1-L2 Read:
-            avg: AVG((TCP_TCC_READ_REQ_sum / $denom))
-            min: MIN((TCP_TCC_READ_REQ_sum / $denom))
-            max: MAX((TCP_TCC_READ_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          L1-L2 Write:
-            avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom))
-            min: MIN((TCP_TCC_WRITE_REQ_sum / $denom))
-            max: MAX((TCP_TCC_WRITE_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          L1-L2 Atomic:
-            avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
-              / $denom))
-            min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
-              / $denom))
-            max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
-              / $denom))
-            unit: (Req + $normUnit)
-          L1 Access Latency:
-            avg: AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum
-              != 0) else None))
-            min: MIN(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum
-              != 0) else None))
-            max: MAX(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum
-              != 0) else None))
-            unit: Cycles
-          L1-L2 Read Latency:
-            avg: AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum))
-              if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else
-              None))
-            min: MIN(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum))
-              if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else
-              None))
-            max: MAX(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum))
-              if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else
-              None))
-            unit: Cycles
-          L1-L2 Write Latency:
-            avg: AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) !=
-              0) else None))
-            min: MIN(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) !=
-              0) else None))
-            max: MAX(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) !=
-              0) else None))
-            unit: Cycles
-        gfx941:
-          Total Req:
-            avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom))
-            min: MIN((TCP_TOTAL_ACCESSES_sum / $denom))
-            max: MAX((TCP_TOTAL_ACCESSES_sum / $denom))
-            unit: (Req + $normUnit)
-          Read Req:
-            avg: AVG((TCP_TOTAL_READ_sum / $denom))
-            min: MIN((TCP_TOTAL_READ_sum / $denom))
-            max: MAX((TCP_TOTAL_READ_sum / $denom))
-            unit: (Req + $normUnit)
-          Write Req:
-            avg: AVG((TCP_TOTAL_WRITE_sum / $denom))
-            min: MIN((TCP_TOTAL_WRITE_sum / $denom))
-            max: MAX((TCP_TOTAL_WRITE_sum / $denom))
-            unit: (Req + $normUnit)
-          Atomic Req:
-            avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
-              / $denom))
-            min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
-              / $denom))
-            max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
-              / $denom))
-            unit: (Req + $normUnit)
-          Cache BW:
-            avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
-            min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
-            max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-          Cache Hit Rate:
-            avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-            min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-            max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-            unit: pct
-          Cache Accesses:
-            avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
-            min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
-            max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
-            unit: (Req + $normUnit)
-          Cache Hits:
-            avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / $denom))
-            min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / $denom))
-            max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / $denom))
-            unit: (Req + $normUnit)
-          Invalidations:
-            avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
-            min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
-            max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
-            unit: (Req + $normUnit)
-          L1-L2 BW:
-            avg: AVG(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum +
-              TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / (End_Timestamp - Start_Timestamp)))
-            min: MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum +
-              TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / (End_Timestamp - Start_Timestamp)))
-            max: MAX(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum +
-              TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-          L1-L2 Read:
-            avg: AVG((TCP_TCC_READ_REQ_sum / $denom))
-            min: MIN((TCP_TCC_READ_REQ_sum / $denom))
-            max: MAX((TCP_TCC_READ_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          L1-L2 Write:
-            avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom))
-            min: MIN((TCP_TCC_WRITE_REQ_sum / $denom))
-            max: MAX((TCP_TCC_WRITE_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          L1-L2 Atomic:
-            avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
-              / $denom))
-            min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
-              / $denom))
-            max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
-              / $denom))
-            unit: (Req + $normUnit)
-        gfx940:
-          Total Req:
-            avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom))
-            min: MIN((TCP_TOTAL_ACCESSES_sum / $denom))
-            max: MAX((TCP_TOTAL_ACCESSES_sum / $denom))
-            unit: (Req + $normUnit)
-          Read Req:
-            avg: AVG((TCP_TOTAL_READ_sum / $denom))
-            min: MIN((TCP_TOTAL_READ_sum / $denom))
-            max: MAX((TCP_TOTAL_READ_sum / $denom))
-            unit: (Req + $normUnit)
-          Write Req:
-            avg: AVG((TCP_TOTAL_WRITE_sum / $denom))
-            min: MIN((TCP_TOTAL_WRITE_sum / $denom))
-            max: MAX((TCP_TOTAL_WRITE_sum / $denom))
-            unit: (Req + $normUnit)
-          Atomic Req:
-            avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
-              / $denom))
-            min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
-              / $denom))
-            max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
-              / $denom))
-            unit: (Req + $normUnit)
-          Cache BW:
-            avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
-            min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
-            max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-          Cache Hit Rate:
-            avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-            min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-            max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-            unit: pct
-          Cache Accesses:
-            avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
-            min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
-            max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
-            unit: (Req + $normUnit)
-          Cache Hits:
-            avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / $denom))
-            min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / $denom))
-            max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / $denom))
-            unit: (Req + $normUnit)
-          Invalidations:
-            avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
-            min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
-            max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
-            unit: (Req + $normUnit)
-          L1-L2 BW:
-            avg: AVG(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum +
-              TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / (End_Timestamp - Start_Timestamp)))
-            min: MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum +
-              TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / (End_Timestamp - Start_Timestamp)))
-            max: MAX(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum +
-              TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-          L1-L2 Read:
-            avg: AVG((TCP_TCC_READ_REQ_sum / $denom))
-            min: MIN((TCP_TCC_READ_REQ_sum / $denom))
-            max: MAX((TCP_TCC_READ_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          L1-L2 Write:
-            avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom))
-            min: MIN((TCP_TCC_WRITE_REQ_sum / $denom))
-            max: MAX((TCP_TCC_WRITE_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          L1-L2 Atomic:
-            avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
-              / $denom))
-            min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
-              / $denom))
-            max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
-              / $denom))
-            unit: (Req + $normUnit)
-        gfx942:
-          Total Req:
-            avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom))
-            min: MIN((TCP_TOTAL_ACCESSES_sum / $denom))
-            max: MAX((TCP_TOTAL_ACCESSES_sum / $denom))
-            unit: (Req + $normUnit)
-          Read Req:
-            avg: AVG((TCP_TOTAL_READ_sum / $denom))
-            min: MIN((TCP_TOTAL_READ_sum / $denom))
-            max: MAX((TCP_TOTAL_READ_sum / $denom))
-            unit: (Req + $normUnit)
-          Write Req:
-            avg: AVG((TCP_TOTAL_WRITE_sum / $denom))
-            min: MIN((TCP_TOTAL_WRITE_sum / $denom))
-            max: MAX((TCP_TOTAL_WRITE_sum / $denom))
-            unit: (Req + $normUnit)
-          Atomic Req:
-            avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
-              / $denom))
-            min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
-              / $denom))
-            max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
-              / $denom))
-            unit: (Req + $normUnit)
-          Cache BW:
-            avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
-            min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
-            max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-          Cache Hit Rate:
-            avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-            min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-            max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-            unit: pct
-          Cache Accesses:
-            avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
-            min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
-            max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
-            unit: (Req + $normUnit)
-          Cache Hits:
-            avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / $denom))
-            min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / $denom))
-            max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / $denom))
-            unit: (Req + $normUnit)
-          Invalidations:
-            avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
-            min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
-            max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
-            unit: (Req + $normUnit)
-          L1-L2 BW:
-            avg: AVG(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum +
-              TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / (End_Timestamp - Start_Timestamp)))
-            min: MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum +
-              TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / (End_Timestamp - Start_Timestamp)))
-            max: MAX(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum +
-              TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-          L1-L2 Read:
-            avg: AVG((TCP_TCC_READ_REQ_sum / $denom))
-            min: MIN((TCP_TCC_READ_REQ_sum / $denom))
-            max: MAX((TCP_TCC_READ_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          L1-L2 Write:
-            avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom))
-            min: MIN((TCP_TCC_WRITE_REQ_sum / $denom))
-            max: MAX((TCP_TCC_WRITE_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          L1-L2 Atomic:
-            avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
-              / $denom))
-            min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
-              / $denom))
-            max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
-              / $denom))
-            unit: (Req + $normUnit)
-        gfx950:
-          Total Req:
-            avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom))
-            min: MIN((TCP_TOTAL_ACCESSES_sum / $denom))
-            max: MAX((TCP_TOTAL_ACCESSES_sum / $denom))
-            unit: (Req + $normUnit)
-          Read Req:
-            avg: AVG((TCP_TOTAL_READ_sum / $denom))
-            min: MIN((TCP_TOTAL_READ_sum / $denom))
-            max: MAX((TCP_TOTAL_READ_sum / $denom))
-            unit: (Req + $normUnit)
-          Write Req:
-            avg: AVG((TCP_TOTAL_WRITE_sum / $denom))
-            min: MIN((TCP_TOTAL_WRITE_sum / $denom))
-            max: MAX((TCP_TOTAL_WRITE_sum / $denom))
-            unit: (Req + $normUnit)
-          Atomic Req:
-            avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
-              / $denom))
-            min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
-              / $denom))
-            max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
-              / $denom))
-            unit: (Req + $normUnit)
-          Cache BW:
-            avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
-            min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
-            max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-          Cache Hit Rate:
-            avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-            min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-            max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-            unit: pct
-          Cache Accesses:
-            avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
-            min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
-            max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
-            unit: (Req + $normUnit)
-          Cache Hits:
-            avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / $denom))
-            min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / $denom))
-            max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / $denom))
-            unit: (Req + $normUnit)
-          Invalidations:
-            avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
-            min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
-            max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
-            unit: (Req + $normUnit)
-          L1-L2 BW:
-            avg: AVG(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum +
-              TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / (End_Timestamp - Start_Timestamp)))
-            min: MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum +
-              TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / (End_Timestamp - Start_Timestamp)))
-            max: MAX(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum +
-              TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-          Tag RAM 0 Req:
-            avg: AVG((TCP_TAGRAM0_REQ_sum / $denom))
-            min: MIN((TCP_TAGRAM0_REQ_sum / $denom))
-            max: MAX((TCP_TAGRAM0_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          Tag RAM 1 Req:
-            avg: AVG((TCP_TAGRAM1_REQ_sum / $denom))
-            min: MIN((TCP_TAGRAM1_REQ_sum / $denom))
-            max: MAX((TCP_TAGRAM1_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          Tag RAM 2 Req:
-            avg: AVG((TCP_TAGRAM2_REQ_sum / $denom))
-            min: MIN((TCP_TAGRAM2_REQ_sum / $denom))
-            max: MAX((TCP_TAGRAM2_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          Tag RAM 3 Req:
-            avg: AVG((TCP_TAGRAM3_REQ_sum / $denom))
-            min: MIN((TCP_TAGRAM3_REQ_sum / $denom))
-            max: MAX((TCP_TAGRAM3_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          L1-L2 Read:
-            avg: AVG((TCP_TCC_READ_REQ_sum / $denom))
-            min: MIN((TCP_TCC_READ_REQ_sum / $denom))
-            max: MAX((TCP_TCC_READ_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          L1-L2 Write:
-            avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom))
-            min: MIN((TCP_TCC_WRITE_REQ_sum / $denom))
-            max: MAX((TCP_TCC_WRITE_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          L1-L2 Atomic:
-            avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
-              / $denom))
-            min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
-              / $denom))
-            max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
-              / $denom))
-            unit: (Req + $normUnit)
-          L1 Access Latency:
-            avg: AVG((TCP_TCP_LATENCY_sum / $denom))
-            min: MIN((TCP_TCP_LATENCY_sum / $denom))
-            max: MAX((TCP_TCP_LATENCY_sum / $denom))
-            unit: (Cycles + $normUnit)
-          L1-L2 Read Latency:
-            avg: AVG((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
-            min: MIN((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
-            max: MAX((TCP_TCC_READ_REQ_LATENCY_sum / $denom))
-            unit: (Cycles + $normUnit)
-          L1-L2 Write Latency:
-            avg: AVG((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
-            min: MIN((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
-            max: MAX((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom))
-            unit: (Cycles + $normUnit)
-        gfx908:
-          Total Req:
-            avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom))
-            min: MIN((TCP_TOTAL_ACCESSES_sum / $denom))
-            max: MAX((TCP_TOTAL_ACCESSES_sum / $denom))
-            unit: (Req + $normUnit)
-          Read Req:
-            avg: AVG((TCP_TOTAL_READ_sum / $denom))
-            min: MIN((TCP_TOTAL_READ_sum / $denom))
-            max: MAX((TCP_TOTAL_READ_sum / $denom))
-            unit: (Req + $normUnit)
-          Write Req:
-            avg: AVG((TCP_TOTAL_WRITE_sum / $denom))
-            min: MIN((TCP_TOTAL_WRITE_sum / $denom))
-            max: MAX((TCP_TOTAL_WRITE_sum / $denom))
-            unit: (Req + $normUnit)
-          Atomic Req:
-            avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
-              / $denom))
-            min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
-              / $denom))
-            max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum)
-              / $denom))
-            unit: (Req + $normUnit)
-          Cache BW:
-            avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp)))
-            min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp)))
-            max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-          Cache Hit Rate:
-            avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-            min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-            max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum !=
-              0) else None))
-            unit: pct
-          Cache Accesses:
-            avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
-            min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
-            max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom))
-            unit: (Req + $normUnit)
-          Cache Hits:
-            avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / $denom))
-            min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / $denom))
-            max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum)
-              + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              / $denom))
-            unit: (Req + $normUnit)
-          Invalidations:
-            avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
-            min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
-            max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom))
-            unit: (Req + $normUnit)
-          L1-L2 BW:
-            avg: AVG(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
-              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp)))
-            min: MIN(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
-              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp)))
-            max: MAX(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)
-              + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-          L1-L2 Read:
-            avg: AVG((TCP_TCC_READ_REQ_sum / $denom))
-            min: MIN((TCP_TCC_READ_REQ_sum / $denom))
-            max: MAX((TCP_TCC_READ_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          L1-L2 Write:
-            avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom))
-            min: MIN((TCP_TCC_WRITE_REQ_sum / $denom))
-            max: MAX((TCP_TCC_WRITE_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          L1-L2 Atomic:
-            avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
-              / $denom))
-            min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
-              / $denom))
-            max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)
-              / $denom))
-            unit: (Req + $normUnit)
-          L1 Access Latency:
-            avg: AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum
-              != 0) else None))
-            min: MIN(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum
-              != 0) else None))
-            max: MAX(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum
-              != 0) else None))
-            unit: Cycles
-          L1-L2 Read Latency:
-            avg: AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum))
-              if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else
-              None))
-            min: MIN(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum))
-              if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else
-              None))
-            max: MAX(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum))
-              if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else
-              None))
-            unit: Cycles
-          L1-L2 Write Latency:
-            avg: AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) !=
-              0) else None))
-            min: MIN(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) !=
-              0) else None))
-            max: MAX(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum))
-              if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) !=
-              0) else None))
-            unit: Cycles
-  - metric_table:
-      id: 1604
-      title: L1D - L2 Transactions
-      header:
-        metric: Metric
-        xfer: Xfer
-        coherency: Coherency
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      metric:
-        gfx90a:
-          NC - Read:
-            xfer: Read
-            coherency: NC
-            avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom))
-            min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom))
-            max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          UC - Read:
-            xfer: Read
-            coherency: UC
-            avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom))
-            min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom))
-            max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          CC - Read:
-            xfer: Read
-            coherency: CC
-            avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom))
-            min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom))
-            max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          RW - Read:
-            xfer: Read
-            coherency: RW
-            avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom))
-            min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom))
-            max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          RW - Write:
-            xfer: Write
-            coherency: RW
-            avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom))
-            min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom))
-            max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          NC - Write:
-            xfer: Write
-            coherency: NC
-            avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom))
-            min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom))
-            max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          UC - Write:
-            xfer: Write
-            coherency: UC
-            avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom))
-            min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom))
-            max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          CC - Write:
-            xfer: Write
-            coherency: CC
-            avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom))
-            min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom))
-            max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          NC - Atomic:
-            xfer: Atomic
-            coherency: NC
-            avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
-            min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
-            max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          UC - Atomic:
-            xfer: Atomic
-            coherency: UC
-            avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
-            min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
-            max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          CC - Atomic:
-            xfer: Atomic
-            coherency: CC
-            avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
-            min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
-            max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          RW - Atomic:
-            xfer: Atomic
-            coherency: RW
-            avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
-            min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
-            max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-        gfx941:
-          NC - Read:
-            xfer: Read
-            coherency: NC
-            avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom))
-            min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom))
-            max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          UC - Read:
-            xfer: Read
-            coherency: UC
-            avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom))
-            min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom))
-            max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          CC - Read:
-            xfer: Read
-            coherency: CC
-            avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom))
-            min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom))
-            max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          RW - Read:
-            xfer: Read
-            coherency: RW
-            avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom))
-            min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom))
-            max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          RW - Write:
-            xfer: Write
-            coherency: RW
-            avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom))
-            min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom))
-            max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          NC - Write:
-            xfer: Write
-            coherency: NC
-            avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom))
-            min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom))
-            max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          UC - Write:
-            xfer: Write
-            coherency: UC
-            avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom))
-            min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom))
-            max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          CC - Write:
-            xfer: Write
-            coherency: CC
-            avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom))
-            min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom))
-            max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          NC - Atomic:
-            xfer: Atomic
-            coherency: NC
-            avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
-            min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
-            max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          UC - Atomic:
-            xfer: Atomic
-            coherency: UC
-            avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
-            min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
-            max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          CC - Atomic:
-            xfer: Atomic
-            coherency: CC
-            avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
-            min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
-            max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          RW - Atomic:
-            xfer: Atomic
-            coherency: RW
-            avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
-            min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
-            max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-        gfx940:
-          NC - Read:
-            xfer: Read
-            coherency: NC
-            avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom))
-            min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom))
-            max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          UC - Read:
-            xfer: Read
-            coherency: UC
-            avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom))
-            min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom))
-            max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          CC - Read:
-            xfer: Read
-            coherency: CC
-            avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom))
-            min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom))
-            max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          RW - Read:
-            xfer: Read
-            coherency: RW
-            avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom))
-            min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom))
-            max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          RW - Write:
-            xfer: Write
-            coherency: RW
-            avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom))
-            min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom))
-            max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          NC - Write:
-            xfer: Write
-            coherency: NC
-            avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom))
-            min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom))
-            max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          UC - Write:
-            xfer: Write
-            coherency: UC
-            avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom))
-            min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom))
-            max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          CC - Write:
-            xfer: Write
-            coherency: CC
-            avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom))
-            min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom))
-            max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          NC - Atomic:
-            xfer: Atomic
-            coherency: NC
-            avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
-            min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
-            max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          UC - Atomic:
-            xfer: Atomic
-            coherency: UC
-            avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
-            min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
-            max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          CC - Atomic:
-            xfer: Atomic
-            coherency: CC
-            avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
-            min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
-            max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          RW - Atomic:
-            xfer: Atomic
-            coherency: RW
-            avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
-            min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
-            max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-        gfx942:
-          NC - Read:
-            xfer: Read
-            coherency: NC
-            avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom))
-            min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom))
-            max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          UC - Read:
-            xfer: Read
-            coherency: UC
-            avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom))
-            min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom))
-            max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          CC - Read:
-            xfer: Read
-            coherency: CC
-            avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom))
-            min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom))
-            max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          RW - Read:
-            xfer: Read
-            coherency: RW
-            avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom))
-            min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom))
-            max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          RW - Write:
-            xfer: Write
-            coherency: RW
-            avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom))
-            min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom))
-            max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          NC - Write:
-            xfer: Write
-            coherency: NC
-            avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom))
-            min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom))
-            max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          UC - Write:
-            xfer: Write
-            coherency: UC
-            avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom))
-            min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom))
-            max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          CC - Write:
-            xfer: Write
-            coherency: CC
-            avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom))
-            min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom))
-            max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          NC - Atomic:
-            xfer: Atomic
-            coherency: NC
-            avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
-            min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
-            max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          UC - Atomic:
-            xfer: Atomic
-            coherency: UC
-            avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
-            min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
-            max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          CC - Atomic:
-            xfer: Atomic
-            coherency: CC
-            avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
-            min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
-            max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          RW - Atomic:
-            xfer: Atomic
-            coherency: RW
-            avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
-            min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
-            max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-        gfx950:
-          NC - Read:
-            xfer: Read
-            coherency: NC
-            avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom))
-            min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom))
-            max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          UC - Read:
-            xfer: Read
-            coherency: UC
-            avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom))
-            min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom))
-            max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          CC - Read:
-            xfer: Read
-            coherency: CC
-            avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom))
-            min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom))
-            max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          RW - Read:
-            xfer: Read
-            coherency: RW
-            avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom))
-            min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom))
-            max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          RW - Write:
-            xfer: Write
-            coherency: RW
-            avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom))
-            min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom))
-            max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          NC - Write:
-            xfer: Write
-            coherency: NC
-            avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom))
-            min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom))
-            max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          UC - Write:
-            xfer: Write
-            coherency: UC
-            avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom))
-            min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom))
-            max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          CC - Write:
-            xfer: Write
-            coherency: CC
-            avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom))
-            min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom))
-            max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          NC - Atomic:
-            xfer: Atomic
-            coherency: NC
-            avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
-            min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
-            max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          UC - Atomic:
-            xfer: Atomic
-            coherency: UC
-            avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
-            min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
-            max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          CC - Atomic:
-            xfer: Atomic
-            coherency: CC
-            avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
-            min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
-            max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          RW - Atomic:
-            xfer: Atomic
-            coherency: RW
-            avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
-            min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
-            max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-        gfx908:
-          NC - Read:
-            xfer: Read
-            coherency: NC
-            avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom))
-            min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom))
-            max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          UC - Read:
-            xfer: Read
-            coherency: UC
-            avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom))
-            min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom))
-            max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          CC - Read:
-            xfer: Read
-            coherency: CC
-            avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom))
-            min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom))
-            max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          RW - Read:
-            xfer: Read
-            coherency: RW
-            avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom))
-            min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom))
-            max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          RW - Write:
-            xfer: Write
-            coherency: RW
-            avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom))
-            min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom))
-            max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          NC - Write:
-            xfer: Write
-            coherency: NC
-            avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom))
-            min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom))
-            max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          UC - Write:
-            xfer: Write
-            coherency: UC
-            avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom))
-            min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom))
-            max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          CC - Write:
-            xfer: Write
-            coherency: CC
-            avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom))
-            min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom))
-            max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          NC - Atomic:
-            xfer: Atomic
-            coherency: NC
-            avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
-            min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
-            max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          UC - Atomic:
-            xfer: Atomic
-            coherency: UC
-            avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
-            min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
-            max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          CC - Atomic:
-            xfer: Atomic
-            coherency: CC
-            avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
-            min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
-            max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          RW - Atomic:
-            xfer: Atomic
-            coherency: RW
-            avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
-            min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
-            max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-  - metric_table:
-      id: 1605
-      title: L1 Unified Translation Cache (UTCL1)
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        units: Unit
-      metric:
-        gfx90a:
-          Req:
-            avg: AVG((TCP_UTCL1_REQUEST_sum / $denom))
-            min: MIN((TCP_UTCL1_REQUEST_sum / $denom))
-            max: MAX((TCP_UTCL1_REQUEST_sum / $denom))
-            units: (Req + $normUnit)
-          Hit Ratio:
-            avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
-              if (TCP_UTCL1_REQUEST_sum != 0) else None))
-            min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
-              if (TCP_UTCL1_REQUEST_sum != 0) else None))
-            max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
-              if (TCP_UTCL1_REQUEST_sum != 0) else None))
-            units: pct
-          Hits:
-            avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
-            min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
-            max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
-            units: (Req + $normUnit)
-          Translation Misses:
-            avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
-            min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
-            max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
-            units: (Req + $normUnit)
-          Permission Misses:
-            avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
-            min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
-            max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
-            units: (Req + $normUnit)
-        gfx941:
-          Req:
-            avg: AVG((TCP_UTCL1_REQUEST_sum / $denom))
-            min: MIN((TCP_UTCL1_REQUEST_sum / $denom))
-            max: MAX((TCP_UTCL1_REQUEST_sum / $denom))
-            units: (Req + $normUnit)
-          Hit Ratio:
-            avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
-              if (TCP_UTCL1_REQUEST_sum != 0) else None))
-            min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
-              if (TCP_UTCL1_REQUEST_sum != 0) else None))
-            max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
-              if (TCP_UTCL1_REQUEST_sum != 0) else None))
-            units: pct
-          Hits:
-            avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
-            min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
-            max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
-            units: (Req + $normUnit)
-          Translation Misses:
-            avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
-            min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
-            max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
-            units: (Req + $normUnit)
-          Permission Misses:
-            avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
-            min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
-            max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
-            units: (Req + $normUnit)
-        gfx940:
-          Req:
-            avg: AVG((TCP_UTCL1_REQUEST_sum / $denom))
-            min: MIN((TCP_UTCL1_REQUEST_sum / $denom))
-            max: MAX((TCP_UTCL1_REQUEST_sum / $denom))
-            units: (Req + $normUnit)
-          Hit Ratio:
-            avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
-              if (TCP_UTCL1_REQUEST_sum != 0) else None))
-            min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
-              if (TCP_UTCL1_REQUEST_sum != 0) else None))
-            max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
-              if (TCP_UTCL1_REQUEST_sum != 0) else None))
-            units: pct
-          Hits:
-            avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
-            min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
-            max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
-            units: (Req + $normUnit)
-          Translation Misses:
-            avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
-            min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
-            max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
-            units: (Req + $normUnit)
-          Permission Misses:
-            avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
-            min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
-            max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
-            units: (Req + $normUnit)
-        gfx942:
-          Req:
-            avg: AVG((TCP_UTCL1_REQUEST_sum / $denom))
-            min: MIN((TCP_UTCL1_REQUEST_sum / $denom))
-            max: MAX((TCP_UTCL1_REQUEST_sum / $denom))
-            units: (Req + $normUnit)
-          Hit Ratio:
-            avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
-              if (TCP_UTCL1_REQUEST_sum != 0) else None))
-            min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
-              if (TCP_UTCL1_REQUEST_sum != 0) else None))
-            max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
-              if (TCP_UTCL1_REQUEST_sum != 0) else None))
-            units: pct
-          Hits:
-            avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
-            min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
-            max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
-            units: (Req + $normUnit)
-          Translation Misses:
-            avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
-            min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
-            max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
-            units: (Req + $normUnit)
-          Permission Misses:
-            avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
-            min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
-            max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
-            units: (Req + $normUnit)
-        gfx950:
-          Req:
-            avg: AVG((TCP_UTCL1_REQUEST_sum / $denom))
-            min: MIN((TCP_UTCL1_REQUEST_sum / $denom))
-            max: MAX((TCP_UTCL1_REQUEST_sum / $denom))
-            units: (Req + $normUnit)
-          Inflight Req:
-            avg: AVG((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom))
-            min: MIN((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom))
-            max: MAX((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom))
-            units: (Req + $normUnit)
-          Hit Ratio:
-            avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
-              if (TCP_UTCL1_REQUEST_sum != 0) else None))
-            min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
-              if (TCP_UTCL1_REQUEST_sum != 0) else None))
-            max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
-              if (TCP_UTCL1_REQUEST_sum != 0) else None))
-            units: pct
-          Hits:
-            avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
-            min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
-            max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
-            units: (Req + $normUnit)
-          Translation Misses:
-            avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
-            min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
-            max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
-            units: (Req + $normUnit)
-          Misses under Translation Miss:
-            avg: AVG((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom))
-            min: MIN((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom))
-            max: MAX((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom))
-            units: (Req + $normUnit)
-          Permission Misses:
-            avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
-            min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
-            max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
-            units: (Req + $normUnit)
-        gfx908:
-          Req:
-            avg: AVG((TCP_UTCL1_REQUEST_sum / $denom))
-            min: MIN((TCP_UTCL1_REQUEST_sum / $denom))
-            max: MAX((TCP_UTCL1_REQUEST_sum / $denom))
-            units: (Req + $normUnit)
-          Hit Ratio:
-            avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
-              if (TCP_UTCL1_REQUEST_sum != 0) else None))
-            min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
-              if (TCP_UTCL1_REQUEST_sum != 0) else None))
-            max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum)
-              if (TCP_UTCL1_REQUEST_sum != 0) else None))
-            units: pct
-          Hits:
-            avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
-            min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
-            max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom))
-            units: (Req + $normUnit)
-          Translation Misses:
-            avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
-            min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
-            max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom))
-            units: (Req + $normUnit)
-          Permission Misses:
-            avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
-            min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
-            max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom))
-            units: (Req + $normUnit)
-  - metric_table:
-      id: 1606
-      title: L1D Addr Translation Stalls
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        units: Unit
-      metric:
-        gfx90a: {}
-        gfx941: {}
-        gfx940: {}
-        gfx942: {}
-        gfx950:
-          Cache Full Stall:
-            avg: AVG((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom))
-            min: MIN((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom))
-            max: MAX((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom))
-            units: (Cycles + $normUnit)
-          Cache Miss Stall:
-            avg: AVG((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom))
-            min: MIN((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom))
-            max: MAX((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom))
-            units: (Cycles + $normUnit)
-          Serialization Stall:
-            avg: AVG((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom))
-            min: MIN((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom))
-            max: MAX((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom))
-            units: (Cycles + $normUnit)
-          Thrashing Stall:
-            avg: AVG((TCP_UTCL1_THRASHING_STALL_sum / $denom))
-            min: MIN((TCP_UTCL1_THRASHING_STALL_sum / $denom))
-            max: MAX((TCP_UTCL1_THRASHING_STALL_sum / $denom))
-            units: (Cycles + $normUnit)
-          Latency FIFO Stall:
-            avg: AVG((TCP_UTCL1_LFIFO_FULL_sum / $denom))
-            min: MIN((TCP_UTCL1_LFIFO_FULL_sum / $denom))
-            max: MAX((TCP_UTCL1_LFIFO_FULL_sum / $denom))
-            units: (Cycles + $normUnit)
-          Resident Page Full Stall:
-            avg: AVG((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom))
-            min: MIN((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom))
-            max: MAX((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom))
-            units: (Cycles + $normUnit)
-          UTCL2 Stall:
-            avg: AVG((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom))
-            min: MIN((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom))
-            max: MAX((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom))
-            units: (Cycles + $normUnit)
-        gfx908: {}
-  metrics_description:
-    Hit rate:
-      plain: The ratio of the number of vL1D cache line requests that hit in vL1D
-        cache over the total number of cache line requests to the vL1D Cache RAM.
-      rst: The ratio of the number of vL1D cache line requests that hit [#vl1d-hit]_ in
-        vL1D cache over the total number of cache line requests to the :ref:`vL1D
-        Cache RAM <desc-tc>`.
-      unit: Percent
-    Bandwidth Utilization:
-      plain: The number of bytes looked up in the vL1D cache as a result of VMEM instructions,
-        as a percent of the peak theoretical bandwidth achievable on the specific
-        accelerator. The number of bytes is calculated as the number of cache lines
-        requested multiplied by the cache line size. This value does not consider
-        partial requests, so for instance, if only a single value is requested in
-        a cache line, the data movement will still be counted as a full cache line.
-      rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
-        <desc-vmem>` instructions, as a percent of the peak theoretical bandwidth
-        achievable on the specific accelerator. The number of bytes is calculated
-        as the number of cache lines requested multiplied by the cache line size.
-        This value does not consider partial requests, so for instance, if only a
-        single value is requested in a cache line, the data movement will still be
-        counted as a full cache line.
-      unit: Percent
-    Utilization:
-      plain: Indicates how busy the vL1D Cache RAM was during the kernel execution.
-        The number of cycles where the vL1D Cache RAM is actively processing any request
-        divided by the number of cycles where the vL1D is active.
-      rst: Indicates how busy the :ref:`vL1D Cache RAM <desc-tc>` was during the kernel
-        execution. The number of cycles where the vL1D Cache RAM is actively processing
-        any request divided by the number of cycles where the vL1D is active [#vl1d-activity]_.
-      unit: Percent
-    Coalescing:
-      plain: Indicates how well memory instructions were coalesced by the address
-        processing unit, ranging from uncoalesced (25%) to fully coalesced (100%).
-        Calculated as the average number of thread-requests generated per instruction
-        divided by the ideal number of thread-requests per instruction.
-      rst: Indicates how well memory instructions were coalesced by the :ref:`address
-        processing unit <desc-ta>`, ranging from uncoalesced (25%) to fully coalesced
-        (100%). Calculated as the average number of :ref:`thread-requests <thread-requests>`
-        generated per instruction divided by the ideal number of thread-requests
-        per instruction.
-      unit: Percent
-    Stalled on L2 Data:
-      plain: The ratio of the number of cycles where the vL1D is stalled waiting for
-        requested data to return from the L2 cache divided by the number of cycles
-        where the vL1D is active.
-      rst: The ratio of the number of cycles where the vL1D is stalled waiting for requested
-        data to return from the :doc:`L2 cache <l2-cache>` divided by the number
-        of cycles where the vL1D is active [#vl1d-activity]_.
-      unit: Percent
-    Stalled on L2 Req:
-      plain: The ratio of the number of cycles where the vL1D is stalled waiting to
-        issue a request for data to the L2 cache divided by the number of cycles where
-        the vL1D is active.
-      rst: The ratio of the number of cycles where the vL1D is stalled waiting to issue
-        a request for data to the :doc:`L2 cache <l2-cache>` divided by the number
-        of cycles where the vL1D is active [#vl1d-activity]_.
-      unit: Percent
-    Tag RAM Stall (Read):
-      plain: The ratio of the number of cycles where the vL1D is stalled due to Read
-        requests with conflicting tags being looked up concurrently, divided by the
-        number of cycles where the vL1D is active.
-      rst: The ratio of the number of cycles where the vL1D is stalled due to Read
-        requests with conflicting tags being looked up concurrently, divided by the
-        number of cycles where the vL1D is active [#vl1d-activity]_.
-      unit: Percent
-    Tag RAM Stall (Write):
-      plain: The ratio of the number of cycles where the vL1D is stalled due to Write
-        requests with conflicting tags being looked up concurrently, divided by the
-        number of cycles where the vL1D is active.
-      rst: The ratio of the number of cycles where the vL1D is stalled due to Write
-        requests with conflicting tags being looked up concurrently, divided by the
-        number of cycles where the vL1D is active [#vl1d-activity]_.
-      unit: Percent
-    Tag RAM Stall (Atomic):
-      plain: The ratio of the number of cycles where the vL1D is stalled due to Atomic
-        requests with conflicting tags being looked up concurrently, divided by the
-        number of cycles where the vL1D is active.
-      rst: The ratio of the number of cycles where the vL1D is stalled due to Atomic
-        requests with conflicting tags being looked up concurrently, divided by the
-        number of cycles where the vL1D is active [#vl1d-activity]_.
-      unit: Percent
-    Total Req:
-      plain: The total number of incoming requests from the address processing unit
-        after coalescing.
-      rst: The total number of incoming requests from the :ref:`address processing
-        unit <desc-ta>` after coalescing.
-      unit: Requests
-    Read Req:
-      plain: The total number of incoming read requests from the address processing
-        unit after coalescing per normalization unit.
-      rst: The total number of incoming read requests from the :ref:`address processing
-        unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`
-      unit: Requests per normalization unit
-    Write Req:
-      plain: The total number of incoming write requests from the address processing
-        unit after coalescing per normalization unit.
-      rst: The total number of incoming write requests from the :ref:`address processing
-        unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`
-      unit: Requests per normalization unit
-    Atomic Req:
-      plain: The total number of incoming atomic requests from the address processing
-        unit after coalescing per normalization unit.
-      rst: The total number of incoming atomic requests from the :ref:`address processing
-        unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>`
-      unit: Requests per normalization unit
-    Cache BW:
-      plain: The number of bytes looked up in the vL1D cache as a result of VMEM instructions
-        divided by total duration. The number of bytes is calculated as the number of
-        cache lines requested multiplied by the cache line size. This value does
-        not consider partial requests, so for instance, if only a single value is
-        requested in a cache line, the data movement will still be counted as a full
-        cache line.
-      rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM
-        <desc-vmem>` instructions divided by total duration. The
-        number of bytes is calculated as the number of cache lines requested multiplied
-        by the cache line size. This value does not consider partial requests, so
-        for instance, if only a single value is requested in a cache line, the data movement
-        will still be counted as a full cache line.
-      unit: Gbps
-    Cache Hit Rate:
-      plain: The ratio of the number of vL1D cache line requests that hit in vL1D
-        cache over the total number of cache line requests to the vL1D Cache RAM.
-      rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache
-        over the total number of cache line requests to the :ref:`vL1D Cache RAM
-        <desc-tc>`.
-      unit: Percent
-    Cache Accesses:
-      plain: The total number of cache line lookups in the vL1D.
-      rst: The total number of cache line lookups in the vL1D.
-      unit: Cache lines
-    Cache Hits:
-      plain: The number of cache accesses minus the number of outgoing requests to
-        the L2 cache, that is, the number of cache line requests serviced by the vL1D
-        Cache RAM per normalization unit.
-      rst: The number of cache accesses minus the number of outgoing requests to the :doc:`L2
-        cache <l2-cache>`, that is, the number of cache line requests serviced by
-        the :ref:`vL1D Cache RAM <desc-tc>` per :ref:`normalization unit <normalization-units>`.
-      unit: Cache lines per normalization unit
-    Invalidations:
-      plain: The number of times the vL1D was issued a write-back invalidate command
-        during the kernel's execution per normalization unit. This may be triggered
-        by, for instance, the buffer_wbinvl1 instruction.
-      rst: The number of times the vL1D was issued a write-back invalidate command during
-        the kernel's execution per :ref:`normalization unit <normalization-units>`. This
-        may be triggered by, for instance, the ``buffer_wbinvl1`` instruction.
-      unit: Invalidations per normalization unit
-    L1-L2 BW:
-      plain: The number of bytes transferred across the vL1D-L2 interface as a result
-        of VMEM instructions, divided by total duration. The number of bytes is calculated
-        as the number of cache lines requested multiplied by the cache line size.
-        This value does not consider partial requests, so for instance, if only a
-        single value is requested in a cache line, the data movement will still be
-        counted as a full cache line.
-      rst: The number of bytes transferred across the vL1D-L2 interface as a result of
-        :ref:`VMEM <desc-vmem>` instructions, divided by total duration.
-        The number of bytes is calculated as the number of cache lines requested
-        multiplied by the cache line size. This value does not consider partial requests,
-        so for instance, if only a single value is requested in a cache line, the
-        data movement will still be counted as a full cache line.
-      unit: Gbps
-    L1-L2 Read:
-      plain: The number of read requests for a vL1D cache line that were not satisfied
-        by the vL1D and must be retrieved from the to the L2 Cache per normalization
-        unit.
-      rst: The number of read requests for a vL1D cache line that were not satisfied by
-        the vL1D and must be retrieved from the to the :doc:`L2 Cache <l2-cache>`
-        per :ref:`normalization unit <normalization-units>`.
-      unit: Requests per normalization unit
-    L1-L2 Write:
-      plain: The number of write requests to a vL1D cache line that were sent through
-        the vL1D to the L2 cache, per normalization unit.
-      rst: The number of write requests to a vL1D cache line that were sent through the
-        vL1D to the :doc:`L2 cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`.
-      unit: Requests per normalization unit
-    L1-L2 Atomic:
-      plain: The number of atomic requests that are sent through the vL1D to the L2
-        cache, per normalization unit. This includes requests for atomics with, and
-        without return.
-      rst: The number of atomic requests that are sent through the vL1D to the :doc:`L2
-        cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`. This
-        includes requests for atomics with, and without return.
-      unit: Requests per normalization unit
-    L1 Access Latency:
-      plain: Calculated as the average number of cycles that a vL1D cache line request
-        spent in the vL1D cache pipeline.
-      rst: Calculated as the average number of cycles that a vL1D cache line request
-        spent in the vL1D cache pipeline.
-      unit: Cycles
-    L1-L2 Read Latency:
-      plain: Calculated as the average number of cycles that the vL1D cache took to
-        issue and receive read requests from the L2 Cache. This number also includes
-        requests for atomics with return values.
-      rst: Calculated as the average number of cycles that the vL1D cache took to issue
-        and receive read requests from the :doc:`L2 Cache <l2-cache>`. This number
-        also includes requests for atomics with return values.
-      unit: Cycles
-    L1-L2 Write Latency:
-      plain: Calculated as the average number of cycles that the vL1D cache took to
-        issue and receive acknowledgement of a write request to the L2 Cache. This
-        number also includes requests for atomics without return values.
-      rst: Calculated as the average number of cycles that the vL1D cache took to issue
-        and receive acknowledgement of a write request to the :doc:`L2 Cache <l2-cache>`.
-        This number also includes requests for atomics without return values.
-      unit: Cycles
-    NC - Read:
-      plain: Total read requests with NC mtype from this TCP to all TCCs Sum over
-        TCP instances per normalization unit.
-      rst: Total read requests with NC mtype from this TCP to all TCCs Sum over TCP
-        instances per normalization unit.
-      unit: Requests per normalization unit
-    UC - Read:
-      plain: Total read requests with UC mtype from this TCP to all TCCs Sum over
-        TCP instances per normalization unit.
-      rst: Total read requests with UC mtype from this TCP to all TCCs Sum over TCP
-        instances per normalization unit.
-      unit: Requests per normalization unit
-    CC - Read:
-      plain: Total read requests with CC mtype from this TCP to all TCCs Sum over
-        TCP instances per normalization unit.
-      rst: Total read requests with CC mtype from this TCP to all TCCs Sum over TCP
-        instances per normalization unit.
-      unit: Requests per normalization unit
-    RW - Read:
-      plain: Total read requests with RW mtype from this TCP to all TCCs Sum over
-        TCP instances per normalization unit.
-      rst: Total read requests with RW mtype from this TCP to all TCCs Sum over
-        TCP instances per normalization unit.
-      unit: Requests per normalization unit
-    RW - Write:
-      plain: Total write requests with RW mtype from this TCP to all TCCs Sum over
-        TCP instances per normalization unit.
-      rst: Total write requests with RW mtype from this TCP to all TCCs Sum over TCP
-        instances per normalization unit.
-      unit: Requests per normalization unit
-    NC - Write:
-      plain: Total write requests with NC mtype from this TCP to all TCCs Sum over
-        TCP instances per normalization unit.
-      rst: Total write requests with NC mtype from this TCP to all TCCs Sum over TCP
-        instances per normalization unit.
-      unit: Requests per normalization unit
-    UC - Write:
-      plain: Total write requests with UC mtype from this TCP to all TCCs Sum over
-        TCP instances per normalization unit.
-      rst: Total write requests with UC mtype from this TCP to all TCCs Sum over TCP
-        instances per normalization unit.
-      unit: Requests per normalization unit
-    CC - Write:
-      plain: Total write requests with CC mtype from this TCP to all TCCs Sum over
-        TCP instances per normalization unit.
-      rst: Total write requests with CC mtype from this TCP to all TCCs Sum over TCP
-        instances per normalization unit.
-      unit: Requests per normalization unit
-    NC - Atomic:
-      plain: Total atomic requests with NC mtype from this TCP to all TCCs Sum over
-        TCP instances per normalization unit.
-      rst: Total atomic requests with NC mtype from this TCP to all TCCs Sum over
-        TCP instances per normalization unit.
-      unit: Requests per normalization unit
-    UC - Atomic:
-      plain: Total atomic requests with UC mtype from this TCP to all TCCs Sum over
-        TCP instances per normalization unit.
-      rst: Total atomic requests with UC mtype from this TCP to all TCCs Sum over
-        TCP instances per normalization unit.
-      unit: Requests per normalization unit
-    CC - Atomic:
-      plain: Total atomic requests with CC mtype from this TCP to all TCCs Sum over
-        TCP instances per normalization unit.
-      rst: Total atomic requests with CC mtype from this TCP to all TCCs Sum over
-        TCP instances per normalization unit.
-      unit: Requests per normalization unit
-    RW - Atomic:
-      plain: Total atomic requests with RW mtype from this TCP to all TCCs Sum over
-        TCP instances per normalization unit.
-      rst: Total atomic requests with RW mtype from this TCP to all TCCs Sum over
-        TCP instances per normalization unit.
-      unit: Requests per normalization unit
-    Req:
-      plain: The number of translation requests made to the UTCL1 per normalization
-        unit.
-      rst: The number of translation requests made to the UTCL1 per normalization
-        unit.
-      unit: Requests per normalization unit
-    Hit Ratio:
-      plain: The ratio of the number of translation requests that hit in the UTCL1
-        divided by the total number of translation requests made to the UTCL1.
-      rst: The ratio of the number of translation requests that hit in the UTCL1 divided
-        by the total number of translation requests made to the UTCL1.
-      unit: Percent
-    Hits:
-      plain: The number of translation requests that hit in the UTCL1, and could be
-        reused, per normalization unit.
-      rst: The number of translation requests that hit in the UTCL1, and could be
-        reused, per normalization unit.
-      unit: Requests per normalization unit
-    Translation Misses:
-      plain: The total number of translation requests that missed in the UTCL1 due
-        to translation not being present in the cache, per normalization unit.
-      rst: The total number of translation requests that missed in the UTCL1 due to translation
-        not being present in the cache, per :ref:`normalization unit <normalization-units>`.
-      unit: unit
-    Permission Misses:
-      plain: |-
-        The total number of translation requests that missed in the UTCL1 due
-        to a permission error, per normalization unit. This is unused and expected
-        to be zero in most configurations for modern CDNA\u2122 accelerators.
-      rst: |-
-        The total number of translation requests that missed in the UTCL1 due
-        to a permission error, per :ref:`normalization unit <normalization-units>`.
-        This is unused and expected to be zero in most configurations for modern
-        CDNA\u2122 accelerators.
-      unit: Requests per normalization unit
-- id: 1700
-  title: L2 Cache
-  data source:
-  - metric_table:
-      id: 1701
-      title: L2 Speed-of-Light
-      header:
-        metric: Metric
-        value: Avg
-        unit: Unit
-      metric:
-        gfx90a:
-          Utilization:
-            value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD)))
-            unit: pct
-          Peak Bandwidth:
-            value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
-              / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
-            unit: pct
-          Hit Rate:
-            value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else 0))
-            unit: pct
-          L2-Fabric Read BW:
-            value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-          L2-Fabric Write and Atomic BW:
-            value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum)
-              * 32)) / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-          HBM Bandwidth:
-            value: $hbmBandwidth
-            unit: GB/s
-        gfx941:
-          Utilization:
-            value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD)))
-            unit: pct
-          Peak Bandwidth:
-            value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
-              / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
-            unit: pct
-          Hit Rate:
-            value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else 0))
-            unit: pct
-          L2-Fabric Read BW:
-            value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-          L2-Fabric Write and Atomic BW:
-            value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
-              * 32)) / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-          HBM Bandwidth:
-            value: $hbmBandwidth
-            unit: GB/s
-        gfx940:
-          Utilization:
-            value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD)))
-            unit: pct
-          Peak Bandwidth:
-            value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
-              / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
-            unit: pct
-          Hit Rate:
-            value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else 0))
-            unit: pct
-          L2-Fabric Read BW:
-            value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-          L2-Fabric Write and Atomic BW:
-            value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
-              * 32)) / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-          HBM Bandwidth:
-            value: $hbmBandwidth
-            unit: GB/s
-        gfx942:
-          Utilization:
-            value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD)))
-            unit: pct
-          Peak Bandwidth:
-            value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
-              / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
-            unit: pct
-          Hit Rate:
-            value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else 0))
-            unit: pct
-          L2-Fabric Read BW:
-            value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
-              - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
-              - Start_Timestamp))
-            unit: GB/s
-          L2-Fabric Write and Atomic BW:
-            value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
-              * 32)) / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-          HBM Bandwidth:
-            value: $hbmBandwidth
-            unit: GB/s
-        gfx950:
-          Utilization:
-            value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD)))
-            unit: pct
-          Peak Bandwidth:
-            value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
-              / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
-            unit: pct
-          Hit Rate:
-            value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else 0))
-            unit: pct
-          L2-Fabric Read BW:
-            value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64)
-              + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-          L2-Fabric Write and Atomic BW:
-            value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
-              * 32)) / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-          HBM Bandwidth:
-            value: $hbmBandwidth
-            unit: GB/s
-        gfx908:
-          Utilization:
-            value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD)))
-            unit: pct
-          Peak Bandwidth:
-            value: ((100 * AVG(((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp))))
-              / ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan)))
-            unit: pct
-          Hit Rate:
-            value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else 0))
-            unit: pct
-          L2-Fabric Read BW:
-            value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-          L2-Fabric Write and Atomic BW:
-            value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
-              * 32)) / (End_Timestamp - Start_Timestamp)))
-            unit: GB/s
-          HBM Bandwidth:
-            value: $hbmBandwidth
-            unit: GB/s
-  - metric_table:
-      id: 1702
-      title: L2-Fabric interface metrics
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      metric:
-        gfx90a:
-          Read BW:
-            avg: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            min: MIN((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            max: MAX((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-          HBM Read Traffic:
-            avg: AVG((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
-              != 0) else None))
-            min: MIN((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
-              != 0) else None))
-            max: MAX((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
-              != 0) else None))
-            unit: pct
-          Remote Read Traffic:
-            avg: AVG((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum)
-              if (TCC_EA_RDREQ_sum != 0) else None))
-            min: MIN((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum)
-              if (TCC_EA_RDREQ_sum != 0) else None))
-            max: MAX((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum)
-              if (TCC_EA_RDREQ_sum != 0) else None))
-            unit: pct
-          Uncached Read Traffic:
-            avg: AVG((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
-              != 0) else None))
-            min: MIN((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
-              != 0) else None))
-            max: MAX((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
-              != 0) else None))
-            unit: pct
-          Write and Atomic BW:
-            avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
-              * 32)) / $denom))
-            min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
-              * 32)) / $denom))
-            max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
-              * 32)) / $denom))
-            unit: (Bytes + $normUnit)
-          HBM Write and Atomic Traffic:
-            avg: AVG((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
-              != 0) else None))
-            min: MIN((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
-              != 0) else None))
-            max: MAX((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
-              != 0) else None))
-            unit: pct
-          Remote Write and Atomic Traffic:
-            avg: AVG((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum)
-              if (TCC_EA_WRREQ_sum != 0) else None))
-            min: MIN((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum)
-              if (TCC_EA_WRREQ_sum != 0) else None))
-            max: MAX((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum)
-              if (TCC_EA_WRREQ_sum != 0) else None))
-            unit: pct
-          Atomic Traffic:
-            avg: AVG((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
-              != 0) else None))
-            min: MIN((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
-              != 0) else None))
-            max: MAX((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
-              != 0) else None))
-            unit: pct
-          Uncached Write and Atomic Traffic:
-            avg: AVG((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
-              != 0) else None))
-            min: MIN((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
-              != 0) else None))
-            max: MAX((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
-              != 0) else None))
-            unit: pct
-          Read Latency:
-            avg: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
-              != 0) else None))
-            min: MIN(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
-              != 0) else None))
-            max: MAX(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum
-              != 0) else None))
-            unit: Cycles
-          Write and Atomic Latency:
-            avg: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
-              != 0) else None))
-            min: MIN(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
-              != 0) else None))
-            max: MAX(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum
-              != 0) else None))
-            unit: Cycles
-          Atomic Latency:
-            avg: AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum
-              != 0) else None))
-            min: MIN(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum
-              != 0) else None))
-            max: MAX(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum
-              != 0) else None))
-            unit: Cycles
-        gfx941:
-          Read BW:
-            avg: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            min: MIN((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            max: MAX((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-          HBM Read Traffic:
-            avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            unit: pct
-          Remote Read Traffic:
-            avg: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) /
-              TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
-            min: MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) /
-              TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
-            max: MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) /
-              TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
-            unit: pct
-          Uncached Read Traffic:
-            avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            unit: pct
-          Write and Atomic BW:
-            avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
-              * 32)) / (End_Timestamp - Start_Timestamp)))
-            min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
-              * 32)) / (End_Timestamp - Start_Timestamp)))
-            max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
-              * 32)) / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-          HBM Write and Atomic Traffic:
-            avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            unit: pct
-          Remote Write and Atomic Traffic:
-            avg: AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) /
-              TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
-            min: MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) /
-              TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
-            max: MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) /
-              TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
-            unit: pct
-          Atomic Traffic:
-            avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            unit: pct
-          Uncached Write and Atomic Traffic:
-            avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            unit: pct
-          Read Latency:
-            avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            unit: Cycles
-          Write and Atomic Latency:
-            avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            min: MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            unit: Cycles
-          Atomic Latency:
-            avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
-              != 0) else None))
-            min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
-              != 0) else None))
-            max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
-              != 0) else None))
-            unit: Cycles
-        gfx940:
-          Read BW:
-            avg: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            min: MIN((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            max: MAX((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
-              * 64)) / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-          HBM Read Traffic:
-            avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            unit: pct
-          Remote Read Traffic:
-            avg: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0)
-              / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
-            min: MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0)
-              / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
-            max: MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0)
-              / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
-            unit: pct
-          Uncached Read Traffic:
-            avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            unit: pct
-          Write and Atomic BW:
-            avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
-              * 32)) / (End_Timestamp - Start_Timestamp)))
-            min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
-              * 32)) / (End_Timestamp - Start_Timestamp)))
-            max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
-              * 32)) / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-          HBM Write and Atomic Traffic:
-            avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            unit: pct
-          Remote Write and Atomic Traffic:
-            avg: AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) /
-              TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
-            min: MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) /
-              TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
-            max: MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) /
-              TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
-            unit: pct
-          Atomic Traffic:
-            avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            unit: pct
-          Uncached Write and Atomic Traffic:
-            avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            unit: pct
-          Read Latency:
-            avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            unit: Cycles
-          Write and Atomic Latency:
-            avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            min: MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            unit: Cycles
-          Atomic Latency:
-            avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
-              != 0) else None))
-            min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
-              != 0) else None))
-            max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
-              != 0) else None))
-            unit: Cycles
-        gfx942:
-          Read BW:
-            avg: AVG(((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
-              - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))
-            min: MIN(((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
-              - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))
-            max: MAX(((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
-              - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-          HBM Read Traffic:
-            avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            unit: pct
-          Remote Read Traffic:
-            avg: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) /
-              TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
-            min: MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) /
-              TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
-            max: MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) /
-              TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
-            unit: pct
-          Uncached Read Traffic:
-            avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            unit: pct
-          Write and Atomic BW:
-            avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
-              * 32)) / (End_Timestamp - Start_Timestamp)))
-            min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
-              * 32)) / (End_Timestamp - Start_Timestamp)))
-            max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
-              * 32)) / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-          HBM Write and Atomic Traffic:
-            avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            unit: pct
-          Remote Write and Atomic Traffic:
-            avg: AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) /
-              TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
-            min: MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) /
-              TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
-            max: MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) /
-              TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
-            unit: pct
-          Atomic Traffic:
-            avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            unit: pct
-          Uncached Write and Atomic Traffic:
-            avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            unit: pct
-          Read Latency:
-            avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            unit: Cycles
-          Write and Atomic Latency:
-            avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            min: MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            unit: Cycles
-          Atomic Latency:
-            avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
-              != 0) else None))
-            min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
-              != 0) else None))
-            max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
-              != 0) else None))
-            unit: Cycles
-        gfx950:
-          Read BW:
-            avg: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64)
-              + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
-            min: MIN((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64)
-              + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
-            max: MAX((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64)
-              + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-          HBM Read Traffic:
-            avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            unit: pct
-          Remote Read Traffic:
-            avg: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0)
-              / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
-            min: MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0)
-              / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
-            max: MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0)
-              / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None))
-            unit: pct
-          Uncached Read Traffic:
-            avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            unit: pct
-          Write and Atomic BW:
-            avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
-              * 32)) / (End_Timestamp - Start_Timestamp)))
-            min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
-              * 32)) / (End_Timestamp - Start_Timestamp)))
-            max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
-              * 32)) / (End_Timestamp - Start_Timestamp)))
-            unit: Gbps
-          HBM Write and Atomic Traffic:
-            avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            unit: pct
-          Remote Write and Atomic Traffic:
-            avg: AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) /
-              TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
-            min: MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) /
-              TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
-            max: MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) /
-              TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None))
-            unit: pct
-          Atomic Traffic:
-            avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            unit: pct
-          Uncached Write and Atomic Traffic:
-            avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            unit: pct
-          Read Latency:
-            avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            unit: Cycles
-          Write and Atomic Latency:
-            avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            min: MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            unit: Cycles
-          Atomic Latency:
-            avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
-              != 0) else None))
-            min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
-              != 0) else None))
-            max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
-              != 0) else None))
-            unit: Cycles
-          Read Stall:
-            avg: AVG((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum)
-              + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum
-              != 0) else None))
-            min: MIN((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum)
-              + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum
-              != 0) else None))
-            max: MAX((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum)
-              + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum
-              != 0) else None))
-            unit: pct
-          Write Stall:
-            avg: AVG(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum
-              != 0) else None))
-            min: MIN(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum
-              != 0) else None))
-            max: MAX(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum
-              != 0) else None))
-            unit: pct
-        gfx908:
-          Read BW:
-            avg: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
-              * 64)) / $denom))
-            min: MIN((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
-              * 64)) / $denom))
-            max: MAX((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum)
-              * 64)) / $denom))
-            unit: (Bytes + $normUnit)
-          HBM Read Traffic:
-            avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            unit: pct
-          Remote Read Traffic:
-            avg: AVG((100 * ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum) / TCC_EA0_RDREQ_sum)
-              if (TCC_EA0_RDREQ_sum != 0) else None))
-            min: MIN((100 * ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum) / TCC_EA0_RDREQ_sum)
-              if (TCC_EA0_RDREQ_sum != 0) else None))
-            max: MAX((100 * ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum) / TCC_EA0_RDREQ_sum)
-              if (TCC_EA0_RDREQ_sum != 0) else None))
-            unit: pct
-          Uncached Read Traffic:
-            avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            unit: pct
-          Write and Atomic BW:
-            avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
-              * 32)) / $denom))
-            min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
-              * 32)) / $denom))
-            max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
-              * 32)) / $denom))
-            unit: (Bytes + $normUnit)
-          HBM Write and Atomic Traffic:
-            avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            unit: pct
-          Remote Write and Atomic Traffic:
-            avg: AVG((100 * ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum) / TCC_EA0_WRREQ_sum)
-              if (TCC_EA0_WRREQ_sum != 0) else None))
-            min: MIN((100 * ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum) / TCC_EA0_WRREQ_sum)
-              if (TCC_EA0_WRREQ_sum != 0) else None))
-            max: MAX((100 * ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum) / TCC_EA0_WRREQ_sum)
-              if (TCC_EA0_WRREQ_sum != 0) else None))
-            unit: pct
-          Atomic Traffic:
-            avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            unit: pct
-          Uncached Write and Atomic Traffic:
-            avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            unit: pct
-          Read Latency:
-            avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum
-              != 0) else None))
-            unit: Cycles
-          Write and Atomic Latency:
-            avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            min: MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum
-              != 0) else None))
-            unit: Cycles
-          Atomic Latency:
-            avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
-              != 0) else None))
-            min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
-              != 0) else None))
-            max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum
-              != 0) else None))
-            unit: Cycles
-  - metric_table:
-      id: 1703
-      title: L2 Cache Accesses
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      metric:
-        gfx90a:
-          Bandwidth:
-            avg: AVG((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
-            min: MIN((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
-            max: MAX((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
-            unit: Gbps
-          Req:
-            avg: AVG((TCC_REQ_sum / $denom))
-            min: MIN((TCC_REQ_sum / $denom))
-            max: MAX((TCC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          Read Req:
-            avg: AVG((TCC_READ_sum / $denom))
-            min: MIN((TCC_READ_sum / $denom))
-            max: MAX((TCC_READ_sum / $denom))
-            unit: (Req + $normUnit)
-          Write Req:
-            avg: AVG((TCC_WRITE_sum / $denom))
-            min: MIN((TCC_WRITE_sum / $denom))
-            max: MAX((TCC_WRITE_sum / $denom))
-            unit: (Req + $normUnit)
-          Atomic Req:
-            avg: AVG((TCC_ATOMIC_sum / $denom))
-            min: MIN((TCC_ATOMIC_sum / $denom))
-            max: MAX((TCC_ATOMIC_sum / $denom))
-            unit: (Req + $normUnit)
-          Streaming Req:
-            avg: AVG((TCC_STREAMING_REQ_sum / $denom))
-            min: MIN((TCC_STREAMING_REQ_sum / $denom))
-            max: MAX((TCC_STREAMING_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          Probe Req:
-            avg: AVG((TCC_PROBE_sum / $denom))
-            min: MIN((TCC_PROBE_sum / $denom))
-            max: MAX((TCC_PROBE_sum / $denom))
-            unit: (Req + $normUnit)
-          Cache Hit:
-            avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else None))
-            min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else None))
-            max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else None))
-            unit: pct
-          Hits:
-            avg: AVG((TCC_HIT_sum / $denom))
-            min: MIN((TCC_HIT_sum / $denom))
-            max: MAX((TCC_HIT_sum / $denom))
-            unit: (Hits + $normUnit)
-          Misses:
-            avg: AVG((TCC_MISS_sum / $denom))
-            min: MIN((TCC_MISS_sum / $denom))
-            max: MAX((TCC_MISS_sum / $denom))
-            unit: (Misses + $normUnit)
-          Writeback:
-            avg: AVG((TCC_WRITEBACK_sum / $denom))
-            min: MIN((TCC_WRITEBACK_sum / $denom))
-            max: MAX((TCC_WRITEBACK_sum / $denom))
-            unit: (Cachelines + $normUnit)
-          Writeback (Internal):
-            avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom))
-            min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom))
-            max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom))
-            unit: (Cachelines + $normUnit)
-          Writeback (vL1D Req):
-            avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
-            min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
-            max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
-            unit: (Cachelines + $normUnit)
-          Evict (Internal):
-            avg: AVG((TCC_NORMAL_EVICT_sum / $denom))
-            min: MIN((TCC_NORMAL_EVICT_sum / $denom))
-            max: MAX((TCC_NORMAL_EVICT_sum / $denom))
-            unit: (Cachelines + $normUnit)
-          Evict (vL1D Req):
-            avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
-            min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
-            max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
-            unit: (Cachelines + $normUnit)
-          NC Req:
-            avg: AVG((TCC_NC_REQ_sum / $denom))
-            min: MIN((TCC_NC_REQ_sum / $denom))
-            max: MAX((TCC_NC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          UC Req:
-            avg: AVG((TCC_UC_REQ_sum / $denom))
-            min: MIN((TCC_UC_REQ_sum / $denom))
-            max: MAX((TCC_UC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          CC Req:
-            avg: AVG((TCC_CC_REQ_sum / $denom))
-            min: MIN((TCC_CC_REQ_sum / $denom))
-            max: MAX((TCC_CC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          RW Req:
-            avg: AVG((TCC_RW_REQ_sum / $denom))
-            min: MIN((TCC_RW_REQ_sum / $denom))
-            max: MAX((TCC_RW_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-        gfx941:
-          Bandwidth:
-            avg: AVG((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
-            min: MIN((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
-            max: MAX((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
-            unit: Gbps
-          Req:
-            avg: AVG((TCC_REQ_sum / $denom))
-            min: MIN((TCC_REQ_sum / $denom))
-            max: MAX((TCC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          Read Req:
-            avg: AVG((TCC_READ_sum / $denom))
-            min: MIN((TCC_READ_sum / $denom))
-            max: MAX((TCC_READ_sum / $denom))
-            unit: (Req + $normUnit)
-          Write Req:
-            avg: AVG((TCC_WRITE_sum / $denom))
-            min: MIN((TCC_WRITE_sum / $denom))
-            max: MAX((TCC_WRITE_sum / $denom))
-            unit: (Req + $normUnit)
-          Atomic Req:
-            avg: AVG((TCC_ATOMIC_sum / $denom))
-            min: MIN((TCC_ATOMIC_sum / $denom))
-            max: MAX((TCC_ATOMIC_sum / $denom))
-            unit: (Req + $normUnit)
-          Streaming Req:
-            avg: AVG((TCC_STREAMING_REQ_sum / $denom))
-            min: MIN((TCC_STREAMING_REQ_sum / $denom))
-            max: MAX((TCC_STREAMING_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          Probe Req:
-            avg: AVG((TCC_PROBE_sum / $denom))
-            min: MIN((TCC_PROBE_sum / $denom))
-            max: MAX((TCC_PROBE_sum / $denom))
-            unit: (Req + $normUnit)
-          Cache Hit:
-            avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else None))
-            min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else None))
-            max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else None))
-            unit: pct
-          Hits:
-            avg: AVG((TCC_HIT_sum / $denom))
-            min: MIN((TCC_HIT_sum / $denom))
-            max: MAX((TCC_HIT_sum / $denom))
-            unit: (Hits + $normUnit)
-          Misses:
-            avg: AVG((TCC_MISS_sum / $denom))
-            min: MIN((TCC_MISS_sum / $denom))
-            max: MAX((TCC_MISS_sum / $denom))
-            unit: (Misses + $normUnit)
-          Writeback:
-            avg: AVG((TCC_WRITEBACK_sum / $denom))
-            min: MIN((TCC_WRITEBACK_sum / $denom))
-            max: MAX((TCC_WRITEBACK_sum / $denom))
-            unit: (Cachelines + $normUnit)
-          Writeback (Internal):
-            avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom))
-            min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom))
-            max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom))
-            unit: (Cachelines + $normUnit)
-          Writeback (vL1D Req):
-            avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
-            min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
-            max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
-            unit: (Cachelines + $normUnit)
-          Evict (Internal):
-            avg: AVG((TCC_NORMAL_EVICT_sum / $denom))
-            min: MIN((TCC_NORMAL_EVICT_sum / $denom))
-            max: MAX((TCC_NORMAL_EVICT_sum / $denom))
-            unit: (Cachelines + $normUnit)
-          Evict (vL1D Req):
-            avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
-            min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
-            max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
-            unit: (Cachelines + $normUnit)
-          NC Req:
-            avg: AVG((TCC_NC_REQ_sum / $denom))
-            min: MIN((TCC_NC_REQ_sum / $denom))
-            max: MAX((TCC_NC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          UC Req:
-            avg: AVG((TCC_UC_REQ_sum / $denom))
-            min: MIN((TCC_UC_REQ_sum / $denom))
-            max: MAX((TCC_UC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          CC Req:
-            avg: AVG((TCC_CC_REQ_sum / $denom))
-            min: MIN((TCC_CC_REQ_sum / $denom))
-            max: MAX((TCC_CC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          RW Req:
-            avg: AVG((TCC_RW_REQ_sum / $denom))
-            min: MIN((TCC_RW_REQ_sum / $denom))
-            max: MAX((TCC_RW_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-        gfx940:
-          Bandwidth:
-            avg: AVG((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
-            min: MIN((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
-            max: MAX((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
-            unit: Gbps
-          Req:
-            avg: AVG((TCC_REQ_sum / $denom))
-            min: MIN((TCC_REQ_sum / $denom))
-            max: MAX((TCC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          Read Req:
-            avg: AVG((TCC_READ_sum / $denom))
-            min: MIN((TCC_READ_sum / $denom))
-            max: MAX((TCC_READ_sum / $denom))
-            unit: (Req + $normUnit)
-          Write Req:
-            avg: AVG((TCC_WRITE_sum / $denom))
-            min: MIN((TCC_WRITE_sum / $denom))
-            max: MAX((TCC_WRITE_sum / $denom))
-            unit: (Req + $normUnit)
-          Atomic Req:
-            avg: AVG((TCC_ATOMIC_sum / $denom))
-            min: MIN((TCC_ATOMIC_sum / $denom))
-            max: MAX((TCC_ATOMIC_sum / $denom))
-            unit: (Req + $normUnit)
-          Streaming Req:
-            avg: AVG((TCC_STREAMING_REQ_sum / $denom))
-            min: MIN((TCC_STREAMING_REQ_sum / $denom))
-            max: MAX((TCC_STREAMING_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          Probe Req:
-            avg: AVG((TCC_PROBE_sum / $denom))
-            min: MIN((TCC_PROBE_sum / $denom))
-            max: MAX((TCC_PROBE_sum / $denom))
-            unit: (Req + $normUnit)
-          Cache Hit:
-            avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else None))
-            min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else None))
-            max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else None))
-            unit: pct
-          Hits:
-            avg: AVG((TCC_HIT_sum / $denom))
-            min: MIN((TCC_HIT_sum / $denom))
-            max: MAX((TCC_HIT_sum / $denom))
-            unit: (Hits + $normUnit)
-          Misses:
-            avg: AVG((TCC_MISS_sum / $denom))
-            min: MIN((TCC_MISS_sum / $denom))
-            max: MAX((TCC_MISS_sum / $denom))
-            unit: (Misses + $normUnit)
-          Writeback:
-            avg: AVG((TCC_WRITEBACK_sum / $denom))
-            min: MIN((TCC_WRITEBACK_sum / $denom))
-            max: MAX((TCC_WRITEBACK_sum / $denom))
-            unit: (Cachelines + $normUnit)
-          Writeback (Internal):
-            avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom))
-            min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom))
-            max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom))
-            unit: (Cachelines + $normUnit)
-          Writeback (vL1D Req):
-            avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
-            min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
-            max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
-            unit: (Cachelines + $normUnit)
-          Evict (Internal):
-            avg: AVG((TCC_NORMAL_EVICT_sum / $denom))
-            min: MIN((TCC_NORMAL_EVICT_sum / $denom))
-            max: MAX((TCC_NORMAL_EVICT_sum / $denom))
-            unit: (Cachelines + $normUnit)
-          Evict (vL1D Req):
-            avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
-            min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
-            max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
-            unit: (Cachelines + $normUnit)
-          NC Req:
-            avg: AVG((TCC_NC_REQ_sum / $denom))
-            min: MIN((TCC_NC_REQ_sum / $denom))
-            max: MAX((TCC_NC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          UC Req:
-            avg: AVG((TCC_UC_REQ_sum / $denom))
-            min: MIN((TCC_UC_REQ_sum / $denom))
-            max: MAX((TCC_UC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          CC Req:
-            avg: AVG((TCC_CC_REQ_sum / $denom))
-            min: MIN((TCC_CC_REQ_sum / $denom))
-            max: MAX((TCC_CC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          RW Req:
-            avg: AVG((TCC_RW_REQ_sum / $denom))
-            min: MIN((TCC_RW_REQ_sum / $denom))
-            max: MAX((TCC_RW_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-        gfx942:
-          Bandwidth:
-            avg: AVG((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
-            min: MIN((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
-            max: MAX((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
-            unit: Gbps
-          Req:
-            avg: AVG((TCC_REQ_sum / $denom))
-            min: MIN((TCC_REQ_sum / $denom))
-            max: MAX((TCC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          Read Req:
-            avg: AVG((TCC_READ_sum / $denom))
-            min: MIN((TCC_READ_sum / $denom))
-            max: MAX((TCC_READ_sum / $denom))
-            unit: (Req + $normUnit)
-          Write Req:
-            avg: AVG((TCC_WRITE_sum / $denom))
-            min: MIN((TCC_WRITE_sum / $denom))
-            max: MAX((TCC_WRITE_sum / $denom))
-            unit: (Req + $normUnit)
-          Atomic Req:
-            avg: AVG((TCC_ATOMIC_sum / $denom))
-            min: MIN((TCC_ATOMIC_sum / $denom))
-            max: MAX((TCC_ATOMIC_sum / $denom))
-            unit: (Req + $normUnit)
-          Streaming Req:
-            avg: AVG((TCC_STREAMING_REQ_sum / $denom))
-            min: MIN((TCC_STREAMING_REQ_sum / $denom))
-            max: MAX((TCC_STREAMING_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          Probe Req:
-            avg: AVG((TCC_PROBE_sum / $denom))
-            min: MIN((TCC_PROBE_sum / $denom))
-            max: MAX((TCC_PROBE_sum / $denom))
-            unit: (Req + $normUnit)
-          Cache Hit:
-            avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else None))
-            min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else None))
-            max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else None))
-            unit: pct
-          Hits:
-            avg: AVG((TCC_HIT_sum / $denom))
-            min: MIN((TCC_HIT_sum / $denom))
-            max: MAX((TCC_HIT_sum / $denom))
-            unit: (Hits + $normUnit)
-          Misses:
-            avg: AVG((TCC_MISS_sum / $denom))
-            min: MIN((TCC_MISS_sum / $denom))
-            max: MAX((TCC_MISS_sum / $denom))
-            unit: (Misses + $normUnit)
-          Writeback:
-            avg: AVG((TCC_WRITEBACK_sum / $denom))
-            min: MIN((TCC_WRITEBACK_sum / $denom))
-            max: MAX((TCC_WRITEBACK_sum / $denom))
-            unit: (Cachelines + $normUnit)
-          Writeback (Internal):
-            avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom))
-            min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom))
-            max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom))
-            unit: (Cachelines + $normUnit)
-          Writeback (vL1D Req):
-            avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
-            min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
-            max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
-            unit: (Cachelines + $normUnit)
-          Evict (Internal):
-            avg: AVG((TCC_NORMAL_EVICT_sum / $denom))
-            min: MIN((TCC_NORMAL_EVICT_sum / $denom))
-            max: MAX((TCC_NORMAL_EVICT_sum / $denom))
-            unit: (Cachelines + $normUnit)
-          Evict (vL1D Req):
-            avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
-            min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
-            max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
-            unit: (Cachelines + $normUnit)
-          NC Req:
-            avg: AVG((TCC_NC_REQ_sum / $denom))
-            min: MIN((TCC_NC_REQ_sum / $denom))
-            max: MAX((TCC_NC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          UC Req:
-            avg: AVG((TCC_UC_REQ_sum / $denom))
-            min: MIN((TCC_UC_REQ_sum / $denom))
-            max: MAX((TCC_UC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          CC Req:
-            avg: AVG((TCC_CC_REQ_sum / $denom))
-            min: MIN((TCC_CC_REQ_sum / $denom))
-            max: MAX((TCC_CC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          RW Req:
-            avg: AVG((TCC_RW_REQ_sum / $denom))
-            min: MIN((TCC_RW_REQ_sum / $denom))
-            max: MAX((TCC_RW_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-        gfx950:
-          Bandwidth:
-            avg: AVG((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
-            min: MIN((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
-            max: MAX((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))
-            unit: Gbps
-          Read Bandwidth:
-            avg: AVG(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
-            min: MIN(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
-            max: MAX(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
-            unit: Gbps
-          Write Bandwidth:
-            avg: AVG(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
-            min: MIN(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
-            max: MAX(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
-            unit: Gbps
-          Atomic Bandwidth:
-            avg: AVG(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
-            min: MIN(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
-            max: MAX(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp))
-            unit: Gbps
-          Req:
-            avg: AVG((TCC_REQ_sum / $denom))
-            min: MIN((TCC_REQ_sum / $denom))
-            max: MAX((TCC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          Read Req:
-            avg: AVG((TCC_READ_sum / $denom))
-            min: MIN((TCC_READ_sum / $denom))
-            max: MAX((TCC_READ_sum / $denom))
-            unit: (Req + $normUnit)
-          Write Req:
-            avg: AVG((TCC_WRITE_sum / $denom))
-            min: MIN((TCC_WRITE_sum / $denom))
-            max: MAX((TCC_WRITE_sum / $denom))
-            unit: (Req + $normUnit)
-          Atomic Req:
-            avg: AVG((TCC_ATOMIC_sum / $denom))
-            min: MIN((TCC_ATOMIC_sum / $denom))
-            max: MAX((TCC_ATOMIC_sum / $denom))
-            unit: (Req + $normUnit)
-          Streaming Req:
-            avg: AVG((TCC_STREAMING_REQ_sum / $denom))
-            min: MIN((TCC_STREAMING_REQ_sum / $denom))
-            max: MAX((TCC_STREAMING_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          Bypasss Req:
-            avg: AVG((TCC_BYPASS_REQ_sum / $denom))
-            min: MIN((TCC_BYPASS_REQ_sum / $denom))
-            max: MAX((TCC_BYPASS_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          Probe Req:
-            avg: AVG((TCC_PROBE_sum / $denom))
-            min: MIN((TCC_PROBE_sum / $denom))
-            max: MAX((TCC_PROBE_sum / $denom))
-            unit: (Req + $normUnit)
-          Input Buffer Req:
-            avg: AVG((TCC_IB_REQ_sum / $denom))
-            min: MIN((TCC_IB_REQ_sum / $denom))
-            max: MAX((TCC_IB_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          Cache Hit:
-            avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else None))
-            min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else None))
-            max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else None))
-            unit: pct
-          Hits:
-            avg: AVG((TCC_HIT_sum / $denom))
-            min: MIN((TCC_HIT_sum / $denom))
-            max: MAX((TCC_HIT_sum / $denom))
-            unit: (Hits + $normUnit)
-          Misses:
-            avg: AVG((TCC_MISS_sum / $denom))
-            min: MIN((TCC_MISS_sum / $denom))
-            max: MAX((TCC_MISS_sum / $denom))
-            unit: (Misses + $normUnit)
-          Writeback:
-            avg: AVG((TCC_WRITEBACK_sum / $denom))
-            min: MIN((TCC_WRITEBACK_sum / $denom))
-            max: MAX((TCC_WRITEBACK_sum / $denom))
-            unit: (Cachelines + $normUnit)
-          Writeback (Internal):
-            avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom))
-            min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom))
-            max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom))
-            unit: (Cachelines + $normUnit)
-          Writeback (vL1D Req):
-            avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
-            min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
-            max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
-            unit: (Cachelines + $normUnit)
-          Evict (Internal):
-            avg: AVG((TCC_NORMAL_EVICT_sum / $denom))
-            min: MIN((TCC_NORMAL_EVICT_sum / $denom))
-            max: MAX((TCC_NORMAL_EVICT_sum / $denom))
-            unit: (Cachelines + $normUnit)
-          Evict (vL1D Req):
-            avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
-            min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
-            max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
-            unit: (Cachelines + $normUnit)
-          NC Req:
-            avg: AVG((TCC_NC_REQ_sum / $denom))
-            min: MIN((TCC_NC_REQ_sum / $denom))
-            max: MAX((TCC_NC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          UC Req:
-            avg: AVG((TCC_UC_REQ_sum / $denom))
-            min: MIN((TCC_UC_REQ_sum / $denom))
-            max: MAX((TCC_UC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          CC Req:
-            avg: AVG((TCC_CC_REQ_sum / $denom))
-            min: MIN((TCC_CC_REQ_sum / $denom))
-            max: MAX((TCC_CC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          RW Req:
-            avg: AVG((TCC_RW_REQ_sum / $denom))
-            min: MIN((TCC_RW_REQ_sum / $denom))
-            max: MAX((TCC_RW_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-        gfx908:
-          Bandwidth:
-            avg: AVG((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp))
-            min: MIN((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp))
-            max: MAX((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp))
-            unit: Gbps
-          Req:
-            avg: AVG((TCC_REQ_sum / $denom))
-            min: MIN((TCC_REQ_sum / $denom))
-            max: MAX((TCC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          Read Req:
-            avg: AVG((TCC_READ_sum / $denom))
-            min: MIN((TCC_READ_sum / $denom))
-            max: MAX((TCC_READ_sum / $denom))
-            unit: (Req + $normUnit)
-          Write Req:
-            avg: AVG((TCC_WRITE_sum / $denom))
-            min: MIN((TCC_WRITE_sum / $denom))
-            max: MAX((TCC_WRITE_sum / $denom))
-            unit: (Req + $normUnit)
-          Atomic Req:
-            avg: AVG((TCC_ATOMIC_sum / $denom))
-            min: MIN((TCC_ATOMIC_sum / $denom))
-            max: MAX((TCC_ATOMIC_sum / $denom))
-            unit: (Req + $normUnit)
-          Streaming Req:
-            avg: AVG((TCC_STREAMING_REQ_sum / $denom))
-            min: MIN((TCC_STREAMING_REQ_sum / $denom))
-            max: MAX((TCC_STREAMING_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          Probe Req:
-            avg: AVG((TCC_PROBE_sum / $denom))
-            min: MIN((TCC_PROBE_sum / $denom))
-            max: MAX((TCC_PROBE_sum / $denom))
-            unit: (Req + $normUnit)
-          Cache Hit:
-            avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else None))
-            min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else None))
-            max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
-              + TCC_MISS_sum) != 0) else None))
-            unit: pct
-          Hits:
-            avg: AVG((TCC_HIT_sum / $denom))
-            min: MIN((TCC_HIT_sum / $denom))
-            max: MAX((TCC_HIT_sum / $denom))
-            unit: (Hits + $normUnit)
-          Misses:
-            avg: AVG((TCC_MISS_sum / $denom))
-            min: MIN((TCC_MISS_sum / $denom))
-            max: MAX((TCC_MISS_sum / $denom))
-            unit: (Misses + $normUnit)
-          Writeback:
-            avg: AVG((TCC_WRITEBACK_sum / $denom))
-            min: MIN((TCC_WRITEBACK_sum / $denom))
-            max: MAX((TCC_WRITEBACK_sum / $denom))
-            unit: (Cachelines + $normUnit)
-          Writeback (Internal):
-            avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom))
-            min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom))
-            max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom))
-            unit: (Cachelines + $normUnit)
-          Writeback (vL1D Req):
-            avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
-            min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
-            max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom))
-            unit: (Cachelines + $normUnit)
-          Evict (Internal):
-            avg: AVG((TCC_NORMAL_EVICT_sum / $denom))
-            min: MIN((TCC_NORMAL_EVICT_sum / $denom))
-            max: MAX((TCC_NORMAL_EVICT_sum / $denom))
-            unit: (Cachelines + $normUnit)
-          Evict (vL1D Req):
-            avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
-            min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
-            max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom))
-            unit: (Cachelines + $normUnit)
-          NC Req:
-            avg: AVG((TCC_NC_REQ_sum / $denom))
-            min: MIN((TCC_NC_REQ_sum / $denom))
-            max: MAX((TCC_NC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          UC Req:
-            avg: AVG((TCC_UC_REQ_sum / $denom))
-            min: MIN((TCC_UC_REQ_sum / $denom))
-            max: MAX((TCC_UC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          CC Req:
-            avg: AVG((TCC_CC_REQ_sum / $denom))
-            min: MIN((TCC_CC_REQ_sum / $denom))
-            max: MAX((TCC_CC_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-          RW Req:
-            avg: AVG((TCC_RW_REQ_sum / $denom))
-            min: MIN((TCC_RW_REQ_sum / $denom))
-            max: MAX((TCC_RW_REQ_sum / $denom))
-            unit: (Req + $normUnit)
-  - metric_table:
-      id: 1704
-      title: L2 Cache Stalls
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      metric:
-        gfx90a: {}
-        gfx941: {}
-        gfx940: {}
-        gfx942: {}
-        gfx950:
-          Stalled on Latency FIFO:
-            avg: AVG(TCC_LATENCY_FIFO_FULL_sum / $denom)
-            min: MIN(TCC_LATENCY_FIFO_FULL_sum / $denom)
-            max: MAX(TCC_LATENCY_FIFO_FULL_sum / $denom)
-            unit: (Cycles + $normUnit)
-          Stalled on Write Data FIFO:
-            avg: AVG(TCC_SRC_FIFO_FULL_sum / $denom)
-            min: MIN(TCC_SRC_FIFO_FULL_sum / $denom)
-            max: MAX(TCC_SRC_FIFO_FULL_sum / $denom)
-            unit: (Cycles + $normUnit)
-          Input Buffer Stalled on L2:
-            avg: AVG(TCC_IB_STALL_sum / $denom)
-            min: MIN(TCC_IB_STALL_sum / $denom)
-            max: MAX(TCC_IB_STALL_sum / $denom)
-            unit: (Cycles + $normUnit)
-        gfx908: {}
-  - metric_table:
-      id: 1705
-      title: L2 - Fabric Interface stalls
-      header:
-        metric: Metric
-        type: Type
-        transaction: Transaction
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      style:
-        type: simple_multi_bar
-      metric:
-        gfx90a:
-          Write - Credit Starvation:
-            type: Credit Starvation
-            transaction: Write
-            avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
-              (TCC_BUSY_sum != 0) else None))
-            min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
-              (TCC_BUSY_sum != 0) else None))
-            max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
-              (TCC_BUSY_sum != 0) else None))
-            unit: pct
-        gfx941:
-          Write - Credit Starvation:
-            type: Credit Starvation
-            transaction: Write
-            avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
-              (TCC_BUSY_sum != 0) else None))
-            min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
-              (TCC_BUSY_sum != 0) else None))
-            max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
-              (TCC_BUSY_sum != 0) else None))
-            unit: pct
-        gfx940:
-          Write - Credit Starvation:
-            type: Credit Starvation
-            transaction: Write
-            avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
-              (TCC_BUSY_sum != 0) else None))
-            min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
-              (TCC_BUSY_sum != 0) else None))
-            max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
-              (TCC_BUSY_sum != 0) else None))
-            unit: pct
-        gfx942:
-          Write - Credit Starvation:
-            type: Credit Starvation
-            transaction: Write
-            avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
-              (TCC_BUSY_sum != 0) else None))
-            min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
-              (TCC_BUSY_sum != 0) else None))
-            max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
-              (TCC_BUSY_sum != 0) else None))
-            unit: pct
-        gfx950:
-          Read - PCIe Stall:
-            type: PCIe Stall
-            transaction: Read
-            avg: AVG(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if
-              (TCC_BUSY_sum != 0) else None))
-            min: MIN(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if
-              (TCC_BUSY_sum != 0) else None))
-            max: MAX(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if
-              (TCC_BUSY_sum != 0) else None))
-            unit: pct
-          Read - Infinity Fabric Stall:
-            type: "Infinity Fabric\u2122 Stall"
-            transaction: Read
-            avg: AVG(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum))
-              if (TCC_BUSY_sum != 0) else None))
-            min: MIN(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum))
-              if (TCC_BUSY_sum != 0) else None))
-            max: MAX(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum))
-              if (TCC_BUSY_sum != 0) else None))
-            unit: pct
-          Read - HBM Stall:
-            type: HBM Stall
-            transaction: Read
-            avg: AVG(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum))
-              if (TCC_BUSY_sum != 0) else None))
-            min: MIN(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum))
-              if (TCC_BUSY_sum != 0) else None))
-            max: MAX(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum))
-              if (TCC_BUSY_sum != 0) else None))
-            unit: pct
-          Write - PCIe Stall:
-            type: PCIe Stall
-            transaction: Write
-            avg: AVG(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if
-              (TCC_BUSY_sum != 0) else None))
-            min: MIN(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if
-              (TCC_BUSY_sum != 0) else None))
-            max: MAX(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if
-              (TCC_BUSY_sum != 0) else None))
-            unit: pct
-          Write - Infinity Fabric Stall:
-            type: "Infinity Fabric\u2122 Stall"
-            transaction: Write
-            avg: AVG(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum))
-              if (TCC_BUSY_sum != 0) else None))
-            min: MIN(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum))
-              if (TCC_BUSY_sum != 0) else None))
-            max: MAX(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum))
-              if (TCC_BUSY_sum != 0) else None))
-            unit: pct
-          Write - HBM Stall:
-            type: HBM Stall
-            transaction: Write
-            avg: AVG(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum))
-              if (TCC_BUSY_sum != 0) else None))
-            min: MIN(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum))
-              if (TCC_BUSY_sum != 0) else None))
-            max: MAX(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum))
-              if (TCC_BUSY_sum != 0) else None))
-            unit: pct
-          Write - Credit Starvation:
-            type: Credit Starvation
-            transaction: Write
-            avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
-              (TCC_BUSY_sum != 0) else None))
-            min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
-              (TCC_BUSY_sum != 0) else None))
-            max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
-              (TCC_BUSY_sum != 0) else None))
-            unit: pct
-        gfx908:
-          Write - Credit Starvation:
-            type: Credit Starvation
-            transaction: Write
-            avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
-              (TCC_BUSY_sum != 0) else None))
-            min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
-              (TCC_BUSY_sum != 0) else None))
-            max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if
-              (TCC_BUSY_sum != 0) else None))
-            unit: pct
-  - metric_table:
-      id: 1706
-      title: L2 - Fabric interface detailed metrics
-      header:
-        metric: Metric
-        avg: Avg
-        min: Min
-        max: Max
-        unit: Unit
-      metric:
-        gfx90a:
-          Read (32B):
-            avg: AVG((TCC_EA_RDREQ_32B_sum / $denom))
-            min: MIN((TCC_EA_RDREQ_32B_sum / $denom))
-            max: MAX((TCC_EA_RDREQ_32B_sum / $denom))
-            unit: (Req + $normUnit)
-          Read (64B):
-            avg: AVG(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom))
-            min: MIN(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom))
-            max: MAX(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom))
-            unit: (Req + $normUnit)
-          Read (Uncached):
-            avg: AVG((TCC_EA_RD_UNCACHED_32B_sum / $denom))
-            min: MIN((TCC_EA_RD_UNCACHED_32B_sum / $denom))
-            max: MAX((TCC_EA_RD_UNCACHED_32B_sum / $denom))
-            unit: (Req + $normUnit)
-          HBM Read:
-            avg: AVG((TCC_EA_RDREQ_DRAM_sum / $denom))
-            min: MIN((TCC_EA_RDREQ_DRAM_sum / $denom))
-            max: MAX((TCC_EA_RDREQ_DRAM_sum / $denom))
-            unit: (Req + $normUnit)
-          Remote Read:
-            avg: AVG((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom))
-            min: MIN((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom))
-            max: MAX((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom))
-            unit: (Req + $normUnit)
-          Write and Atomic (32B):
-            avg: AVG(MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom), 0))
-            min: MIN(MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom), 0))
-            max: MAX(MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom), 0))
-            unit: (Req + $normUnit)
-          Write and Atomic (Uncached):
-            avg: AVG((TCC_EA_WR_UNCACHED_32B_sum / $denom))
-            min: MIN((TCC_EA_WR_UNCACHED_32B_sum / $denom))
-            max: MAX((TCC_EA_WR_UNCACHED_32B_sum / $denom))
-            unit: (Req + $normUnit)
-          Write and Atomic (64B):
-            avg: AVG((TCC_EA_WRREQ_64B_sum / $denom))
-            min: MIN((TCC_EA_WRREQ_64B_sum / $denom))
-            max: MAX((TCC_EA_WRREQ_64B_sum / $denom))
-            unit: (Req + $normUnit)
-          HBM Write and Atomic:
-            avg: AVG((TCC_EA_WRREQ_DRAM_sum / $denom))
-            min: MIN((TCC_EA_WRREQ_DRAM_sum / $denom))
-            max: MAX((TCC_EA_WRREQ_DRAM_sum / $denom))
-            unit: (Req + $normUnit)
-          Remote Write and Atomic:
-            avg: AVG((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom))
-            min: MIN((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom))
-            max: MAX((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom))
-            unit: (Req + $normUnit)
-          Atomic:
-            avg: AVG((TCC_EA_ATOMIC_sum / $denom))
-            min: MIN((TCC_EA_ATOMIC_sum / $denom))
-            max: MAX((TCC_EA_ATOMIC_sum / $denom))
-            unit: (Req + $normUnit)
-        gfx941:
-          Read (32B):
-            avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom))
-            min: MIN((TCC_EA0_RDREQ_32B_sum / $denom))
-            max: MAX((TCC_EA0_RDREQ_32B_sum / $denom))
-            unit: (Req + $normUnit)
-          Read (64B):
-            avg: AVG(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
-            min: MIN(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
-            max: MAX(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
-            unit: (Req + $normUnit)
-          Read (Uncached):
-            avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
-            min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
-            max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
-            unit: (Req + $normUnit)
-          HBM Read:
-            avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom))
-            min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom))
-            max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom))
-            unit: (Req + $normUnit)
-          Remote Read:
-            avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
-            min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
-            max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
-            unit: (Req + $normUnit)
-          Write and Atomic (32B):
-            avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
-            min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
-            max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
-            unit: (Req + $normUnit)
-          Write and Atomic (Uncached):
-            avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
-            min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
-            max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
-            unit: (Req + $normUnit)
-          Write and Atomic (64B):
-            avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom))
-            min: MIN((TCC_EA0_WRREQ_64B_sum / $denom))
-            max: MAX((TCC_EA0_WRREQ_64B_sum / $denom))
-            unit: (Req + $normUnit)
-          HBM Write and Atomic:
-            avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom))
-            min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom))
-            max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom))
-            unit: (Req + $normUnit)
-          Remote Write and Atomic:
-            avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
-            min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
-            max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
-            unit: (Req + $normUnit)
-          Atomic:
-            avg: AVG((TCC_EA0_ATOMIC_sum / $denom))
-            min: MIN((TCC_EA0_ATOMIC_sum / $denom))
-            max: MAX((TCC_EA0_ATOMIC_sum / $denom))
-            unit: (Req + $normUnit)
-        gfx940:
-          Read (32B):
-            avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom))
-            min: MIN((TCC_EA0_RDREQ_32B_sum / $denom))
-            max: MAX((TCC_EA0_RDREQ_32B_sum / $denom))
-            unit: (Req + $normUnit)
-          Read (64B):
-            avg: AVG(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
-            min: MIN(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
-            max: MAX(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
-            unit: (Req + $normUnit)
-          Read (Uncached):
-            avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
-            min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
-            max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
-            unit: (Req + $normUnit)
-          HBM Read:
-            avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom))
-            min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom))
-            max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom))
-            unit: (Req + $normUnit)
-          Remote Read:
-            avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
-            min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
-            max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
-            unit: (Req + $normUnit)
-          Write and Atomic (32B):
-            avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
-            min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
-            max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
-            unit: (Req + $normUnit)
-          Write and Atomic (Uncached):
-            avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
-            min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
-            max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
-            unit: (Req + $normUnit)
-          Write and Atomic (64B):
-            avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom))
-            min: MIN((TCC_EA0_WRREQ_64B_sum / $denom))
-            max: MAX((TCC_EA0_WRREQ_64B_sum / $denom))
-            unit: (Req + $normUnit)
-          HBM Write and Atomic:
-            avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom))
-            min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom))
-            max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom))
-            unit: (Req + $normUnit)
-          Remote Write and Atomic:
-            avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
-            min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
-            max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
-            unit: (Req + $normUnit)
-          Atomic:
-            avg: AVG((TCC_EA0_ATOMIC_sum / $denom))
-            min: MIN((TCC_EA0_ATOMIC_sum / $denom))
-            max: MAX((TCC_EA0_ATOMIC_sum / $denom))
-            unit: (Req + $normUnit)
-        gfx942:
-          Read (32B):
-            avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom))
-            min: MIN((TCC_EA0_RDREQ_32B_sum / $denom))
-            max: MAX((TCC_EA0_RDREQ_32B_sum / $denom))
-            unit: (Req + $normUnit)
-          Read (64B):
-            avg: AVG(MAX(((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) /
-              $denom), 0))
-            min: MIN(MAX(((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) /
-              $denom), 0))
-            max: MAX(MAX(((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) /
-              $denom), 0))
-            unit: (Req + $normUnit)
-          Read (128B):
-            avg: AVG(((TCC_BUBBLE_sum) / $denom))
-            min: MIN(((TCC_BUBBLE_sum) / $denom))
-            max: MAX(((TCC_BUBBLE_sum) / $denom))
-            unit: (Req + $normUnit)
-          Read (Uncached):
-            avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
-            min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
-            max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
-            unit: (Req + $normUnit)
-          HBM Read:
-            avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom))
-            min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom))
-            max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom))
-            unit: (Req + $normUnit)
-          Remote Read:
-            avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
-            min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
-            max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
-            unit: (Req + $normUnit)
-          Write and Atomic (32B):
-            avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
-            min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
-            max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
-            unit: (Req + $normUnit)
-          Write and Atomic (Uncached):
-            avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
-            min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
-            max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
-            unit: (Req + $normUnit)
-          Write and Atomic (64B):
-            avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom))
-            min: MIN((TCC_EA0_WRREQ_64B_sum / $denom))
-            max: MAX((TCC_EA0_WRREQ_64B_sum / $denom))
-            unit: (Req + $normUnit)
-          HBM Write and Atomic:
-            avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom))
-            min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom))
-            max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom))
-            unit: (Req + $normUnit)
-          Remote Write and Atomic:
-            avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
-            min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
-            max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
-            unit: (Req + $normUnit)
-          Atomic:
-            avg: AVG((TCC_EA0_ATOMIC_sum / $denom))
-            min: MIN((TCC_EA0_ATOMIC_sum / $denom))
-            max: MAX((TCC_EA0_ATOMIC_sum / $denom))
-            unit: (Req + $normUnit)
-        gfx950:
-          Read (32B):
-            avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom))
-            min: MIN((TCC_EA0_RDREQ_32B_sum / $denom))
-            max: MAX((TCC_EA0_RDREQ_32B_sum / $denom))
-            unit: (Req + $normUnit)
-          Read (64B):
-            avg: AVG((TCC_EA0_RDREQ_64B_sum / $denom))
-            min: MIN((TCC_EA0_RDREQ_64B_sum / $denom))
-            max: MAX((TCC_EA0_RDREQ_64B_sum / $denom))
-            unit: (Req + $normUnit)
-          Read (128B):
-            avg: AVG((TCC_EA0_RDREQ_128B_sum / $denom))
-            min: MIN((TCC_EA0_RDREQ_128B_sum / $denom))
-            max: MAX((TCC_EA0_RDREQ_128B_sum / $denom))
-            unit: (Req + $normUnit)
-          Read (Uncached):
-            avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
-            min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
-            max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
-            unit: (Req + $normUnit)
-          HBM Read:
-            avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom))
-            min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom))
-            max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom))
-            unit: (Req + $normUnit)
-          Remote Read:
-            avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
-            min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
-            max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
-            unit: (Req + $normUnit)
-          Read Bandwidth - PCIe:
-            avg: AVG(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-            min: MIN(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-            max: MAX(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-            unit: Gbps
-          "Read Bandwidth - Infinity Fabric\u2122":
-            avg: AVG(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-            min: MIN(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-            max: MAX(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-            unit: Gbps
-          Read Bandwidth - HBM:
-            avg: AVG(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-            min: MIN(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-            max: MAX(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-            unit: Gbps
-          Write and Atomic (32B):
-            avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
-            min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
-            max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
-            unit: (Req + $normUnit)
-          Write and Atomic (Uncached):
-            avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
-            min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
-            max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
-            unit: (Req + $normUnit)
-          Write and Atomic (64B):
-            avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom))
-            min: MIN((TCC_EA0_WRREQ_64B_sum / $denom))
-            max: MAX((TCC_EA0_WRREQ_64B_sum / $denom))
-            unit: (Req + $normUnit)
-          HBM Write and Atomic:
-            avg: AVG((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
-            min: MIN((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
-            max: MAX((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom))
-            unit: (Req + $normUnit)
-          Remote Write and Atomic:
-            avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
-            min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
-            max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
-            unit: (Req + $normUnit)
-          Write Bandwidth - PCIe:
-            avg: AVG(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-            min: MIN(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-            max: MAX(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-            unit: Gbps
-          "Write Bandwidth - Infinity Fabric\u2122":
-            avg: AVG(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-            min: MIN(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-            max: MAX(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-            unit: Gbps
-          Write Bandwidth - HBM:
-            avg: AVG(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-            min: MIN(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-            max: MAX(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-            unit: Gbps
-          Atomic:
-            avg: AVG((TCC_EA0_ATOMIC_sum / $denom))
-            min: MIN((TCC_EA0_ATOMIC_sum / $denom))
-            max: MAX((TCC_EA0_ATOMIC_sum / $denom))
-            unit: (Req + $normUnit)
-          Atomic - HBM:
-            avg: AVG((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom))
-            min: MIN((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom))
-            max: MAX((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom))
-            unit: (Req + $normUnit)
-          Atomic Bandwidth - PCIe:
-            avg: AVG(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-            min: MIN(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-            max: MAX(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-            unit: Gbps
-          "Atomic Bandwidth - Infinity Fabric\u2122":
-            avg: AVG(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-            min: MIN(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-            max: MAX(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-            unit: Gbps
-          Atomic Bandwidth - HBM:
-            avg: AVG(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-            min: MIN(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-            max: MAX(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp))
-            unit: Gbps
-        gfx908:
-          Read (32B):
-            avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom))
-            min: MIN((TCC_EA0_RDREQ_32B_sum / $denom))
-            max: MAX((TCC_EA0_RDREQ_32B_sum / $denom))
-            unit: (Req + $normUnit)
-          Read (64B):
-            avg: AVG(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
-            min: MIN(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
-            max: MAX(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom))
-            unit: (Req + $normUnit)
-          Read (Uncached):
-            avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
-            min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
-            max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom))
-            unit: (Req + $normUnit)
-          HBM Read:
-            avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom))
-            min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom))
-            max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom))
-            unit: (Req + $normUnit)
-          Remote Read:
-            avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
-            min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
-            max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom))
-            unit: (Req + $normUnit)
-          Write and Atomic (32B):
-            avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
-            min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
-            max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0))
-            unit: (Req + $normUnit)
-          Write and Atomic (Uncached):
-            avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
-            min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
-            max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom))
-            unit: (Req + $normUnit)
-          Write and Atomic (64B):
-            avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom))
-            min: MIN((TCC_EA0_WRREQ_64B_sum / $denom))
-            max: MAX((TCC_EA0_WRREQ_64B_sum / $denom))
-            unit: (Req + $normUnit)
-          HBM Write and Atomic:
-            avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom))
-            min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom))
-            max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom))
-            unit: (Req + $normUnit)
-          Remote Write and Atomic:
-            avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
-            min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
-            max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom))
-            unit: (Req + $normUnit)
-          Atomic:
-            avg: AVG((TCC_EA0_ATOMIC_sum / $denom))
-            min: MIN((TCC_EA0_ATOMIC_sum / $denom))
-            max: MAX((TCC_EA0_ATOMIC_sum / $denom))
-            unit: (Req + $normUnit)
-  metrics_description:
-    Utilization:
-      plain: The ratio of the number of cycles an L2 channel was active, summed over
-        all L2 channels on the accelerator over the total L2 cycles.
-      rst: The ratio of the :ref:`number of cycles an L2 channel was active, summed
-        over all L2 channels on the accelerator <total-active-l2-cycles>` over the
-        :ref:`total L2 cycles <total-l2-cycles>`.
-      unit: Percent
-    Peak Bandwidth:
-      plain: The number of bytes looked up in the L2 cache, as a percent of the peak
-        theoretical bandwidth achievable on the specific accelerator. The number of
-        bytes is calculated as the number of cache lines requested multiplied by the
-        cache line size. This value does not consider partial requests, so e.g., if
-        only a single value is requested in a cache line, the data movement will still
-        be counted as a full cache line.
-      rst: The number of bytes looked up in the L2 cache, as a percent of the peak theoretical
-        bandwidth achievable on the specific accelerator. The number of bytes is
-        calculated as the number of cache lines requested multiplied by the cache
-        line size. This value does not consider partial requests, so e.g., if only
-        a single value is requested in a cache line, the data movement will still
-        be counted as a full cache line.
-      unit: Percent
-    Hit Rate:
-      plain: The ratio of the number of L2 cache line requests that hit in the L2
-        cache over the total number of incoming cache line requests to the L2 cache.
-      rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
-        over the total number of incoming cache line requests to the L2 cache.
-      unit: Percent
-    L2-Fabric Read BW:
-      plain: The number of bytes read by the L2 over the Infinity Fabric interface
-        per unit time.
-      rst: The number of bytes read by the L2 over the :ref:`Infinity Fabric interface
-        <l2-fabric>` per unit time.
-      unit: GB/s
-    L2-Fabric Write and Atomic BW:
-      plain: The number of bytes sent by the L2 over the Infinity Fabric interface
-        by write and atomic operations per unit time.
-      rst: The number of bytes sent by the L2 over the :ref:`Infinity Fabric interface
-        <l2-fabric>` by write and atomic operations per unit time.
-      unit: GB/s
-    HBM Bandwidth:
-      plain: Maximum theoretical bandwidth of the accelerator's local high-bandwidth
-        memory (HBM) per unit time. This value is calculated as the number of HBM
-        channels multiplied by the HBM channel width multiplied by the HBM clock frequency.
-      rst: Maximum theoretical bandwidth of the accelerator's local high-bandwidth
-        memory (HBM) per unit time. This value is calculated as the number of HBM
-        channels multiplied by the HBM channel width multiplied by the HBM clock frequency.
-      unit: GB/s
-    Read BW:
-      plain: The total number of bytes read by the L2 cache from Infinity Fabric divided by total duration.
-      rst: The total number of bytes read by the L2 cache from Infinity Fabric divided by total duration.
-      unit: Gbps
-    HBM Read Traffic:
-      plain: The percent of read requests generated by the L2 cache that are routed
-        to the accelerator's local high-bandwidth memory (HBM). This breakdown does
-        not consider the size of the request (meaning that 32B and 64B requests are
-        both counted as a single request), so this metric only approximates the percent
-        of the L2-Fabric Read bandwidth directed to the local HBM.
-      rst: The percent of read requests generated by the L2 cache that are routed
-        to the accelerator's local high-bandwidth memory (HBM). This breakdown does not
-        consider the *size* of the request (meaning that 32B and 64B requests are
-        both counted as a single request), so this metric only *approximates* the
-        percent of the L2-Fabric Read bandwidth directed to the local HBM.
-      unit: Percent
-    Remote Read Traffic:
-      plain: The percent of read requests generated by the L2 cache that are routed
-        to any memory location other than the accelerator's local high-bandwidth memory
-        (HBM) - for example, the CPU's DRAM or a remote accelerator's HBM. This breakdown
-        does not consider the size of the request (meaning that 32B and 64B requests
-        are both counted as a single request), so this metric only approximates the
-        percent of the L2-Fabric Read bandwidth directed to a remote location.
-      rst: The percent of read requests generated by the L2 cache that are routed
-        to any memory location other than the accelerator's local high-bandwidth memory
-        (HBM) -- for example, the CPU's DRAM or a remote accelerator's HBM. This
-        breakdown does not consider the *size* of the request (meaning that 32B and
-        64B requests are both counted as a single request), so this metric only *approximates*
-        the percent of the L2-Fabric Read bandwidth directed to a remote location.
-      unit: Percent
-    Uncached Read Traffic:
-      plain: The percent of read requests generated by the L2 cache that are reading
-        from an uncached memory allocation. Note, as described in the request flow
-        section, a single 64B read request is typically counted as two uncached read
-        requests. So, it is possible for the Uncached Read Traffic to reach up to
-        200% of the total number of read requests. This breakdown does not consider
-        the size of the request (i.e., 32B and 64B requests are both counted as a
-        single request), so this metric only approximates the percent of the L2-Fabric
-        read bandwidth directed to an uncached memory location.
-      rst: The percent of read requests generated by the L2 cache that are reading from
-        an :ref:`uncached memory allocation <memory-type>`. Note, as described in
-        the :ref:`request flow <l2-request-flow>` section, a single 64B read request
-        is typically counted as two uncached read requests. So, it is possible for
-        the Uncached Read Traffic to reach up to 200% of the total number of read
-        requests. This breakdown does not consider the *size* of the request (i.e.,
-        32B and 64B requests are both counted as a single request), so this metric
-        only *approximates* the percent of the L2-Fabric read bandwidth directed
-        to an uncached memory location.
-      unit: Percent
-    Write and Atomic BW:
-      plain: The total number of bytes written by the L2 over Infinity Fabric by write
-        and atomic operations divided by total duration. Note that on current CDNA accelerators,
-        such as the MI2XX, requests are only considered atomic by Infinity Fabric
-        if they are targeted at non-write-cacheable memory, for example, fine-grained
-        memory allocations or uncached memory allocations on the MI2XX.
-      rst: The total number of bytes written by the L2 over Infinity Fabric by write and
-        atomic operations divided by total duration. Note
-        that on current CDNA accelerators, such as the :ref:`MI2XX <mixxx-note>`,
-        requests are only considered *atomic* by Infinity Fabric if they are targeted
-        at non-write-cacheable memory, for example, :ref:`fine-grained memory <memory-type>`
-        allocations or :ref:`uncached memory <memory-type>` allocations on the MI2XX.
-      unit: Gbps
-    HBM Write and Atomic Traffic:
-      plain: The percent of write and atomic requests generated by the L2 cache that
-        are routed to the accelerator's local high-bandwidth memory (HBM). This breakdown
-        does not consider the size of the request (meaning that 32B and 64B requests
-        are both counted as a single request), so this metric only approximates the
-        percent of the L2-Fabric Write and Atomic bandwidth directed to the local
-        HBM. Note that on current CDNA accelerators, such as the MI2XX, requests are
-        only considered atomic by Infinity Fabric if they are targeted at fine-grained
-        memory allocations or uncached memory allocations.
-      rst: The percent of write and atomic requests generated by the L2 cache that are
-        routed to the accelerator's local high-bandwidth memory (HBM). This breakdown
-        does not consider the *size* of the request (meaning that 32B and 64B requests
-        are both counted as a single request), so this metric only *approximates*
-        the percent of the L2-Fabric Write and Atomic bandwidth directed to the local
-        HBM. Note that on current CDNA accelerators, such as the :ref:`MI2XX <mixxx-note>`,
-        requests are only considered *atomic* by Infinity Fabric if they are targeted
-        at :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached
-        memory <memory-type>` allocations.
-      unit: Percent
-    Remote Write and Atomic Traffic:
-      plain: The percent of read requests generated by the L2 cache that are routed
-        to any memory location other than the accelerator's local high-bandwidth memory
-        (HBM) - for example, the CPU's DRAM or a remote accelerator's HBM. This breakdown
-        does not consider the size of the request (meaning that 32B and 64B requests
-        are both counted as a single request), so this metric only approximates the
-        percent of the L2-Fabric Read bandwidth directed to a remote location. Note
-        that on current CDNA accelerators, such as the MI2XX, requests are only considered
-        atomic by Infinity Fabric if they are targeted at fine-grained memory allocations
-        or uncached memory allocations.
-      rst: The percent of read requests generated by the L2 cache that are routed
-        to any memory location other than the accelerator's local high-bandwidth memory
-        (HBM) -- for example, the CPU's DRAM or a remote accelerator's HBM. This
-        breakdown does not consider the *size* of the request (meaning that 32B and
-        64B requests are both counted as a single request), so this metric only *approximates*
-        the percent of the L2-Fabric Read bandwidth directed to a remote location.
-        Note that on current CDNA accelerators, such as the :ref:`MI2XX <mixxx-note>`,
-        requests are only considered *atomic* by Infinity Fabric if they are targeted
-        at :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached
-        memory <memory-type>` allocations.
-      unit: Percent
-    Atomic Traffic:
-      plain: The percent of write requests generated by the L2 cache that are atomic
-        requests to any memory location. This breakdown does not consider the size
-        of the request (meaning that 32B and 64B requests are both counted as a single
-        request), so this metric only approximates the percent of the L2-Fabric Read
-        bandwidth directed to a remote location. Note that on current CDNA accelerators,
-        such as the MI2XX, requests are only considered atomic by Infinity Fabric
-        if they are targeted at fine-grained memory allocations or uncached memory
-        allocations.
-      rst: The percent of write requests generated by the L2 cache that are atomic requests
-        to *any* memory location. This breakdown does not consider the *size* of
-        the request (meaning that 32B and 64B requests are both counted as a single
-        request), so this metric only *approximates* the percent of the L2-Fabric
-        Read bandwidth directed to a remote location. Note that on current CDNA accelerators,
-        such as the :ref:`MI2XX <mixxx-note>`, requests are only considered *atomic*
-        by Infinity Fabric if they are targeted at :ref:`fine-grained memory <memory-type>`
-        allocations or :ref:`uncached memory <memory-type>` allocations.
-      unit: Percent
-    Uncached Write and Atomic Traffic:
-      plain: The percent of write and atomic requests generated by the L2 cache that
-        are targeting uncached memory allocations. This breakdown does not consider
-        the size of the request (meaning that 32B and 64B requests are both counted
-        as a single request), so this metric only approximates the percent of the
-        L2-Fabric read bandwidth directed to uncached memory allocations.
-      rst: The percent of write and atomic requests generated by the L2 cache that are
-        targeting :ref:`uncached memory allocations <memory-type>`. This breakdown
-        does not consider the *size* of the request (meaning that 32B and 64B requests
-        are both counted as a single request), so this metric only *approximates*
-        the percent of the L2-Fabric read bandwidth directed to uncached memory allocations.
-      unit: Percent
-    Read Latency:
-      plain: The time-averaged number of cycles read requests spent in Infinity Fabric
-        before data was returned to the L2.
-      rst: The time-averaged number of cycles read requests spent in Infinity Fabric before
-        data was returned to the L2.
-      unit: Cycles
-    Write and Atomic Latency:
-      plain: The time-averaged number of cycles write requests spent in Infinity Fabric
-        before a completion acknowledgement was returned to the L2.
-      rst: The time-averaged number of cycles write requests spent in Infinity Fabric
-        before a completion acknowledgement was returned to the L2.
-      unit: Cycles
-    Atomic Latency:
-      plain: The time-averaged number of cycles atomic requests spent in Infinity
-        Fabric before a completion acknowledgement (atomic without return value) or
-        data (atomic with return value) was returned to the L2.
-      rst: The time-averaged number of cycles atomic requests spent in Infinity Fabric
-        before a completion acknowledgement (atomic without return value) or data
-        (atomic with return value) was returned to the L2.
-      unit: Cycles
-    Bandwidth:
-      plain: The number of bytes looked up in the L2 cache, divided by total duration.
-        The number of bytes is calculated as the number of cache lines requested multiplied
-        by the cache line size. This value does not consider partial requests, so
-        for example, if only a single value is requested in a cache line, the data
-        movement will still be counted as a full cache line.
-      rst: The number of bytes looked up in the L2 cache, divided by total duration.
-        The number of bytes is calculated as the number of cache lines requested
-        multiplied by the cache line size. This value does
-        not consider partial requests, so for example, if only a single value is
-        requested in a cache line, the data movement will still be counted as a full
-        cache line.
-      unit: Gbps
-    Read Bandwidth:
-      plain: Total number of bytes looked up in the L2 cache for read requests,
-        divided by total duration.
-      rst: Total number of bytes looked up in the L2 cache for read requests,
-        divided by total duration.
-      unit: Gbps
-    Write Bandwidth:
-      plain: Total number of bytes looked up in the L2 cache for write requests,
-        divided by total duration.
-      rst: Total number of bytes looked up in the L2 cache for write requests,
-        divided by total duration.
-      unit: Gbps
-    Atomic Bandwidth:
-      plain: Total number of bytes looked up in the L2 cache for atomic requests,
-        divided by total duration.
-      rst: Total number of bytes looked up in the L2 cache for atomic requests,
-        divided by total duration.
-      unit: Gbps
-    Req:
-      plain: The total number of incoming requests to the L2 from all clients for
-        all request types, per normalization unit.
-      rst: The total number of incoming requests to the L2 from all clients for all request
-        types, per :ref:`normalization unit <normalization-units>`.
-      unit: Requests per normalization unit
-    Read Req:
-      plain: The total number of read requests to the L2 from all clients.
-      rst: The total number of read requests to the L2 from all clients.
-      unit: Requests per normalization unit
-    Write Req:
-      plain: The total number of write requests to the L2 from all clients.
-      rst: The total number of write requests to the L2 from all clients.
-      unit: Requests per normalization unit
-    Atomic Req:
-      plain: The total number of atomic requests (with and without return) to the
-        L2 from all clients.
-      rst: The total number of atomic requests (with and without return) to the L2
-        from all clients.
-      unit: Requests per normalization unit
-    Streaming Req:
-      plain: The total number of incoming requests to the L2 that are marked as streaming.
-        The exact meaning of this may differ depending on the targeted accelerator,
-        however on an MI2XX this corresponds to non-temporal load or stores. The L2
-        cache attempts to evict streaming requests before normal requests when the
-        L2 is at capacity.
-      rst: The total number of incoming requests to the L2 that are marked as *streaming*.
-        The exact meaning of this may differ depending on the targeted accelerator,
-        however on an :ref:`MI2XX <mixxx-note>` this corresponds to `non-temporal
-        load or stores <https://clang.llvm.org/docs/LanguageExtensions.html#non-temporal-load-store-builtins>`_. The
-        L2 cache attempts to evict *streaming* requests before normal requests when
-        the L2 is at capacity.
-      unit: Requests per normalization unit
-    Probe Req:
-      plain: The number of coherence probe requests made to the L2 cache from outside
-        the accelerator. On an MI2XX, probe requests may be generated by, for example,
-        writes to fine-grained device memory or by writes to coarse-grained device
-        memory.
-      rst: The number of coherence probe requests made to the L2 cache from outside the
-        accelerator. On an :ref:`MI2XX <mixxx-note>`, probe requests may be generated
-        by, for example, writes to :ref:`fine-grained device <memory-type>` memory
-        or by writes to :ref:`coarse-grained <memory-type>` device memory.
-      unit: Requests per normalization unit
-    Cache Hit:
-      plain: The ratio of the number of L2 cache line requests that hit in the L2
-        cache over the total number of incoming cache line requests to the L2 cache.
-      rst: The ratio of the number of L2 cache line requests that hit in the L2 cache
-        over the total number of incoming cache line requests to the L2 cache.
-      unit: Percent
-    Hits:
-      plain: The total number of requests to the L2 from all clients that hit in the
-        cache. As noted in the Speed-of-Light section, this includes hit-on-miss requests.
-      rst: The total number of requests to the L2 from all clients that hit in the cache.
-        As noted in the :ref:`Speed-of-Light <l2-sol>` section, this includes hit-on-miss
-        requests.
-      unit: Requests per normalization unit
-    Misses:
-      plain: The total number of requests to the L2 from all clients that miss in
-        the cache. As noted in the Speed-of-Light section, these do not include hit-on-miss
-        requests.
-      rst: The total number of requests to the L2 from all clients that miss in the cache.
-        As noted in the :ref:`Speed-of-Light <l2-sol>` section, these do not include
-        hit-on-miss requests.
-      unit: Requests per normalization unit
-    Writeback:
-      plain: The total number of L2 cache lines written back to memory for any reason.
-        Write-backs may occur due to user code (such as HIP kernel calls to __threadfence_system
-        or atomic built-ins), by the command processor's memory acquire/release fences,
-        or for other internal hardware reasons.
-      rst: The total number of L2 cache lines written back to memory for any reason. Write-backs
-        may occur due to user code (such as HIP kernel calls to ``__threadfence_system``
-        or atomic built-ins), by the :doc:`command processor <command-processor>`'s
-        memory acquire/release fences, or for other internal hardware reasons.
-      unit: Cache lines per normalization unit
-    Writeback (Internal):
-      plain: The total number of L2 cache lines written back to memory for internal
-        hardware reasons, per normalization unit.
-      rst: The total number of L2 cache lines written back to memory for internal hardware
-        reasons, per :ref:`normalization unit <normalization-units>`.
-      unit: Cache lines per normalization unit
-    Writeback (vL1D Req):
-      plain: The total number of L2 cache lines written back to memory due to requests
-        initiated by the vL1D cache, per normalization unit.
-      rst: The total number of L2 cache lines written back to memory due to requests initiated
-        by the :doc:`vL1D cache <vector-l1-cache>`, per :ref:`normalization unit
-        <normalization-units>`.
-      unit: Cache lines per normalization unit
-    Evict (Internal):
-      plain: The total number of L2 cache lines evicted from the cache due to capacity
-        limits, per normalization unit.
-      rst: The total number of L2 cache lines evicted from the cache due to capacity limits,
-        per :ref:`normalization unit <normalization-units>`.
-      unit: Cache lines per normalization unit
-    Evict (vL1D Req):
-      plain: The total number of L2 cache lines evicted from the cache due to invalidation
-        requests initiated by the vL1D cache, per normalization unit.
-      rst: The total number of L2 cache lines evicted from the cache due to invalidation
-        requests initiated by the :doc:`vL1D cache <vector-l1-cache>`, per :ref:`normalization
-        unit <normalization-units>`.
-      unit: Cache lines per normalization unit
-    NC Req:
-      plain: The total number of requests to the L2 to Not-hardware-Coherent (NC)
-        memory allocations, per normalization unit.
-      rst: The total number of requests to the L2 to Not-hardware-Coherent (NC) memory
-        allocations, per :ref:`normalization unit <normalization-units>`. See the
-        :ref:`memory-type` for more information.
-      unit: Requests per normalization unit
-    UC Req:
-      plain: The total number of requests to the L2 that go to Uncached (UC) memory
-        allocations.
-      rst: The total number of requests to the L2 that go to Uncached (UC) memory allocations.
-        See the :ref:`memory-type` for more information.
-      unit: Requests per normalization unit
-    CC Req:
-      plain: The total number of requests to the L2 that go to Coherently Cacheable
-        (CC) memory allocations.
-      rst: The total number of requests to the L2 that go to Coherently Cacheable
-        (CC) memory allocations. See the :ref:`memory-type` for more information.
-      unit: Requests per normalization unit
-    RW Req:
-      plain: The total number of requests to the L2 that go to Read-Write coherent
-        memory (RW) allocations.
-      rst: The total number of requests to the L2 that go to Read-Write coherent memory (RW)
-        allocations. See the :ref:`memory-type` for more information.
-      unit: Requests per normalization unit
-    Write - Credit Starvation:
-      plain: The number of cycles the L2-Fabric interface was stalled on write or
-        atomic requests to any memory location because too many write/atomic requests
-        were currently in flight, as a percent of the total active L2 cycles.
-      rst: The number of cycles the L2-Fabric interface was stalled on write or atomic
-        requests to any memory location because too many write/atomic requests were
-        currently in flight, as a percent of the :ref:`total active L2 cycles <total-active-l2-cycles>`.
-      unit: Percent
-    Read (32B):
-      plain: The total number of L2 requests to Infinity Fabric to read 32B of data
-        from any memory location, per normalization unit.
-      rst: The total number of L2 requests to Infinity Fabric to read 32B of data from
-        any memory location, per :ref:`normalization unit <normalization-units>`.
-        See :ref:`l2-request-flow` for more detail. Typically unused on CDNA accelerators.
-      unit: Requests per normalization unit
-    Read (64B):
-      plain: The total number of L2 requests to Infinity Fabric to read 64B of data
-        from any memory location, per normalization unit.
-      rst: The total number of L2 requests to Infinity Fabric to read 64B of data from
-        any memory location, per :ref:`normalization unit <normalization-units>`.
-        See :ref:`l2-request-flow` for more detail.
-      unit: Requests per normalization unit
-    Read (Uncached):
-      plain: The total number of L2 requests to Infinity Fabric to read uncached data
-        from any memory location, per normalization unit. 64B requests for uncached
-        data are counted as two 32B uncached data requests.
-      rst: The total number of L2 requests to Infinity Fabric to read :ref:`uncached
-        data <memory-type>` from any memory location, per :ref:`normalization unit
-        <normalization-units>`. 64B requests for uncached data are counted as two
-        32B uncached data requests. See :ref:`l2-request-flow` for more detail.
-      unit: Requests per normalization unit
-    HBM Read:
-      plain: The total number of L2 requests to Infinity Fabric to read 32B or 64B
-        of data from the accelerator's local HBM, per normalization unit.
-      rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of data
-        from the accelerator's local HBM, per :ref:`normalization unit <normalization-units>`.
-        See :ref:`l2-request-flow` for more detail.
-      unit: Requests per normalization unit
-    Remote Read:
-      plain: The total number of L2 requests to Infinity Fabric to read 32B or 64B
-        of data from any source other than the accelerator's local HBM, per normalization
-        unit.
-      rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of data
-        from any source other than the accelerator's local HBM, per :ref:`normalization
-        unit <normalization-units>`. See :ref:`l2-request-flow` for more detail.
-      unit: Requests per normalization unit
-    Read Bandwidth - PCIe:
-      plain: Total number of bytes due to L2 read requests due to PCIe traffic, divided by total duration.
-      rst: Total number of bytes due to L2 read requests due to PCIe traffic, divided by total duration.
-      unit: Gbps
-    "Read Bandwidth - Infinity Fabric\u2122":
-      plain: Total number of bytes due to L2 read requests due to Infinity Fabric traffic, divided by total duration.
-      rst: Total number of bytes due to L2 read requests due to Infinity Fabric traffic, divided by total duration.
-      unit: Gbps
-    Read Bandwidth - HBM:
-      plain: Total number of bytes due to L2 read requests due to HBM traffic, divided by total duration.
-      rst: Total number of bytes due to L2 read requests due to HBM traffic, divided by total duration.
-      unit: Gbps
-    Write and Atomic (32B):
-      plain: The total number of L2 requests to Infinity Fabric to write or atomically
-        update 32B of data to any memory location, per normalization unit.
-      rst: The total number of L2 requests to Infinity Fabric to write or atomically update
-        32B of data to any memory location, per :ref:`normalization unit <normalization-units>`.
-        See :ref:`l2-request-flow` for more detail.
-      unit: Requests per normalization unit
-    Write and Atomic (Uncached):
-      plain: The total number of L2 requests to Infinity Fabric to write or atomically
-        update 32B or 64B of uncached data, per normalization unit.
-      rst: The total number of L2 requests to Infinity Fabric to write or atomically update
-        32B or 64B of :ref:`uncached data <memory-type>`, per :ref:`normalization
-        unit <normalization-units>`. See :ref:`l2-request-flow` for more detail.
-      unit: Requests per normalization unit
-    Write and Atomic (64B):
-      plain: The total number of L2 requests to Infinity Fabric to write or atomically
-        update 64B of data in any memory location, per normalization unit.
-      rst: The total number of L2 requests to Infinity Fabric to write or atomically update
-        64B of data in any memory location, per :ref:`normalization unit <normalization-units>`.
-        See :ref:`l2-request-flow` for more detail.
-      unit: Requests per normalization unit
-    HBM Write and Atomic:
-      plain: The total number of L2 requests to Infinity Fabric to write or atomically
-        update 32B or 64B of data in the accelerator's local HBM, per normalization
-        unit.
-      rst: The total number of L2 requests to Infinity Fabric to write or atomically update
-        32B or 64B of data in the accelerator's local HBM, per :ref:`normalization
-        unit <normalization-units>`. See :ref:`l2-request-flow` for more detail.
-      unit: Requests per normalization unit
-    Remote Write and Atomic:
-      plain: The total number of L2 requests to Infinity Fabric to write or atomically
-        update 32B or 64B of data in any memory location other than the accelerator's
-        local HBM, per normalization unit.
-      rst: The total number of L2 requests to Infinity Fabric to write or atomically update
-        32B or 64B of data in any memory location other than the accelerator's local
-        HBM, per :ref:`normalization unit <normalization-units>`. See :ref:`l2-request-flow`
-        for more detail.
-      unit: Requests per normalization unit
-    Write Bandwidth - PCIe:
-      plain: Total number of bytes due to L2 write requests due to PCIe traffic, divided by total duration.
-      rst: Total number of bytes due to L2 write requests due to PCIe traffic, divided by total duration.
-      unit: Gbps
-    "Write Bandwidth - Infinity Fabric\u2122":
-      plain: Total number of bytes due to L2 write requests due to Infinity Fabric traffic, divided by total duration.
-      rst: Total number of bytes due to L2 write requests due to Infinity Fabric traffic, divided by total duration.
-      unit: Gbps
-    Write Bandwidth - HBM:
-      plain: Total number of bytes due to L2 write requests due to HBM traffic, divided by total duration.
-      rst: Total number of bytes due to L2 write requests due to HBM traffic, divided by total duration.
-      unit: Gbps
-    Atomic Bandwidth - PCIe:
-      plain: Total number of bytes due to L2 atomic requests due to PCIe traffic, divided by total duration.
-      rst: Total number of bytes due to L2 atomic requests due to PCIe traffic, divided by total duration.
-      unit: Gbps
-    "Atomic Bandwidth - Infinity Fabric\u2122":
-      plain: Total number of bytes due to L2 atomic requests due to Infinity Fabric traffic, divided by total duration.
-      rst: Total number of bytes due to L2 atomic requests due to Infinity Fabric traffic, divided by total duration.
-      unit: Gbps
-    Atomic Bandwidth - HBM:
-      plain: Total number of bytes due to L2 atomic requests due to HBM traffic, divided by total duration.
-      rst: Total number of bytes due to L2 atomic requests due to HBM traffic, divided by total duration.
-      unit: Gbps
-    Atomic:
-      plain: The total number of L2 requests to Infinity Fabric to atomically update
-        32B or 64B of data in any memory location, per normalization unit. See Request
-        flow for more detail. Note that on current CDNA accelerators, such as the
-        MI2XX, requests are only considered atomic by Infinity Fabric if they are
-        targeted at non-write-cacheable memory, such as fine-grained memory allocations
-        or uncached memory allocations on the MI2XX.
-      rst: The total number of L2 requests to Infinity Fabric to atomically update 32B
-        or 64B of data in any memory location, per :ref:`normalization unit <normalization-units>`.
-        See :ref:`l2-request-flow` for more detail. Note that on current CDNA accelerators,
-        such as the :ref:`MI2XX <mixxx-note>`, requests are only considered *atomic*
-        by Infinity Fabric if they are targeted at non-write-cacheable memory, such
-        as :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached
-        memory <memory-type>` allocations on the MI2XX.
-      unit: Requests per normalization unit
-    Read Stall:
-      plain: |-
-        The ratio of the total number of cycles the L2-Fabric interface was
-        stalled on a read request to any destination (local HBM, remote PCIe®
-        connected accelerator or CPU, or remote Infinity Fabric connected accelerator
-        or CPU) over the total active L2 cycles.
-      rst: |-
-        The ratio of the total number of cycles the L2-Fabric interface was stalled
-        on a read request to any destination (local HBM, remote PCIe® connected
-        accelerator or CPU, or remote Infinity Fabric connected accelerator [#inf]_
-        or CPU) over the :ref:`total active L2 cycles <total-active-l2-cycles>`.
-      unit: Percent
-    Write Stall:
-      plain: The ratio of the total number of cycles the L2-Fabric interface was stalled
-        on a write or atomic request to any destination (local HBM, remote accelerator
-        or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected
-        accelerator or CPU) over the total active L2 cycles.
-      rst: The ratio of the total number of cycles the L2-Fabric interface was stalled
-        on a write or atomic request to any destination (local HBM, remote accelerator
-        or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected
-        accelerator [#inf]_ or CPU) over the :ref:`total active L2 cycles <total-active-l2-cycles>`.
-      unit: Percent
-    Read - PCIe Stall:
-      plain: The number of cycles the L2-Fabric interface was stalled on read requests
-        to remote PCIe connected accelerators or CPUs as a percent of the total active
-        L2 cycles.
-      rst: The number of cycles the L2-Fabric interface was stalled on read requests
-        to remote PCIe connected accelerators [#inf]_ or CPUs as a percent of the
-        :ref:`total active L2 cycles <total-active-l2-cycles>`.
-      unit: Percent
-    Read - Infinity Fabric Stall:
-      plain: The number of cycles the L2-Fabric interface was stalled on read requests
-        to remote Infinity Fabric connected accelerators or CPUs as a percent of the
-        total active L2 cycles.
-      rst: The number of cycles the L2-Fabric interface was stalled on read requests
-        to remote Infinity Fabric connected accelerators [#inf]_ or CPUs as a percent
-        of the :ref:`total active L2 cycles <total-active-l2-cycles>`.
-      unit: Percent
-    Read - HBM Stall:
-      plain: The number of cycles the L2-Fabric interface was stalled on read requests
-        to the accelerator's local HBM as a percent of the total active L2 cycles.
-      rst: The number of cycles the L2-Fabric interface was stalled on read requests
-        to the accelerator's local HBM as a percent of the :ref:`total active L2 cycles
-        <total-active-l2-cycles>`.
-      unit: Percent
-    Write - PCIe Stall:
-      plain: The number of cycles the L2-Fabric interface was stalled on write or
-        atomic requests to remote PCIe connected accelerators or CPUs as a percent
-        of the total active L2 cycles.
-      rst: The number of cycles the L2-Fabric interface was stalled on write or atomic
-        requests to remote PCIe connected accelerators [#inf]_ or CPUs as a percent
-        of the :ref:`total active L2 cycles <total-active-l2-cycles>`.
-      unit: Percent
-    Write - Infinity Fabric Stall:
-      plain: The number of cycles the L2-Fabric interface was stalled on write or
-        atomic requests to remote Infinity Fabric connected accelerators or CPUs as
-        a percent of the total active L2 cycles.
-      rst: The number of cycles the L2-Fabric interface was stalled on write or atomic
-        requests to remote Infinity Fabric connected accelerators [#inf]_ or CPUs
-        as a percent of the :ref:`total active L2 cycles <total-active-l2-cycles>`.
-      unit: Percent
-    Write - HBM Stall:
-      plain: The number of cycles the L2-Fabric interface was stalled on write or
-        atomic requests to the accelerator's local HBM as a percent of the total active
-        L2 cycles.
-      rst: The number of cycles the L2-Fabric interface was stalled on write or atomic
-        requests to the accelerator's local HBM as a percent of the :ref:`total active L2 cycles <total-active-l2-cycles>`.
-      unit: Percent
-- id: 1800
-  title: L2 Cache (per Channel)
-  data source:
-  - metric_table:
-      id: 1801
-      title: Aggregate Stats (All channels)
-      header:
-        metric: Metric
-        avg: Avg
-        std dev: Std Dev
-        min: Min
-        max: Max
-        unit: Unit
-      metric:
-        gfx90a:
-          L2 Cache Hit Rate:
-            avg: AVG(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 *
-              TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4]))
-              + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) +
-              (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100
-              * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100
-              * TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100
-              * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100
-              * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) + (100
-              * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100
-              * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100
-              * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0]
-              + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
-              + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
-              + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
-              + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
-              + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
-              + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
-              + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
-              + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
-              + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
-              + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
-              + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
-              + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30]
-              + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0]
-              + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
-              + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
-              + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
-              + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
-              + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
-              + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
-              + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
-              + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
-              + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
-              + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
-              + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
-              + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30]
-              + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None))
-            std dev: STD(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100
-              * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 *
-              TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7]))
-              + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) +
-              (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) +
-              (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) +
-              (100 * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) +
-              (100 * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) +
-              (100 * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) +
-              (100 * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) +
-              (100 * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) /
-              ((((((((((((((((((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1]
-              + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3]))
-              + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6]
-              + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8]))
-              + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11]
-              + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13]))
-              + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) + (TCC_MISS[16]
-              + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + (TCC_MISS[18] + TCC_HIT[18]))
-              + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + TCC_HIT[20])) + (TCC_MISS[21]
-              + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + (TCC_MISS[23] + TCC_HIT[23]))
-              + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + TCC_HIT[25])) + (TCC_MISS[26]
-              + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + (TCC_MISS[28] + TCC_HIT[28]))
-              + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + TCC_HIT[30])) + (TCC_MISS[31]
-              + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
-              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
-              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
-              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
-              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
-              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
-              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))
-              + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + (TCC_MISS[18]
-              + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + TCC_HIT[20]))
-              + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + (TCC_MISS[23]
-              + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + TCC_HIT[25]))
-              + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + (TCC_MISS[28]
-              + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] + TCC_HIT[30]))
-              + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None))
-            min: MIN(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 *
-              TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4]))
-              + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) +
-              (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100
-              * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100
-              * TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100
-              * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100
-              * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) + (100
-              * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100
-              * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100
-              * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0]
-              + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
-              + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
-              + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
-              + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
-              + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
-              + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
-              + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
-              + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
-              + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
-              + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
-              + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
-              + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30]
-              + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0]
-              + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
-              + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
-              + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
-              + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
-              + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
-              + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
-              + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
-              + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
-              + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
-              + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
-              + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
-              + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30]
-              + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None))
-            max: MAX(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 *
-              TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4]))
-              + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) +
-              (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100
-              * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100
-              * TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100
-              * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100
-              * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) + (100
-              * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100
-              * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100
-              * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0]
-              + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
-              + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
-              + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
-              + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
-              + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
-              + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
-              + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
-              + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
-              + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
-              + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
-              + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
-              + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30]
-              + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0]
-              + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
-              + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
-              + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
-              + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
-              + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
-              + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
-              + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
-              + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
-              + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
-              + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
-              + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
-              + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30]
-              + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None))
-            unit: pct
-        gfx941:
-          L2 Cache Hit Rate:
-            avg: AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
-              * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
-              TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
-              + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
-              + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
-              + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
-              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
-              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
-              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
-              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
-              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
-              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
-              if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
-              + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
-              + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
-              + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
-              + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
-              + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
-              + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
-            std dev: STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) +
-              (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100
-              * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 *
-              TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
-              + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
-              + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
-              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
-              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
-              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
-              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
-              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
-              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
-              if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
-              + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
-              + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
-              + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
-              + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
-              + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
-              + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
-            min: MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
-              * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
-              TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
-              + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
-              + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
-              + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
-              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
-              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
-              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
-              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
-              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
-              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
-              if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
-              + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
-              + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
-              + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
-              + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
-              + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
-              + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
-            max: MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
-              * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
-              TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
-              + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
-              + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
-              + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
-              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
-              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
-              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
-              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
-              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
-              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
-              if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
-              + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
-              + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
-              + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
-              + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
-              + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
-              + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
-            unit: pct
-        gfx940:
-          L2 Cache Hit Rate:
-            avg: AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
-              * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
-              TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
-              + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
-              + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
-              + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
-              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
-              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
-              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
-              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
-              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
-              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
-              if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
-              + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
-              + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
-              + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
-              + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
-              + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
-              + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
-            std dev: STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) +
-              (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100
-              * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 *
-              TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
-              + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
-              + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
-              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
-              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
-              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
-              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
-              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
-              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
-              if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
-              + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
-              + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
-              + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
-              + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
-              + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
-              + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
-            min: MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
-              * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
-              TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
-              + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
-              + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
-              + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
-              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
-              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
-              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
-              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
-              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
-              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
-              if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
-              + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
-              + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
-              + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
-              + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
-              + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
-              + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
-            max: MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
-              * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
-              TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
-              + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
-              + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
-              + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
-              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
-              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
-              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
-              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
-              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
-              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
-              if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
-              + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
-              + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
-              + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
-              + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
-              + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
-              + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
-            unit: pct
-        gfx942:
-          L2 Cache Hit Rate:
-            avg: AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
-              * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
-              TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
-              + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
-              + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
-              + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
-              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
-              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
-              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
-              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
-              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
-              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
-              if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
-              + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
-              + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
-              + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
-              + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
-              + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
-              + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
-            std dev: STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) +
-              (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100
-              * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 *
-              TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
-              + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
-              + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
-              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
-              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
-              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
-              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
-              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
-              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
-              if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
-              + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
-              + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
-              + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
-              + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
-              + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
-              + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
-            min: MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
-              * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
-              TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
-              + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
-              + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
-              + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
-              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
-              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
-              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
-              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
-              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
-              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
-              if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
-              + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
-              + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
-              + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
-              + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
-              + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
-              + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
-            max: MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
-              * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
-              TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
-              + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
-              + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
-              + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
-              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
-              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
-              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
-              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
-              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
-              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
-              if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
-              + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
-              + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
-              + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
-              + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
-              + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
-              + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
-            unit: pct
-        gfx950:
-          L2 Cache Hit Rate:
-            avg: AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
-              * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
-              TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
-              + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
-              + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
-              + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
-              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
-              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
-              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
-              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
-              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
-              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
-              if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
-              + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
-              + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
-              + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
-              + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
-              + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
-              + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
-            std dev: STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) +
-              (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100
-              * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 *
-              TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
-              + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
-              + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
-              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
-              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
-              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
-              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
-              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
-              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
-              if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
-              + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
-              + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
-              + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
-              + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
-              + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
-              + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
-            min: MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
-              * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
-              TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
-              + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
-              + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
-              + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
-              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
-              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
-              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
-              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
-              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
-              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
-              if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
-              + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
-              + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
-              + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
-              + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
-              + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
-              + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
-            max: MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100
-              * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 *
-              TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8]))
-              + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11]))
-              + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14]))
-              + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
-              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
-              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
-              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
-              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
-              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
-              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])))
-              if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1]))
-              + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4]
-              + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6]))
-              + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9]
-              + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11]))
-              + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14]
-              + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None)
-            unit: pct
-        gfx908:
-          L2 Cache Hit Rate:
-            avg: AVG(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 *
-              TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4]))
-              + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) +
-              (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100
-              * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100
-              * TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100
-              * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100
-              * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) + (100
-              * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100
-              * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100
-              * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0]
-              + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
-              + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
-              + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
-              + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
-              + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
-              + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
-              + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
-              + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
-              + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
-              + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
-              + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
-              + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30]
-              + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0]
-              + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
-              + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
-              + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
-              + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
-              + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
-              + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
-              + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
-              + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
-              + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
-              + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
-              + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
-              + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30]
-              + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None))
-            std dev: STD(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100
-              * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 *
-              TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7]))
-              + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) +
-              (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) +
-              (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) +
-              (100 * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) +
-              (100 * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) +
-              (100 * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) +
-              (100 * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) +
-              (100 * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) /
-              ((((((((((((((((((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1]
-              + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3]))
-              + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6]
-              + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8]))
-              + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11]
-              + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13]))
-              + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) + (TCC_MISS[16]
-              + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + (TCC_MISS[18] + TCC_HIT[18]))
-              + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + TCC_HIT[20])) + (TCC_MISS[21]
-              + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + (TCC_MISS[23] + TCC_HIT[23]))
-              + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + TCC_HIT[25])) + (TCC_MISS[26]
-              + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + (TCC_MISS[28] + TCC_HIT[28]))
-              + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30] + TCC_HIT[30])) + (TCC_MISS[31]
-              + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] + TCC_HIT[0])
-              + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3]
-              + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5]))
-              + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8]
-              + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10]))
-              + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13]
-              + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))
-              + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + (TCC_MISS[18]
-              + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + TCC_HIT[20]))
-              + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + (TCC_MISS[23]
-              + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + TCC_HIT[25]))
-              + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + (TCC_MISS[28]
-              + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30] + TCC_HIT[30]))
-              + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None))
-            min: MIN(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 *
-              TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4]))
-              + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) +
-              (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100
-              * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100
-              * TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100
-              * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100
-              * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) + (100
-              * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100
-              * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100
-              * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0]
-              + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
-              + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
-              + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
-              + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
-              + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
-              + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
-              + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
-              + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
-              + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
-              + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
-              + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
-              + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30]
-              + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0]
-              + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
-              + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
-              + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
-              + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
-              + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
-              + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
-              + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
-              + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
-              + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
-              + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
-              + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
-              + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30]
-              + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None))
-            max: MAX(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 *
-              TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4]))
-              + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) +
-              (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100
-              * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100
-              * TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100
-              * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100
-              * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) + (100
-              * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100
-              * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100
-              * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0]
-              + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
-              + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
-              + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
-              + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
-              + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
-              + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
-              + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
-              + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
-              + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
-              + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
-              + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
-              + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30]
-              + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0]
-              + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2]))
-              + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5]
-              + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7]))
-              + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10]
-              + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12]))
-              + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15]
-              + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17]))
-              + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20]
-              + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22]))
-              + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25]
-              + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27]))
-              + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30]
-              + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None))
-            unit: pct
-  - metric_table:
-      id: 1802
-      title: L2 Cache Hit Rate (pct)
-      header:
-        metric: Channel
-        expr: Expression
-      metric:
-        gfx90a:
-          ::_1:
-            expr: (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1]
-              + TCC_MISS[::_1]) != 0) else None)
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx941:
-          ::_1:
-            expr: (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1]
-              + TCC_MISS[::_1]) != 0) else None)
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx940:
-          ::_1:
-            expr: (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1]
-              + TCC_MISS[::_1]) != 0) else None)
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx942:
-          ::_1:
-            expr: (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1]
-              + TCC_MISS[::_1]) != 0) else None)
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx950:
-          ::_1:
-            expr: (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1]
-              + TCC_MISS[::_1]) != 0) else None)
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx908:
-          ::_1:
-            expr: (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1]
-              + TCC_MISS[::_1]) != 0) else None)
-          placeholder_range:
-            ::_1: $total_l2_chan
-      cli_style: simple_box
-      tui_style: simple_box
-  - metric_table:
-      id: 1803
-      title: L2 Requests (per normUnit)
-      header:
-        metric: Channel
-        expr: Expression
-      metric:
-        gfx90a:
-          ::_1:
-            expr: (TO_INT(TCC_REQ[::_1]) / $denom)
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx941:
-          ::_1:
-            expr: (TO_INT(TCC_REQ[::_1]) / $denom)
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx940:
-          ::_1:
-            expr: (TO_INT(TCC_REQ[::_1]) / $denom)
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx942:
-          ::_1:
-            expr: (TO_INT(TCC_REQ[::_1]) / $denom)
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx950:
-          ::_1:
-            expr: (TO_INT(TCC_REQ[::_1]) / $denom)
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx908:
-          ::_1:
-            expr: (TO_INT(TCC_REQ[::_1]) / $denom)
-          placeholder_range:
-            ::_1: $total_l2_chan
-      cli_style: simple_box
-      tui_style: simple_box
-  - metric_table:
-      id: 1804
-      title: L2 Requests (per normUnit)
-      header:
-        metric: Channel
-        read req: L2 Read
-        write req: L2 Write
-        atomic req: L2 Atomic
-      metric:
-        gfx90a:
-          ::_1:
-            read req: AVG((TO_INT(TCC_READ[::_1]) / $denom))
-            write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom))
-            atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom))
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx941:
-          ::_1:
-            read req: AVG((TO_INT(TCC_READ[::_1]) / $denom))
-            write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom))
-            atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom))
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx940:
-          ::_1:
-            read req: AVG((TO_INT(TCC_READ[::_1]) / $denom))
-            write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom))
-            atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom))
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx942:
-          ::_1:
-            read req: AVG((TO_INT(TCC_READ[::_1]) / $denom))
-            write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom))
-            atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom))
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx950:
-          ::_1:
-            read req: AVG((TO_INT(TCC_READ[::_1]) / $denom))
-            write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom))
-            atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom))
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx908:
-          ::_1:
-            read req: AVG((TO_INT(TCC_READ[::_1]) / $denom))
-            write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom))
-            atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom))
-          placeholder_range:
-            ::_1: $total_l2_chan
-      cli_style: simple_multiple_bar
-      tui_style: simple_multiple_bar
-  - metric_table:
-      id: 1805
-      title: L2-Fabric Requests (per normUnit)
-      header:
-        metric: Channel
-        read req: L2-Fabric Read
-        write req: L2-Fabric Write and Atomic
-        atomic req: L2-Fabric Atomic
-      metric:
-        gfx90a:
-          ::_1:
-            read req: AVG((TO_INT(TCC_EA_RDREQ[::_1]) / $denom))
-            write req: AVG((TO_INT(TCC_EA_WRREQ[::_1]) / $denom))
-            atomic req: AVG((TO_INT(TCC_EA_ATOMIC[::_1]) / $denom))
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx941:
-          ::_1:
-            read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom))
-            write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom))
-            atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom))
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx940:
-          ::_1:
-            read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom))
-            write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom))
-            atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom))
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx942:
-          ::_1:
-            read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom))
-            write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom))
-            atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom))
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx950:
-          ::_1:
-            read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom))
-            write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom))
-            atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom))
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx908:
-          ::_1:
-            read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom))
-            write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom))
-            atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom))
-          placeholder_range:
-            ::_1: $total_l2_chan
-      cli_style: simple_multiple_bar
-      tui_style: simple_multiple_bar
-  - metric_table:
-      id: 1806
-      title: L2-Fabric Read Latency (Cycles)
-      header:
-        metric: Channel
-        expr: Expression
-      metric:
-        gfx90a:
-          ::_1:
-            expr: ((TCC_EA_RDREQ_LEVEL[::_1] / TCC_EA_RDREQ[::_1]) if (TCC_EA_RDREQ[::_1]
-              != 0) else None)
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx941:
-          ::_1:
-            expr: ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1]
-              != 0) else None)
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx940:
-          ::_1:
-            expr: ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1]
-              != 0) else None)
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx942:
-          ::_1:
-            expr: ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1]
-              != 0) else None)
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx950:
-          ::_1:
-            expr: ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1]
-              != 0) else None)
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx908:
-          ::_1:
-            expr: ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1]
-              != 0) else None)
-          placeholder_range:
-            ::_1: $total_l2_chan
-      cli_style: simple_box
-      tui_style: simple_box
-  - metric_table:
-      id: 1807
-      title: L2-Fabric Write and Atomic Latency (Cycles)
-      header:
-        metric: Channel
-        expr: Expression
-      metric:
-        gfx90a:
-          ::_1:
-            expr: ((TCC_EA_WRREQ_LEVEL[::_1] / TCC_EA_WRREQ[::_1]) if (TCC_EA_WRREQ[::_1]
-              != 0) else None)
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx941:
-          ::_1:
-            expr: ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1]
-              != 0) else None)
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx940:
-          ::_1:
-            expr: ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1]
-              != 0) else None)
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx942:
-          ::_1:
-            expr: ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1]
-              != 0) else None)
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx950:
-          ::_1:
-            expr: ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1]
-              != 0) else None)
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx908:
-          ::_1:
-            expr: ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1]
-              != 0) else None)
-          placeholder_range:
-            ::_1: $total_l2_chan
-      cli_style: simple_box
-      tui_style: simple_box
-  - metric_table:
-      id: 1808
-      title: L2-Fabric Atomic Latency (Cycles)
-      header:
-        metric: Channel
-        expr: Expression
-      metric:
-        gfx90a:
-          ::_1:
-            expr: ((TCC_EA_ATOMIC_LEVEL[::_1] / TCC_EA_ATOMIC[::_1]) if (TCC_EA_ATOMIC[::_1]
-              != 0) else 0)
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx941:
-          ::_1:
-            expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if (TCC_EA0_ATOMIC[::_1]
-              != 0) else 0)
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx940:
-          ::_1:
-            expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if (TCC_EA0_ATOMIC[::_1]
-              != 0) else 0)
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx942:
-          ::_1:
-            expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if (TCC_EA0_ATOMIC[::_1]
-              != 0) else 0)
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx950:
-          ::_1:
-            expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if (TCC_EA0_ATOMIC[::_1]
-              != 0) else 0)
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx908:
-          ::_1:
-            expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if (TCC_EA0_ATOMIC[::_1]
-              != 0) else 0)
-          placeholder_range:
-            ::_1: $total_l2_chan
-      cli_style: simple_box
-      tui_style: simple_box
-  - metric_table:
-      id: 1809
-      title: L2-Fabric Read Stall (Cycles per normUnit)
-      header:
-        metric: Channel
-        ea read stall - pcie: L2-Fabric Read Stall (PCIe)
-        ea read stall - if: "L2-Fabric Read Stall (Infinity Fabric\u2122)"
-        ea read stall - hbm: L2-Fabric Read Stall (HBM)
-      metric:
-        gfx90a:
-          ::_1:
-            ea read stall - pcie: None
-            ea read stall - if: None
-            ea read stall - hbm: None
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx941:
-          ::_1:
-            ea read stall - pcie: None
-            ea read stall - if: None
-            ea read stall - hbm: None
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx940:
-          ::_1:
-            ea read stall - pcie: None
-            ea read stall - if: None
-            ea read stall - hbm: None
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx942:
-          ::_1:
-            ea read stall - pcie: None
-            ea read stall - if: None
-            ea read stall - hbm: None
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx950:
-          ::_1:
-            ea read stall - pcie: AVG((TO_INT(TCC_EA0_RDREQ_IO_CREDIT_STALL[::_1])
-              / $denom))
-            ea read stall - if: AVG((TO_INT(TCC_EA0_RDREQ_GMI_CREDIT_STALL[::_1])
-              / $denom))
-            ea read stall - hbm: AVG((TO_INT(TCC_EA0_RDREQ_DRAM_CREDIT_STALL[::_1])
-              / $denom))
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx908:
-          ::_1:
-            ea read stall - pcie: None
-            ea read stall - if: None
-            ea read stall - hbm: None
-          placeholder_range:
-            ::_1: $total_l2_chan
-      cli_style: simple_multiple_bar
-      tui_style: simple_multiple_bar
-  - metric_table:
-      id: 1810
-      title: L2-Fabric Write and Atomic Stall (Cycles per normUnit)
-      header:
-        metric: Channel
-        ea write stall - pcie: L2-Fabric Write Stall (PCIe)
-        ea write stall - if: "L2-Fabric Write Stall (Infinity Fabric\u2122)"
-        ea write stall - hbm: L2-Fabric Write Stall (HBM)
-        ea write stall - starve: L2-Fabric Write Starve
-      metric:
-        gfx90a:
-          ::_1:
-            ea write stall - pcie: None
-            ea write stall - if: None
-            ea write stall - hbm: None
-            ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1])
-              / $denom))
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx941:
-          ::_1:
-            ea write stall - pcie: None
-            ea write stall - if: None
-            ea write stall - hbm: None
-            ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1])
-              / $denom))
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx940:
-          ::_1:
-            ea write stall - pcie: None
-            ea write stall - if: None
-            ea write stall - hbm: None
-            ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1])
-              / $denom))
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx942:
-          ::_1:
-            ea write stall - pcie: None
-            ea write stall - if: None
-            ea write stall - hbm: None
-            ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1])
-              / $denom))
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx950:
-          ::_1:
-            ea write stall - pcie: AVG((TO_INT(TCC_EA0_WRREQ_IO_CREDIT_STALL[::_1])
-              / $denom))
-            ea write stall - if: AVG((TO_INT(TCC_EA0_WRREQ_GMI_CREDIT_STALL[::_1])
-              / $denom))
-            ea write stall - hbm: AVG((TO_INT(TCC_EA0_WRREQ_DRAM_CREDIT_STALL[::_1])
-              / $denom))
-            ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1])
-              / $denom))
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx908:
-          ::_1:
-            ea write stall - pcie: None
-            ea write stall - if: None
-            ea write stall - hbm: None
-            ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1])
-              / $denom))
-          placeholder_range:
-            ::_1: $total_l2_chan
-      cli_style: simple_multiple_bar
-      tui_style: simple_multiple_bar
-  - metric_table:
-      id: 1812
-      title: L2-Fabric (128B read requests per normUnit)
-      header:
-        metric: Channel
-        expr: Expression
-      metric:
-        gfx90a:
-          ::_1:
-            expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom)
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx941:
-          ::_1:
-            expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom)
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx940:
-          ::_1:
-            expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom)
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx942:
-          ::_1:
-            expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom)
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx950:
-          ::_1:
-            expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom)
-          placeholder_range:
-            ::_1: $total_l2_chan
-        gfx908:
-          ::_1:
-            expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom)
-          placeholder_range:
-            ::_1: $total_l2_chan
-      cli_style: simple_box
-      tui_style: simple_box
-  metrics_description:
-    L2 Cache Hit Rate:
-      plain: The percent of total number of requests to the L2 from all clients that
-        hit in the cache. As noted in the Speed-of-Light section, this includes hit-on-miss
-        requests.
-      rst: The total number of requests to the L2 from all clients that hit in the cache.
-        As noted in the :ref:`Speed-of-Light <l2-sol>` section, this includes hit-on-miss
-        requests.
-      unit: Percent
-- id: 2100
-  title: PC Sampling
-  data source:
-  - pc_sampling_table:
-      id: 2101
-      title: PC Sampling
-      source: ps_file
-      comparable: false
diff --git a/projects/rocprofiler-compute/tools/unified_sets.yaml b/projects/rocprofiler-compute/tools/unified_sets.yaml
deleted file mode 100644
index f94aa22435..0000000000
--- a/projects/rocprofiler-compute/tools/unified_sets.yaml
+++ /dev/null
@@ -1,176 +0,0 @@
----
-# Pre-defined sets containing a collection of relevant metrics that can be collected in a single pass.
-# To profile customized set(s), append to this yaml file.
-
-sets:
-- title: Compute Throughput Utilization
-  set_option: compute_thruput_util
-  description: Placeholder
-  metric:
-    gfx908:
-      - 11.2.2
-      - 11.2.3
-    gfx90a:
-      - 11.2.3
-      - 11.2.4
-      - 11.2.5
-      - 11.2.6
-    gfx940:
-      - 11.2.2
-      - 11.2.3
-      - 11.2.4
-      - 11.2.5
-    gfx941:
-      - 11.2.2
-      - 11.2.3
-      - 11.2.4
-      - 11.2.5
-    gfx942:
-      - 11.2.2
-      - 11.2.3
-      - 11.2.4
-      - 11.2.5
-    gfx950:
-      - 11.2.2
-      - 11.2.3
-      - 11.2.5
-      - 11.2.6
-
-- title: Compute Throughput FLOPS
-  set_option: compute_thruput_flops
-  description: Placeholder
-  metric:
-    gfx908:
-      - 2.1.2
-      - 2.1.3
-      - 2.1.4
-      - 2.1.5
-      - 2.1.6
-    gfx90a:
-      - 2.1.2
-      - 2.1.3
-      - 2.1.4
-      - 2.1.5
-      - 2.1.6
-    gfx940:
-      - 2.1.2
-      - 2.1.3
-      - 2.1.4
-      - 2.1.5
-      - 2.1.6
-      - 2.1.7
-    gfx941:
-      - 2.1.2
-      - 2.1.3
-      - 2.1.4
-      - 2.1.5
-      - 2.1.6
-      - 2.1.7
-    gfx942:
-      - 2.1.2
-      - 2.1.3
-      - 2.1.4
-      - 2.1.5
-      - 2.1.6
-      - 2.1.7
-    gfx950:
-      - 2.1.2
-      - 2.1.3
-      - 2.1.4
-      - 2.1.5
-      - 2.1.6
-      - 2.1.8
-
-- title: Memory Throughput
-  set_option: mem_thruput
-  description: Placeholder
-  metric:
-    gfx908:
-      - 2.1.16
-      - 2.1.17
-      - 16.1.2
-      - 17.1.0
-    gfx90a:
-      - 2.1.16
-      - 2.1.17
-      - 16.1.2
-      - 17.1.0
-    gfx940:
-      - 2.1.17
-      - 2.1.18
-      - 16.1.2
-      - 17.1.0
-    gfx941:
-      - 2.1.17
-      - 2.1.18
-      - 16.1.2
-      - 17.1.0
-    gfx942:
-      - 2.1.17
-      - 2.1.18
-      - 16.1.2
-      - 17.1.0
-    gfx950:
-      - 2.1.18
-      - 2.1.19
-      - 16.1.2
-      - 17.1.0
-
-- title: Launch Stats
-  set_option: launch_stats
-  description: Placeholder
-  metric:
-    gfx908:
-      - 7.1.0
-      - 7.1.1
-      - 7.1.2
-      - 7.1.5
-      - 7.1.6
-      - 7.1.7
-      - 7.1.8
-      - 7.1.9
-    gfx90a:
-      - 7.1.0
-      - 7.1.1
-      - 7.1.2
-      - 7.1.5
-      - 7.1.6
-      - 7.1.7
-      - 7.1.8
-      - 7.1.9
-    gfx940:
-      - 7.1.0
-      - 7.1.1
-      - 7.1.2
-      - 7.1.5
-      - 7.1.6
-      - 7.1.7
-      - 7.1.8
-      - 7.1.9
-    gfx941:
-      - 7.1.0
-      - 7.1.1
-      - 7.1.2
-      - 7.1.5
-      - 7.1.6
-      - 7.1.7
-      - 7.1.8
-      - 7.1.9
-    gfx942:
-      - 7.1.0
-      - 7.1.1
-      - 7.1.2
-      - 7.1.5
-      - 7.1.6
-      - 7.1.7
-      - 7.1.8
-      - 7.1.9
-    gfx950:
-      - 7.1.0
-      - 7.1.1
-      - 7.1.2
-      - 7.1.5
-      - 7.1.6
-      - 7.1.7
-      - 7.1.8
-      - 7.1.9