From 71b9ea6ba06fc40c4098d8d23658e275ae5fe688 Mon Sep 17 00:00:00 2001 From: xuchen-amd Date: Wed, 14 Jan 2026 13:20:27 -0500 Subject: [PATCH] [rocprofiler-compute] improve config management system (#2359) --- .../.pre-commit-config.yaml | 2 +- .../profile_configs/sets/gfx908_sets.yaml | 1 - .../profile_configs/sets/gfx90a_sets.yaml | 1 - .../profile_configs/sets/gfx940_sets.yaml | 1 - .../profile_configs/sets/gfx941_sets.yaml | 1 - .../profile_configs/sets/gfx942_sets.yaml | 1 - .../profile_configs/sets/gfx950_sets.yaml | 1 - .../utils}/.config_hashes.json | 2 +- .../utils}/hash_checker.py | 27 +- .../tests/test_autogen_config.py | 98 +- .../tools/autogen_hash.yaml | 2 - .../tools/config_management/README.md | 726 +- .../config_management/apply_config_deltas.py | 69 +- .../config_management/config_workflow.yaml | 2 +- .../generate_config_deltas.py | 497 +- .../tools/config_management/hash_manager.py | 2 +- .../master_config_workflow_script.py | 1171 +- .../metric_description_manager.py | 22 +- .../parse_config_template.py | 239 +- .../tools/config_management/utils.py | 52 - .../tools/config_management/utils_ruamel.py | 92 + .../verify_against_config_template.py | 440 +- .../rocprofiler-compute/tools/split_config.py | 307 - .../tools/unified_config.yaml | 17736 ---------------- .../tools/unified_sets.yaml | 176 - 25 files changed, 1407 insertions(+), 20261 deletions(-) rename projects/rocprofiler-compute/{tools/config_management => src/utils}/.config_hashes.json (99%) rename projects/rocprofiler-compute/{tools/config_management => src/utils}/hash_checker.py (88%) delete mode 100644 projects/rocprofiler-compute/tools/autogen_hash.yaml delete mode 100644 projects/rocprofiler-compute/tools/config_management/utils.py create mode 100644 projects/rocprofiler-compute/tools/config_management/utils_ruamel.py delete mode 100644 projects/rocprofiler-compute/tools/split_config.py delete mode 100644 projects/rocprofiler-compute/tools/unified_config.yaml delete mode 100644 
projects/rocprofiler-compute/tools/unified_sets.yaml diff --git a/projects/rocprofiler-compute/.pre-commit-config.yaml b/projects/rocprofiler-compute/.pre-commit-config.yaml index 10c643321f..2b36616c96 100644 --- a/projects/rocprofiler-compute/.pre-commit-config.yaml +++ b/projects/rocprofiler-compute/.pre-commit-config.yaml @@ -23,7 +23,7 @@ repos: hooks: - id: hash-check name: Hash consistency check - entry: bash -lc 'cd projects/rocprofiler-compute && python3 tools/config_management/hash_checker.py' + entry: bash -lc 'cd projects/rocprofiler-compute && python3 src/utils/hash_checker.py' language: system pass_filenames: false stages: [pre-commit] diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx908_sets.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx908_sets.yaml index 939d12f04d..88ac2bd087 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx908_sets.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx908_sets.yaml @@ -1,4 +1,3 @@ -# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_sets.yaml. Generated by utils/split_config.py sets: - title: Compute Throughput Utilization set_option: compute_thruput_util diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx90a_sets.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx90a_sets.yaml index 3a970342f2..c67cbd6718 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx90a_sets.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx90a_sets.yaml @@ -1,4 +1,3 @@ -# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_sets.yaml. 
Generated by utils/split_config.py sets: - title: Compute Throughput Utilization set_option: compute_thruput_util diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx940_sets.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx940_sets.yaml index b549f0fede..ab78f316c1 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx940_sets.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx940_sets.yaml @@ -1,4 +1,3 @@ -# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_sets.yaml. Generated by utils/split_config.py sets: - title: Compute Throughput Utilization set_option: compute_thruput_util diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx941_sets.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx941_sets.yaml index b549f0fede..ab78f316c1 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx941_sets.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx941_sets.yaml @@ -1,4 +1,3 @@ -# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_sets.yaml. Generated by utils/split_config.py sets: - title: Compute Throughput Utilization set_option: compute_thruput_util diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx942_sets.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx942_sets.yaml index b549f0fede..ab78f316c1 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx942_sets.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx942_sets.yaml @@ -1,4 +1,3 @@ -# AUTOGENERATED FILE. Only edit for testing purposes, not for development. 
Generated from utils/unified_sets.yaml. Generated by utils/split_config.py sets: - title: Compute Throughput Utilization set_option: compute_thruput_util diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx950_sets.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx950_sets.yaml index f93a0af246..177a8f9a7a 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx950_sets.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/profile_configs/sets/gfx950_sets.yaml @@ -1,4 +1,3 @@ -# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_sets.yaml. Generated by utils/split_config.py sets: - title: Compute Throughput Utilization set_option: compute_thruput_util diff --git a/projects/rocprofiler-compute/tools/config_management/.config_hashes.json b/projects/rocprofiler-compute/src/utils/.config_hashes.json similarity index 99% rename from projects/rocprofiler-compute/tools/config_management/.config_hashes.json rename to projects/rocprofiler-compute/src/utils/.config_hashes.json index 2ad6ac76f2..0b8335ffaf 100644 --- a/projects/rocprofiler-compute/tools/config_management/.config_hashes.json +++ b/projects/rocprofiler-compute/src/utils/.config_hashes.json @@ -139,4 +139,4 @@ } } } -} \ No newline at end of file +} diff --git a/projects/rocprofiler-compute/tools/config_management/hash_checker.py b/projects/rocprofiler-compute/src/utils/hash_checker.py similarity index 88% rename from projects/rocprofiler-compute/tools/config_management/hash_checker.py rename to projects/rocprofiler-compute/src/utils/hash_checker.py index 14c5d17254..78e631f888 100644 --- a/projects/rocprofiler-compute/tools/config_management/hash_checker.py +++ b/projects/rocprofiler-compute/src/utils/hash_checker.py @@ -43,27 +43,16 @@ from pathlib import Path import yaml -try: - from . 
import hash_manager # type: ignore -except Exception: - import importlib.util +PROJECT_ROOT = Path(__file__).resolve().parents[2] # rocprofiler-compute/ +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) - _HERE = Path(__file__).resolve().parent - _SPEC = importlib.util.spec_from_file_location( - "hash_manager", str(_HERE / "hash_manager.py") - ) - hash_manager = importlib.util.module_from_spec(_SPEC) # type: ignore[assignment] - assert _SPEC and _SPEC.loader is not None - _SPEC.loader.exec_module(hash_manager) # type: ignore[attr-defined] -# --------------------------------------------------------------------------- +from tools.config_management import hash_manager # noqa: E402 -# Subproject root: .../projects/rocprofiler-compute -SUBROOT = Path(__file__).resolve().parents[2] - -CONFIGS_ROOT: Path = SUBROOT / "src" / "rocprof_compute_soc" / "analysis_configs" -HASH_FILE: Path = SUBROOT / "tools" / "config_management" / ".config_hashes.json" +CONFIGS_ROOT: Path = PROJECT_ROOT / "src" / "rocprof_compute_soc" / "analysis_configs" +HASH_FILE: Path = PROJECT_ROOT / "src" / "utils" / ".config_hashes.json" TEMPLATE_FILE: Path = ( - SUBROOT / "tools" / "config_management" / "analysis_config_template.yaml" + PROJECT_ROOT / "tools" / "config_management" / "gfx9_config_template.yaml" ) @@ -73,7 +62,7 @@ TEMPLATE_FILE: Path = ( def _latest_arch(template_file: Path) -> str: if not template_file.is_file(): return "" - with open(template_file, "r", encoding="utf-8") as f: + with open(template_file, encoding="utf-8") as f: data = yaml.safe_load(f) or {} return str(data.get("latest_arch") or "") diff --git a/projects/rocprofiler-compute/tests/test_autogen_config.py b/projects/rocprofiler-compute/tests/test_autogen_config.py index b30d83e1f1..81db7498c4 100644 --- a/projects/rocprofiler-compute/tests/test_autogen_config.py +++ b/projects/rocprofiler-compute/tests/test_autogen_config.py @@ -24,25 +24,91 @@ 
############################################################################## import hashlib +import json from pathlib import Path import pytest -import yaml + +PROJECT_ROOT = Path(__file__).resolve().parents[1] +HASH_DB = PROJECT_ROOT / "src/utils/.config_hashes.json" +ANALYSIS_CONFIGS = PROJECT_ROOT / "src/rocprof_compute_soc/analysis_configs" -@pytest.mark.skip( - reason=( - "TODO: Skip this test until we use " - "tools/config_management/.config.hashes.json for testing" +def md5(path: Path) -> str: + return hashlib.md5(path.read_bytes()).hexdigest() + + +def test_config_hashes_match_files() -> None: + assert HASH_DB.exists(), f"Missing hash DB: {HASH_DB}" + assert ANALYSIS_CONFIGS.exists(), ( + f"Missing analysis configs dir: {ANALYSIS_CONFIGS}" ) -) -def test_modification_time(): - # Ensure hash map consistency - hash_path = Path("tools/autogen_hash.yaml") - with open(hash_path) as f: - hash_map = yaml.safe_load(f) - for file, hash in hash_map.items(): - file_hash = hashlib.sha256(Path(file).read_bytes()).hexdigest() - assert file_hash == hash, ( - f"Hash mismatch for {file}: expected {hash}, got {file_hash}" - ) + + with HASH_DB.open() as f: + data = json.load(f) + + assert "archs" in data, "Hash DB missing 'archs' key" + assert isinstance(data["archs"], dict) + + failures = [] + + for arch, arch_data in data["archs"].items(): + arch_dir = ANALYSIS_CONFIGS / arch + if not arch_dir.exists(): + failures.append(f"Arch directory missing: {arch_dir}") + continue + + # ------------------------- + # Panel YAMLs + # ------------------------- + files = arch_data.get("files", {}) + if not isinstance(files, dict): + failures.append(f"'files' for {arch} is not a dict") + continue + + for rel_path, expected_hash in files.items(): + panel_path = arch_dir / rel_path + if not panel_path.exists(): + failures.append(f"Missing panel file: {panel_path}") + continue + + actual_hash = md5(panel_path) + if actual_hash != expected_hash: + failures.append( + f"[{arch}] Panel hash 
mismatch: {panel_path}\n" + f" expected: {expected_hash}\n" + f" actual: {actual_hash}" + ) + + # ------------------------- + # Delta YAML (if any) + # ------------------------- + delta_hash = arch_data.get("delta_hash") + + if delta_hash is not None: + delta_dir = arch_dir / "config_delta" + if not delta_dir.exists(): + failures.append(f"[{arch}] Missing config_delta directory") + continue + + # Exactly one *_diff.yaml should exist + delta_files = list(delta_dir.glob("*_diff.yaml")) + if len(delta_files) != 1: + failures.append( + f"[{arch}] Expected exactly one delta file, found " + f"{len(delta_files)} in {delta_dir}" + ) + continue + + delta_path = delta_files[0] + actual_delta_hash = md5(delta_path) + + if actual_delta_hash != delta_hash: + failures.append( + f"[{arch}] Delta hash mismatch: {delta_path}\n" + f" expected: {delta_hash}\n" + f" actual: {actual_delta_hash}" + ) + + if failures: + pytest.fail("Hash consistency failures:\n\n" + "\n".join(failures)) diff --git a/projects/rocprofiler-compute/tools/autogen_hash.yaml b/projects/rocprofiler-compute/tools/autogen_hash.yaml deleted file mode 100644 index e25b0bb4f9..0000000000 --- a/projects/rocprofiler-compute/tools/autogen_hash.yaml +++ /dev/null @@ -1,2 +0,0 @@ -# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from tools/unified_config.yaml. Generated by tools/split_config.py -{} diff --git a/projects/rocprofiler-compute/tools/config_management/README.md b/projects/rocprofiler-compute/tools/config_management/README.md index d19ecf5e95..f5cbca7247 100644 --- a/projects/rocprofiler-compute/tools/config_management/README.md +++ b/projects/rocprofiler-compute/tools/config_management/README.md @@ -1,500 +1,276 @@ -# Architecture Configuration Workflow +# ROCProfiler-Compute Configuration Management -This document explains the master workflow system for managing architecture-specific metric configurations. 
+This directory contains the authoritative configuration-management system for ROCProfiler-Compute analysis configurations. -## Overview +It is designed to guarantee: -The workflow system manages changes to architecture configurations located in `src/rocprof_compute_soc/analysis_configs/gfx/`. It handles: +- **Structural correctness** across GPU architectures +- **Deterministic deltas** relative to a single latest architecture +- **Byte-level immutability** enforced via hashes +- **Safe promotion** of a new latest architecture with rollback +- **CI enforcement** of all invariants -- **Metric changes** (additions, deletions, modifications) -- **Metric description changes** (plain text + RST documentation) -- **New architecture additions** -- **Template updates** -- **Config delta generation** for version control - -## Files Overview - -### Core Scripts - -1. **`master_config_workflow_script.py`** - Main orchestrator script -2. **`hash_manager.py`** - Tracks file changes via MD5 hashes -3. **`metric_description_manager.py`** - Syncs metric descriptions across files -4. **`config_workflow.yaml`** - Configuration file -5. **`parse_config_template.py`** - Parses base config template from latest arch -6. **`generate_config_deltas.py`** - Generates config deltas between two archs -7. **`apply_config_deltas.py`** - Applies config deltas to genearte new arch configs -8. **`verify_against_config_template.py`** - Validates configs against template - -## Quick Start - -### Initial Setup (not needed following first commit) - -1. Create the hash database: -```bash -python hash_manager.py --compute-all src/rocprof_compute_soc/analysis_configs -``` - -2. Ensure `analysis_config_template.yaml` has metadata: -```yaml -latest_arch: gfx950 -panels: - - file: top_stats.yaml - panel_id: 0 - ... 
-``` - -### Making Changes - -Simply run the master workflow after making any changes: +All workflows are orchestrated by a single sequential driver script: ```bash -python master_config_workflow_script.py +tools/config_management/master_config_workflow_script.py +``` + +## Repository Layout + +```bash +rocprofiler-compute/ +├── src/rocprof_compute_soc/ +│ └── analysis_configs/ +│ ├── gfx908/ +│ │ ├── 0000_top_stats.yaml +│ │ └── config_delta/ +│ │ └── _diff.yaml +│ ├── gfx90a/ +│ ├── gfx940/ +│ ├── gfx950/ # latest_arch +│ └── gfx9_config_template.yaml # single source of truth +│ +├── src/utils/ +│ ├── hash_checker.py +│ ├── .config_hashes.json +│ +└── tools/config_management/ + ├── master_config_workflow_script.py + ├── parse_config_template.py + ├── verify_against_config_template.py + ├── generate_config_deltas.py + ├── apply_config_deltas.py + ├── hash_manager.py + ├── TESTING.md + └── README.md +``` + +## Core Concepts +### Latest Architecture + +- Exactly one architecture is considered *latest* +- Defined in: +```bash +src/rocprof_compute_soc/analysis_configs/gfx9_config_template.yaml +``` + +### Panel YAMLs + +- Live under: +```bash +analysis_configs/<arch>/*.yaml +``` +- Must conform strictly to the template schema +- Are edited in-place using ruamel.yaml round-trip mode + +### Delta YAMLs + +- Represent differences from latest → older architecture +- Live under: +```bash +analysis_configs/<arch>/config_delta/ +``` +- Exactly one delta file per arch +- Always named: +```bash +_diff.yaml +``` + +### Hash Database + +- Stored at: +```bash +src/utils/.config_hashes.json +``` +- Records: + - md5 hashes of panel YAMLs per arch + - md5 hash of the delta YAML (or null for latest) +- Machine-generated only +- Enforced in CI and pytest + +## Architecture Diagram (End-to-End Flow) +```text + ┌──────────────────────────┐ + │ analysis_configs/ │ + │ gfx9_config_template │ + └───────────┬──────────────┘ + │ + ▼ + ┌───────────────────────────────┐ + │ verify_against_config_template│ 
+ │ (structural validation) │ + └───────────┬───────────────────┘ + │ + ┌───────────────────┴───────────────────┐ + │ │ + ▼ ▼ +┌────────────────────┐ ┌──────────────────────┐ +│ edit-existing mode │ │ promotion mode │ +│ (local dev only) │ │ (authoritative path) │ +└─────────┬──────────┘ └──────────┬───────────┘ + │ │ + ▼ ▼ +┌────────────────────┐ ┌─────────────────────────────┐ +│ generate / apply │ │ parse_config_template.py │ +│ deltas manually │ │ (update latest_arch) │ +└────────────────────┘ └──────────┬──────────────────┘ + │ + ▼ + ┌──────────────────────────────────┐ + │ generate_config_deltas.py │ + │ latest → all older arches │ + │ (_diff.yaml only) │ + └──────────┬───────────────────────┘ + │ + ▼ + ┌──────────────────────────────────┐ + │ verify_against_config_template │ + │ (post-promotion validation) │ + └──────────┬───────────────────────┘ + │ + ▼ + ┌──────────────────────────────────┐ + │ hash_manager.py --compute-all │ + │ (new steady state) │ + └──────────┬───────────────────────┘ + │ + ▼ + ┌──────────────────────────────────┐ + │ hash_checker.py │ + │ (semantic consistency) │ + └──────────────────────────────────┘ +``` + +## Contributor Quick Start + +> [!NOTE] +> **Required Python Dependency** +> This configuration management system requires the `ruamel.yaml` Python package. +> It is used to safely modify YAML files while preserving comments, ordering, +> and formatting. The workflow scripts will not function correctly without it. +> +> Install it via: +> ```bash +> pip install ruamel.yaml +> ``` + +### 1. Validate the current state + +Before making **any** config changes: +```bash +python tools/config_management/master_config_workflow_script.py --validate-only +``` + +This must pass. + +### 2. 
Editing an existing architecture (most common) + +Edit panel YAMLs **directly** under: +```bash +src/rocprof_compute_soc/analysis_configs// +``` + +Rules: + +- Preserve structure +- Preserve ordering +- Use multiline `>-` formatting for metric descriptions +- Do **not** regenerate entire files + +After editing: +```bash +python tools/config_management/master_config_workflow_script.py --validate-only +``` + +### 3. Generating or applying deltas (advanced / optional) + +For local experimentation only: +```bash +python tools/config_management/master_config_workflow_script.py --edit-existing +``` + +This mode: + +- never updates the template +- never updates hashes +- always re-validates after application + +### 4. Promoting a new latest architecture (rare, gated) + +Promotion changes **global invariants** and must use the master script: +```bash +python tools/config_management/master_config_workflow_script.py --promote ``` The script will: -- Detect what changed -- Prompt you for confirmation -- Apply changes -- Validate results -- Update all necessary files -### Dry Run Mode +1. Update `latest_arch` in the template +2. Regenerate deltas for all older arches +3. Remove stale delta files +4. Re-validate everything +5. Rebuild the hash database +6. Verify semantic consistency -To see what would happen without making changes: +If anything fails: +- all changes are rolled back +- no partial state remains + +### 5. Hash checks (fast local / CI) ```bash -python master_config_workflow_script.py --dry-run +python tools/config_management/master_config_workflow_script.py --hash-only ``` -## Usage Scenarios - -### Scenario A: Add Metrics to Latest Arch (gfx950) - -**Method 1: Direct Edit** - -1. Edit `src/rocprof_compute_soc/analysis_configs/gfx950/0700_wavefront.yaml` -2. Add your metric to the appropriate table -3. Add description to `metrics_description` section -4. Run: `python master_config_workflow_script.py` -5. Answer prompts - -**Method 2: Using Delta** - -1. 
Create `src/rocprof_compute_soc/analysis_configs/gfx950/config_delta/gfx955_diff.yaml`: -```yaml -Addition: - - Panel Config: - id: 700 - title: Wavefront - metric_tables: - - metric_table: - id: 701 - title: Wavefront Launch Stats - metrics: - - New Metric: - avg: AVG(something) - unit: Units - metric_descriptions: - New Metric: - plain: Description text - rst: >- # Optional - Description with :ref:`RST markup ` - -Deletion: - [] - -Modification: - [] -``` - -2. Run: `python master_config_workflow_script.py` - -**What Happens:** -- Changes applied to gfx950 -- Template updated -- Deltas regenerated for all previous archs (gfx940, gfx941, etc.) -- Metric descriptions synced to: - - `tools/per_arch_metric_definitions/gfx950_metrics_description.yaml` - - `docs/data/metrics_description.yaml` -- All archs validated -- Hashes updated - -### Scenario B: Modify Metrics in Older Arch (gfx940) - -**Method 1: Direct Edit** - -1. Edit `src/rocprof_compute_soc/analysis_configs/gfx940/0700_wavefront.yaml` -2. Make your changes -3. Run: `python master_config_workflow_script.py` - -**Method 2: Using Delta** - -1. Create `src/rocprof_compute_soc/analysis_configs/gfx940/config_delta/gfx950_diff.yaml` -2. Run: `python master_config_workflow_script.py` - -**What Happens:** -- Changes applied to gfx940 only -- Validated against template (must still match structure) -- Metric descriptions synced to `tools/per_arch_metric_definitions/gfx940_metrics_description.yaml` -- Hashes updated for gfx940 only - -### Scenario C: Add New Architecture (gfx955) - -**Method 1: Create Directory with YAMLs** - -1. Create `src/rocprof_compute_soc/analysis_configs/gfx955/` -2. Copy/create YAML files -3. Run: `python master_config_workflow_script.py` -4. Confirm this is the new latest arch - -**Method 2: Using Delta from Latest** - -1. Create delta showing differences from gfx950 -2. Place in `src/rocprof_compute_soc/analysis_configs/gfx955/config_delta/gfx955_diff.yaml` -3. 
Run: `python master_config_workflow_script.py` -4. Confirm this is the new latest arch - -**What Happens:** -- gfx955 becomes new latest arch -- Template updated with gfx955 as source -- Deltas generated: gfx955 → gfx950, gfx955 → gfx940, etc. -- All archs validated -- Metric descriptions synced -- Hashes updated - -### Scenario D: Update Metric Descriptions Only - -1. Edit description in config YAML: -```yaml -metrics_description: - Grid Size: "Updated description text" -``` - -2. Run: `python master_config_workflow_script.py` - -**What Happens:** -- Same workflow as metric changes -- Plain text stored in config YAMLs -- RST version generated and stored in docs/tools files - -## Delta YAML Structure - -### Complete Example - -```yaml -Addition: - - Panel Config: - id: 1100 - title: Compute Units - Compute Pipeline - metric_tables: - - metric_table: - id: 1103 - title: Arithmetic Operations - metrics: - - F8 OPs: - avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) - min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) - max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) - unit: (OPs + $normUnit) - metric_descriptions: - F8 OPs: - plain: Number of 8-bit floating point operations - rst: |- - Number of 8-bit floating point operations per :ref:`normalization unit `" - -Deletion: - - Panel Config: - id: 1100 - title: Compute Units - Compute Pipeline - metric_tables: - - metric_table: - id: 1103 - title: Arithmetic Operations - metrics: - - Old Metric: - avg: AVG(something) - metric_descriptions: - Old Metric: - plain: "Old description" - -Modification: - - Panel Config: - id: 1100 - title: Compute Units - Compute Pipeline - metric_tables: - - metric_table: - id: 1103 - title: Arithmetic Operations - metrics: - - Existing Metric: - avg: AVG(new_formula) # Changed field only - metric_descriptions: - Existing Metric: - plain: Updated description - rst: >- - Updated description with **RST**" -``` - -### Rules for Deltas - -1. 
**Must have all three sections**: Addition, Deletion, Modification (can be empty lists) -2. **Metric descriptions**: - - `plain` field is required - - `rst` field is optional (defaults to copy of plain) -3. **Delta filename**: Must be `_diff.yaml` -4. **Location**: `src/rocprof_compute_soc/analysis_configs/gfx/config_delta/` - -## Standalone Tool Usage - -### Hash Manager - +or: ```bash -# Compute hashes for all archs -python hash_manager.py --compute-all src/rocprof_compute_soc/analysis_configs - -# Detect changes -python hash_manager.py --detect-changes src/rocprof_compute_soc/analysis_configs - -# Update hashes for specific arch -python hash_manager.py --update gfx950 src/rocprof_compute_soc/analysis_configs +python tools/config_management/master_config_workflow_script.py --ci ``` -### Metric Description Manager +This runs semantic hash validation only. +## Automated Testing +### Pytest Hash Integrity Test + +Located at: ```bash -# Sync descriptions for specific arch -python metric_description_manager.py --sync-arch gfx950 src/rocprof_compute_soc/analysis_configs --latest-arch gfx950 - -# Sync all archs -python metric_description_manager.py --sync-all src/rocprof_compute_soc/analysis_configs --latest-arch gfx950 - -# Validate descriptions -python metric_description_manager.py --validate gfx950 src/rocprof_compute_soc/analysis_configs +tests/test_autogen_config.py ``` -### Parse Config Template +This test: +- parses `.config_hashes.json` +- verifies **byte-for-byte** integrity of: + - panel YAMLs + - delta YAMLs +- fails on: + - missing files + - changed content + - stale hash DB +Semantic correctness is enforced separately by `hash_checker.py`. 
+ +## Contributor Rules (Strict) + +- Do **not** edit `.config_hashes.json` manually +- Do **not** create multiple delta files per arch +- Do **not** rename delta files arbitrarily +- Do **not** regenerate full YAMLs unnecessarily +- Use in-place edits (ruamel round-trip) +- Use the master script for promotions +- Expect CI to reject inconsistent states + +## Summary + +This system guarantees: + +- A **single source of truth** for latest architecture +- Deterministic, reviewable deltas +- Stable diffs for Git review +- Hash-backed immutability +- Safe, transactional promotions +- CI-enforced correctness + +All correctness flows through: ```bash -# Generate template with metadata -python parse_config_template.py src/rocprof_compute_soc/analysis_configs/gfx950 \ - tools/config_management/analysis_config_template.yaml \ - --latest-arch gfx950 -``` - -### Generate Delta - -```bash -# Generate delta from current arch to previous arch -python generate_config_deltas.py \ - src/rocprof_compute_soc/analysis_configs/gfx950 \ - src/rocprof_compute_soc/analysis_configs/gfx940 -``` - -### Apply Delta - -```bash -# Apply delta to base arch -python apply_config_deltas.py \ - src/rocprof_compute_soc/analysis_configs/gfx940 \ - src/rocprof_compute_soc/analysis_configs/gfx940/config_delta/gfx950_diff.yaml \ - output_dir -``` - -### Verify Against Template - -```bash -# Validate all archs -python verify_against_config_template.py \ - src/rocprof_compute_soc/analysis_configs \ - tools/config_management/analysis_config_template.yaml -``` - -## File Structure - -``` -. 
-├── src/rocprof_compute_soc/analysis_configs/ -│ ├── gfx940/ -│ │ ├── 0700_wavefront.yaml # Config with plain descriptions -│ │ └── config_delta/ -│ │ └── gfx950_diff.yaml # Delta to apply changes -│ ├── gfx941/ -│ └── gfx950/ # Latest arch -│ ├── 0700_wavefront.yaml -│ └── config_delta/ -│ └── gfx950_diff.yaml # Optional delta for modifications -│ -├── tools/ -│ ├── config_management/ -│ │ ├── .config_hashes.json # Hash database (auto-generated) -│ │ ├── analysis_config_template.yaml # Template with metadata -│ │ ├── hash_manager.py -│ │ ├── metric_description_manager.py -│ │ ├── parse_config_template.py -│ │ ├── generate_config_deltas.py -│ │ ├── apply_config_deltas.py -│ │ ├── verify_against_config_template.py -│ │ ├── master_config_workflow_script.py -│ │ └── config_workflow.yaml -│ │ -│ └── per_arch_metric_definitions/ -│ ├── gfx940_metrics_description.yaml # RST only -│ ├── gfx941_metrics_description.yaml -│ └── gfx950_metrics_description.yaml -│ -├── docs/data/ -│ └── metrics_description.yaml # RST only, latest arch only -│ -└── .backups/ # Auto-generated backups - └── 20250115_143022/ # Timestamped backup -``` - -## Configuration - -Edit `config_workflow.yaml` to customize paths and behavior: - -```yaml -paths: - template: tools/config_management/analysis_config_template.yaml - configs_root: src/rocprof_compute_soc/analysis_configs - backups: .backups - hashes: tools/config_management/.config_hashes.json - per_arch_metrics: tools/per_arch_metric_definitions - docs_metrics: docs/data/metrics_description.yaml - -validation: - strict_mode: true # Fail on warnings - verify_after_changes: true # Validate after operations - -behavior: - require_confirmation: true # Prompt before changes -``` - -## Error Handling - -### Validation Failures - -If validation fails: -1. All changes are automatically reverted -2. Backup is restored -3. Detailed error report is printed -4. 
Fix the issue and run again - -### Hash Mismatches - -If hashes are out of sync: -```bash -# Recompute all hashes -python hash_manager.py --compute-all src/rocprof_compute_soc/analysis_configs -``` - -### Description Validation Errors - -Common issues: -- **Missing descriptions**: Warning only (won't fail) -- **Invalid RST syntax**: Error (will fail and revert) -- **Missing plain text**: Error (plain is required) - -## Best Practices - -1. **Always use master_config_workflow_script.py** - Don't run individual scripts manually unless debugging -2. **Test with --dry-run first** - See what will happen before committing -3. **Use deltas for complex changes** - Easier to review and version control -4. **Keep descriptions updated** - Plain text in configs, RST in docs -5. **One change at a time** - If multiple archs need updates, do them sequentially -6. **Check validation output** - Review warnings even if they don't fail - -## Troubleshooting - -### "No changes detected" - -- Check that files were actually modified -- Ensure you're in the correct directory -- Verify hash database exists: `tools/config_management/.config_hashes.json` - -### "Validation failed" - -- Review the error output carefully -- Check that new metrics match template structure -- Ensure panel IDs are correct -- Verify data source ordering - -### "Failed to sync metric descriptions" - -- Check RST syntax in descriptions -- Ensure all metrics have descriptions -- Verify section_panel_map includes your table ID - -### Changes not detected after manual edit - -```bash -# Force recompute hashes -python hash_manager.py --compute-all src/rocprof_compute_soc/analysis_configs - -# Then run workflow -python master_config_workflow_script.py -``` - -## Development Notes - -### Adding New Architecture Support - -When adding a completely new architecture line: - -1. Ensure table IDs are in `metric_description_manager.py`'s `SECTION_PANEL_MAP` -2. Follow existing naming conventions (gfxXXX) -3. 
Create complete YAML set (don't start with partial configs) - -### Modifying the Workflow - -If you need to modify the workflow behavior: - -1. Edit `config_workflow.yaml` for path/behavior changes -2. Edit `master_config_workflow_script.py` for workflow logic changes -3. Test with `--dry-run` extensively -4. Update this README - - -# Pre-commit: Hash Consistency Check - -We ship a lightweight pre-commit hook that catches inconsistent hash updates across config YAMLs and deltas. - -## What it enforces (per arch) - -* Latest panels changed → latest delta must change (if there are older archs). -* Latest delta changed → latest panels must change or a new arch must be added. -* Older arch panels changed → that arch’s delta must change. -* Older arch delta changed → either latest panels or that arch’s panels must have changed. - -## Setup - -Install and enable pre-commit: - -```bash -pip install pre-commit -pre-commit install -``` - -Our .pre-commit-config.yaml includes a local hook that runs the checker. - -```yaml -- repo: local - hooks: - - id: hash-check - name: Hash consistency check - entry: bash -lc 'cd projects/rocprofiler-compute && python3 tools/config_management/hash_checker.py' - language: system - pass_filenames: false - stages: [pre-commit] -``` - -## Run manually - -```bash -# from super-repo root -pre-commit run --all-files - -# or directly in the subproject -cd projects/rocprofiler-compute -python3 tools/config_management/hash_checker.py +master_config_workflow_script.py ``` diff --git a/projects/rocprofiler-compute/tools/config_management/apply_config_deltas.py b/projects/rocprofiler-compute/tools/config_management/apply_config_deltas.py index f96fba1650..d9df6672a5 100644 --- a/projects/rocprofiler-compute/tools/config_management/apply_config_deltas.py +++ b/projects/rocprofiler-compute/tools/config_management/apply_config_deltas.py @@ -36,24 +36,14 @@ import sys from pathlib import Path from typing import Any, Optional, Union -try: - from . 
import utils as cm_utils -except Exception: - repo_root = Path(__file__).resolve().parents[1] - if str(repo_root) not in sys.path: - sys.path.insert(0, str(repo_root)) - try: - import config_management.utils as cm_utils # type: ignore - except Exception: - import utils as cm_utils # type: ignore +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) -AUTOGEN_TEXT = ( - "# AUTOGENERATED FILE. Only edit for testing purposes, not for development. " - "Generated by tools/config_management/apply_config_deltas.py\n" -) +from config_management import utils_ruamel as cm_utils # noqa: E402 -def find_table_in_config(config: dict, table_id: Any) -> Optional[dict]: +def find_table(config: dict, table_id: Any) -> Optional[dict]: """Find and return the table with given id, or None.""" for item in config.get("Panel Config", {}).get("data source", []): table = item.get("metric_table") @@ -72,8 +62,8 @@ def add_table(config: dict, metric_table: dict) -> None: def add_metrics(config: dict, table_id: Any, metrics: list[dict]) -> None: """Add metrics to existing table.""" - table = find_table_in_config(config, table_id) - if not table: + table = find_table(config, table_id) + if table is None: print(f"WARNING: Table {table_id} not found for metric addition") return @@ -90,7 +80,7 @@ def delete_table(config: dict, table_id: Any) -> None: for idx, item in enumerate(list(data_source)): table = item.get("metric_table") if isinstance(table, dict) and table.get("id") == table_id: - data_source.pop(idx) + del data_source[idx] print(f"Deleted table: {table_id}") return print(f"WARNING: Table {table_id} not found for deletion") @@ -98,8 +88,8 @@ def delete_table(config: dict, table_id: Any) -> None: def delete_metrics(config: dict, table_id: Any, metrics: list[dict]) -> None: """Remove specific metrics from table.""" - table = find_table_in_config(config, table_id) - if not table or "metric" not in table: + table = 
find_table(config, table_id) + if table is None or "metric" not in table: print(f"WARNING: Table {table_id} not found or has no metrics") return @@ -112,8 +102,8 @@ def delete_metrics(config: dict, table_id: Any, metrics: list[dict]) -> None: def modify_metrics(config: dict, table_id: Any, metrics: list[dict]) -> None: """Modify specific fields in existing metrics.""" - table = find_table_in_config(config, table_id) - if not table or "metric" not in table: + table = find_table(config, table_id) + if table is None or "metric" not in table: print(f"WARNING: Table {table_id} not found or has no metrics") return @@ -129,19 +119,17 @@ def modify_metrics(config: dict, table_id: Any, metrics: list[dict]) -> None: def add_descriptions(config: dict, descriptions: dict) -> None: """Add metric descriptions to config.""" - pc = config.setdefault("Panel Config", {}) - pc.setdefault("metrics_description", {}) - md = pc["metrics_description"] + md = config["Panel Config"].setdefault("metrics_description", {}) for metric_name, desc_data in descriptions.items(): - value = desc_data if isinstance(desc_data, dict) else desc_data - md[metric_name] = value + md[metric_name] = dict(desc_data) if isinstance(desc_data, dict) else desc_data + print(f"Added description: {metric_name}") def delete_descriptions(config: dict, descriptions: dict) -> None: """Remove metric descriptions from config.""" - md = config.get("Panel Config", {}).get("metrics_description", {}) + md = config["Panel Config"].setdefault("metrics_description", {}) for metric_name in descriptions.keys(): if metric_name in md: del md[metric_name] @@ -150,21 +138,25 @@ def delete_descriptions(config: dict, descriptions: dict) -> None: def modify_descriptions(config: dict, descriptions: dict) -> None: """Modify metric descriptions in config.""" - pc = config.setdefault("Panel Config", {}) - pc.setdefault("metrics_description", {}) - md = pc["metrics_description"] + md = config["Panel Config"].setdefault("metrics_description", 
{}) for metric_name, desc_data in descriptions.items(): - value = desc_data if isinstance(desc_data, dict) else desc_data - md[metric_name] = value + if isinstance(desc_data, dict): + new_dict = {} + for k, v in desc_data.items(): + new_dict[k] = v + md[metric_name] = new_dict + else: + md[metric_name] = desc_data + print(f"Added description: {metric_name}") def apply_changes(config: dict, changes: list[dict], category: str) -> None: """Apply delta changes to configuration.""" for change in changes: - for mt_wrapper in change.get("metric_tables", []): - mt = mt_wrapper.get("metric_table", mt_wrapper) + mt = change.get("metric_table") + if mt: table_id = mt.get("id") if category == "Addition": @@ -199,7 +191,7 @@ def apply_delta( output_dir: Union[str, Path], ) -> None: """Apply delta YAML to all files in base directory.""" - delta = cm_utils.load_yaml(delta_file) + delta = cm_utils.load_yaml(delta_file, round_trip=True) output_path = Path(output_dir) output_path.mkdir(parents=True, exist_ok=True) @@ -214,7 +206,7 @@ def apply_delta( base_path = Path(base_dir) for yaml_file in base_path.glob("*.yaml"): - config = cm_utils.load_yaml(yaml_file) + config = cm_utils.load_yaml(yaml_file, round_trip=True) panel_id = config.get("Panel Config", {}).get("id") if panel_id in changes_by_panel: @@ -226,7 +218,8 @@ def apply_delta( config, changes_by_panel[panel_id][category], category ) - cm_utils.save_yaml(config, output_path / yaml_file.name, AUTOGEN_TEXT) + cm_utils.strip_existing_header(config) + cm_utils.save_yaml(config, output_path / yaml_file.name) print(f"Saved: {yaml_file.name}") else: shutil.copy(yaml_file, output_path / yaml_file.name) diff --git a/projects/rocprofiler-compute/tools/config_management/config_workflow.yaml b/projects/rocprofiler-compute/tools/config_management/config_workflow.yaml index b6649e540d..760a4ef370 100644 --- a/projects/rocprofiler-compute/tools/config_management/config_workflow.yaml +++ 
b/projects/rocprofiler-compute/tools/config_management/config_workflow.yaml @@ -11,7 +11,7 @@ paths: backups: .backups # Hash database file - hashes: tools/config_management/.config_hashes.json + hashes: src/utils/.config_hashes.json # Per-arch metric definitions output per_arch_metrics: tools/per_arch_metric_definitions diff --git a/projects/rocprofiler-compute/tools/config_management/generate_config_deltas.py b/projects/rocprofiler-compute/tools/config_management/generate_config_deltas.py index 7fcaccc044..112aa9bbd5 100644 --- a/projects/rocprofiler-compute/tools/config_management/generate_config_deltas.py +++ b/projects/rocprofiler-compute/tools/config_management/generate_config_deltas.py @@ -34,326 +34,265 @@ from __future__ import annotations import sys from pathlib import Path +from typing import Any, Optional -try: - from . import utils as cm_utils -except Exception: - repo_root = Path(__file__).resolve().parents[1] - if str(repo_root) not in sys.path: - sys.path.insert(0, str(repo_root)) - try: - import config_management.utils as cm_utils # type: ignore - except Exception: - import utils as cm_utils # type: ignore +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) -AUTOGEN_TEXT = ( - "# AUTOGENERATED FILE. Only edit for testing purposes, not for development. 
" - "Generated by tools/config_management/generate_config_deltas.py\n" -) +from config_management import utils_ruamel as cm_utils # noqa: E402 +from ruamel.yaml.comments import CommentedMap # noqa: E402 -def get_metric_tables(data: dict) -> list[dict]: - """Extract all metric tables from data source.""" - tables: list[dict] = [] - for item in data.get("Panel Config", {}).get("data source", []): - mt = item.get("metric_table") - if isinstance(mt, dict): - tables.append(mt) - return tables +def load_yaml_roundtrip(path: Path) -> Any: + return cm_utils.load_yaml(path, round_trip=True) -def get_metric_descriptions(data: dict) -> dict: - """Extract metric descriptions from panel config.""" - return data.get("Panel Config", {}).get("metrics_description", {}) or {} +def diff_metric_fields(base_fields, new_fields) -> Optional[CommentedMap]: + out = CommentedMap() + + for key in new_fields: + if key not in base_fields or base_fields[key] != new_fields[key]: + # Preserve the original value with comments + out[key] = new_fields[key] + + return out if out else None -def compare_metrics( - prev_metrics: dict, curr_metrics: dict +def descriptions_equal(base_desc, new_desc) -> bool: + """Check if two descriptions are equal by comparing their string representation.""" + return str(base_desc) == str(new_desc) + + +def diff_metric_table( + base_table, new_table ) -> tuple[list[dict], list[dict], list[dict]]: - """Compare metrics and return (additions, deletions, modifications).""" - prev_keys = set(prev_metrics.keys()) - curr_keys = set(curr_metrics.keys()) - - additions = [{name: curr_metrics[name]} for name in sorted(curr_keys - prev_keys)] - deletions = [{name: prev_metrics[name]} for name in sorted(prev_keys - curr_keys)] - - modifications: list[dict] = [] - for name in sorted(prev_keys & curr_keys): - if prev_metrics[name] != curr_metrics[name]: - all_fields = set(prev_metrics[name].keys()) | set(curr_metrics[name].keys()) - modified_fields = { - field: 
curr_metrics[name].get(field) - for field in all_fields - if prev_metrics[name].get(field) != curr_metrics[name].get(field) - } - if modified_fields: - modifications.append({name: modified_fields}) - - return additions, deletions, modifications - - -def compare_descriptions( - prev_descriptions: dict, curr_descriptions: dict -) -> tuple[dict, dict, dict]: """ - Compare metric descriptions and return (additions, deletions, modifications). - Values are dicts with 'plain' and 'rst'. + Returns (additions, modifications, deletions) tuple. """ - prev_keys = set(prev_descriptions.keys()) - curr_keys = set(curr_descriptions.keys()) + addition_metrics: list[dict] = [] + modification_metrics: list[dict] = [] + deletion_metrics: list[dict] = [] - additions: dict = {} - deletions: dict = {} - modifications: dict = {} + base_metrics = base_table.get("metric", {}) + new_metrics = new_table.get("metric", {}) - for name in sorted(curr_keys - prev_keys): - desc = curr_descriptions[name] - additions[name] = ( - desc if isinstance(desc, dict) else {"plain": desc, "rst": desc} - ) + # Metrics deleted + for metric in base_metrics: + if metric not in new_metrics: + deletion_metrics.append({metric: None}) - for name in sorted(prev_keys - curr_keys): - desc = prev_descriptions[name] - deletions[name] = ( - desc if isinstance(desc, dict) else {"plain": desc, "rst": desc} - ) + # Metrics added or modified + for metric in new_metrics: + if metric not in base_metrics: + # Entire metric is new - preserve original with comments + addition_metrics.append({metric: new_metrics[metric]}) + else: + # Field-level diff + changes = diff_metric_fields(base_metrics[metric], new_metrics[metric]) + if changes: + modification_metrics.append({metric: changes}) - for name in sorted(prev_keys & curr_keys): - prev_desc = prev_descriptions[name] - curr_desc = curr_descriptions[name] - - prev_plain = ( - prev_desc if isinstance(prev_desc, str) else prev_desc.get("plain", "") - ) - curr_plain = ( - curr_desc if 
isinstance(curr_desc, str) else curr_desc.get("plain", "") - ) - - prev_rst = ( - prev_desc - if isinstance(prev_desc, str) - else prev_desc.get("rst", prev_plain) - ) - curr_rst = ( - curr_desc - if isinstance(curr_desc, str) - else curr_desc.get("rst", curr_plain) - ) - - if prev_plain != curr_plain or prev_rst != curr_rst: - modifications[name] = {"plain": curr_plain, "rst": curr_rst} - - return additions, deletions, modifications + return addition_metrics, modification_metrics, deletion_metrics -def compare_tables( - prev_tables: list[dict], curr_tables: list[dict] -) -> tuple[list[dict], list[dict], list[dict]]: - """Compare tables and return (additions, deletions, modifications).""" - prev_dict = {t["id"]: t for t in prev_tables} - curr_dict = {t["id"]: t for t in curr_tables} +def diff_descriptions( + base_md, new_md +) -> tuple[Optional[CommentedMap], Optional[CommentedMap], Optional[CommentedMap]]: + """ + Returns (additions, modifications, deletions) tuple. + """ + additions = CommentedMap() + modifications = CommentedMap() + deletions = CommentedMap() - prev_ids = set(prev_dict.keys()) - curr_ids = set(curr_dict.keys()) + # Deletions + for key in base_md: + if key not in new_md: + deletions[key] = None - additions: list[dict] = [] - deletions: list[dict] = [] - modifications: list[dict] = [] + # Additions and modifications + for key in new_md: + if key not in base_md: + # New description - preserve original node + additions[key] = new_md[key] + else: + # Check if modified + if not descriptions_equal(base_md[key], new_md[key]): + # Preserve original node to maintain style + modifications[key] = new_md[key] - additions.extend(curr_dict[tid] for tid in sorted(curr_ids - prev_ids)) - deletions.extend(prev_dict[tid] for tid in sorted(prev_ids - curr_ids)) + return ( + additions if additions else None, + modifications if modifications else None, + deletions if deletions else None, + ) - for tid in sorted(prev_ids & curr_ids): - prev_metrics = 
prev_dict[tid].get("metric", {}) or {} - curr_metrics = curr_dict[tid].get("metric", {}) or {} - metric_adds, metric_dels, metric_mods = compare_metrics( - prev_metrics, curr_metrics - ) +def extract_metric_tables(data_sources) -> list[Any]: + out = [] + for ds in data_sources: + if "metric_table" in ds: + mt = ds["metric_table"] + table_id = mt.get("id") + if table_id is not None: + out.append((table_id, mt)) + return out - if metric_adds: - additions.append({ - "id": tid, - "title": curr_dict[tid].get("title"), - "metrics": metric_adds, - }) - if metric_dels: - deletions.append({ - "id": tid, - "title": prev_dict[tid].get("title"), - "metrics": metric_dels, - }) - if metric_mods: - modifications.append({ - "id": tid, - "title": curr_dict[tid].get("title"), - "metrics": metric_mods, + +def diff_panel(base_config, new_config) -> Optional[dict[str, list[Any]]]: + """ + Produce delta for a single panel. + Returns dicts under keys: + 'Addition', 'Deletion', 'Modification' + or None if no diffs. 
+ """ + out = {"Addition": [], "Deletion": [], "Modification": []} + panel_id = base_config["Panel Config"]["id"] + + # Table-level diffs + base_tables = extract_metric_tables( + base_config["Panel Config"].get("data source", []) + ) + new_tables = extract_metric_tables( + new_config["Panel Config"].get("data source", []) + ) + + # Indexing by table ID to preserve order + base_by_id = {tid: table for (tid, table) in base_tables} + new_by_id = {tid: table for (tid, table) in new_tables} + + # Table deletions + for tid in base_by_id: + if tid not in new_by_id: + out["Deletion"].append({ + "Panel Config": {"id": panel_id}, + "metric_tables": [{"metric_table": {"id": tid}}], }) - return additions, deletions, modifications - - -def format_metric_fields(metric_data: dict) -> list[str]: - """Format metric fields as YAML lines.""" - lines: list[str] = [] - for field_name, field_value in metric_data.items(): - if isinstance(field_value, str) and ( - "\n" in field_value or len(field_value) > 80 - ): - lines.append(f" {field_name}: |") - lines.extend( - f" {line}" for line in field_value.split("\n") + # Table additions + modifications + for tid in new_by_id: + if tid not in base_by_id: + # Entire table is added - preserve original + out["Addition"].append({ + "Panel Config": {"id": panel_id}, + "metric_tables": [{"metric_table": new_by_id[tid]}], + }) + else: + # Check metric-level diffs + additions, modifications, deletions = diff_metric_table( + base_by_id[tid], new_by_id[tid] ) - else: - lines.append(f" {field_name}: {field_value}") - return lines + + if deletions: + out["Deletion"].append({ + "Panel Config": {"id": panel_id}, + "metric_table": {"id": tid, "metrics": deletions}, + }) + + if additions: + out["Addition"].append({ + "Panel Config": {"id": panel_id}, + "metric_table": {"id": tid, "metrics": additions}, + }) + + if modifications: + out["Modification"].append({ + "Panel Config": {"id": panel_id}, + "metric_table": {"id": tid, "metrics": modifications}, + }) + + 
# Description diffs + base_md = base_config["Panel Config"].get("metrics_description", {}) + new_md = new_config["Panel Config"].get("metrics_description", {}) + desc_additions, desc_modifications, desc_deletions = diff_descriptions( + base_md, new_md + ) + + if desc_deletions: + out["Deletion"].append({ + "Panel Config": {"id": panel_id}, + "metric_descriptions": desc_deletions, + }) + + if desc_additions: + out["Addition"].append({ + "Panel Config": {"id": panel_id}, + "metric_descriptions": desc_additions, + }) + + if desc_modifications: + out["Modification"].append({ + "Panel Config": {"id": panel_id}, + "metric_descriptions": desc_modifications, + }) + + # Clean empties + if not out["Addition"]: + del out["Addition"] + if not out["Deletion"]: + del out["Deletion"] + if not out["Modification"]: + del out["Modification"] + + return out if out else None -def format_description_fields(desc_data: dict) -> list[str]: - """Format description fields as YAML lines.""" - lines: list[str] = [] - for field_name, field_value in desc_data.items(): - if isinstance(field_value, str) and ( - "\n" in field_value or len(field_value) > 80 - ): - lines.append(f" {field_name}: |") - lines.extend(f" {line}" for line in field_value.split("\n")) - else: - lines.append(f" {field_name}: {field_value}") - return lines +def generate_arch_delta(base_dir: Path, new_dir: Path) -> CommentedMap: + """ + Compare all YAML files panel-by-panel. 
+ """ + out = CommentedMap() + out["Addition"] = [] + out["Deletion"] = [] + out["Modification"] = [] - -def format_output(combined_diff: dict) -> str: - """Format the diff dictionary into a YAML string.""" - lines: list[str] = [] - for category in ("Addition", "Deletion", "Modification"): - lines.append(f"{category}:") - if not combined_diff.get(category): - lines.append(" []") - lines.append("") + base_files = sorted(base_dir.glob("*.yaml")) + for base_file in base_files: + new_file = new_dir / base_file.name + if not new_file.exists(): continue - for panel_item in combined_diff[category]: - pc = panel_item["panel_config"] - lines.extend([ - " - Panel Config:", - f" id: {pc['id']}", - f" title: {pc['title']}", - ]) + base_config = load_yaml_roundtrip(base_file) + new_config = load_yaml_roundtrip(new_file) - if panel_item.get("metric_tables"): - lines.append(" metric_tables:") - for mt in panel_item["metric_tables"]: - lines.extend([ - " - metric_table:", - f" id: {mt['id']}", - f" title: {mt['title']}", - " metrics:", - ]) - metrics_to_format = mt.get("metrics") or [ - {name: data} for name, data in (mt.get("metric") or {}).items() - ] - for metric in metrics_to_format: - for metric_name, metric_data in metric.items(): - lines.append(f" - {metric_name}:") - lines.extend(format_metric_fields(metric_data)) + diff = diff_panel(base_config, new_config) + if not diff: + continue - if panel_item.get("metric_descriptions"): - lines.append(" metric_descriptions:") - for metric_name, desc_data in panel_item["metric_descriptions"].items(): - lines.append(f" {metric_name}:") - lines.extend(format_description_fields(desc_data)) + if "Addition" in diff: + out["Addition"].extend(diff["Addition"]) + if "Deletion" in diff: + out["Deletion"].extend(diff["Deletion"]) + if "Modification" in diff: + out["Modification"].extend(diff["Modification"]) - lines.append("") - return "\n".join(lines) + # Strip empty categories + if not out["Addition"]: + del out["Addition"] + if not 
out["Deletion"]: + del out["Deletion"] + if not out["Modification"]: + del out["Modification"] + + return out def main() -> None: - if len(sys.argv) != 3: - print("Usage: python generate_config_deltas.py ") - sys.exit(1) - - curr_arch_dir = Path(sys.argv[1]) - prev_arch_dir = Path(sys.argv[2]) - - if not curr_arch_dir.is_dir() or not prev_arch_dir.is_dir(): - print("Error: Both arguments must be directories") - sys.exit(1) - - curr_files = {f.name for f in curr_arch_dir.glob("*.yaml")} - prev_files = {f.name for f in prev_arch_dir.glob("*.yaml")} - common_files = curr_files & prev_files - - if not common_files: - print("Error: No common YAML files found") - sys.exit(1) - - print(f"Comparing {len(common_files)} files...") - - combined_diff = {"Addition": [], "Deletion": [], "Modification": []} - - for filename in sorted(common_files): - curr_data = cm_utils.load_yaml(curr_arch_dir / filename) - prev_data = cm_utils.load_yaml(prev_arch_dir / filename) - - curr_pc = curr_data.get("Panel Config", {}) or {} - prev_pc = prev_data.get("Panel Config", {}) or {} - - curr_tables = get_metric_tables(curr_data) - prev_tables = get_metric_tables(prev_data) - - curr_descriptions = get_metric_descriptions(curr_data) - prev_descriptions = get_metric_descriptions(prev_data) - - table_adds, table_dels, table_mods = compare_tables(prev_tables, curr_tables) - desc_adds, desc_dels, desc_mods = compare_descriptions( - prev_descriptions, curr_descriptions + if len(sys.argv) != 4: + print( + "Usage: python generate_config_deltas.py " # noqa: E501 ) + sys.exit(1) - if table_adds or desc_adds: - entry = { - "panel_config": {"id": curr_pc.get("id"), "title": curr_pc.get("title")} - } - if table_adds: - entry["metric_tables"] = table_adds - if desc_adds: - entry["metric_descriptions"] = desc_adds - combined_diff["Addition"].append(entry) + base_dir = Path(sys.argv[1]) + new_dir = Path(sys.argv[2]) + out_file = Path(sys.argv[3]) - if table_dels or desc_dels: - entry = { - "panel_config": 
{"id": prev_pc.get("id"), "title": prev_pc.get("title")} - } - if table_dels: - entry["metric_tables"] = table_dels - if desc_dels: - entry["metric_descriptions"] = desc_dels - combined_diff["Deletion"].append(entry) + delta = generate_arch_delta(base_dir, new_dir) - if table_mods or desc_mods: - entry = { - "panel_config": {"id": curr_pc.get("id"), "title": curr_pc.get("title")} - } - if table_mods: - entry["metric_tables"] = table_mods - if desc_mods: - entry["metric_descriptions"] = desc_mods - combined_diff["Modification"].append(entry) - - output = AUTOGEN_TEXT + format_output(combined_diff) - - print("\n" + "=" * 80) - print("COMBINED DIFF OUTPUT:") - print("=" * 80) - print(output) - - output_dir = prev_arch_dir / "config_delta" - output_dir.mkdir(exist_ok=True) - output_file = output_dir / f"{curr_arch_dir.name}_diff.yaml" - with open(output_file, "w") as f: - f.write(output) - - print(f"\nDiff written to: {output_file}") + cm_utils.save_yaml(delta, out_file) + print(f"Delta generated at: {out_file}") if __name__ == "__main__": diff --git a/projects/rocprofiler-compute/tools/config_management/hash_manager.py b/projects/rocprofiler-compute/tools/config_management/hash_manager.py index 5c93534986..4823b81e08 100644 --- a/projects/rocprofiler-compute/tools/config_management/hash_manager.py +++ b/projects/rocprofiler-compute/tools/config_management/hash_manager.py @@ -43,7 +43,7 @@ import sys from pathlib import Path from typing import Optional -DEFAULT_HASH_DB = "tools/config_management/.config_hashes.json" +DEFAULT_HASH_DB = "src/utils/.config_hashes.json" def compute_file_hash(filepath: Path) -> str: diff --git a/projects/rocprofiler-compute/tools/config_management/master_config_workflow_script.py b/projects/rocprofiler-compute/tools/config_management/master_config_workflow_script.py index 50d61b7b48..b02ab84dc4 100644 --- a/projects/rocprofiler-compute/tools/config_management/master_config_workflow_script.py +++ 
b/projects/rocprofiler-compute/tools/config_management/master_config_workflow_script.py @@ -24,991 +24,262 @@ ############################################################################## -""" -Master workflow script for managing architecture configurations. -- Detects changes -- Handles direct edits and delta files -- Supports promoting a NEW arch from: - (A) direct edits to latest, or - (B) a delta YAML targeting latest -- Validates, syncs metric descriptions, and updates hashes - -""" - -from __future__ import annotations - import argparse import shutil import subprocess import sys -from datetime import datetime +import time from pathlib import Path -from typing import Optional -try: - from . import hash_manager, metric_description_manager -except Exception: - repo_root = Path(__file__).resolve().parents[1] # repo root - if str(repo_root) not in sys.path: - sys.path.insert(0, str(repo_root)) - import config_management.hash_manager as hash_manager # type: ignore - import config_management.metric_description_manager as metric_description_manager # type: ignore +SCRIPT_DIR = Path(__file__).resolve().parent +# .../rocprofiler-compute/tools/config_management -import yaml +REPO_ROOT = SCRIPT_DIR.parents[1] +# .../rocprofiler-compute -# ============================================================================= -# CONFIG -# ============================================================================= +TOOLS_DIR = SCRIPT_DIR -CONFIG_FILE = "config_workflow.yaml" +SOC_ROOT = REPO_ROOT / "src" / "rocprof_compute_soc" +ANALYSIS_CONFIGS = SOC_ROOT / "analysis_configs" -DEFAULT_CONFIG: dict = { - "paths": { - "template": "tools/config_management/gfx9_config_template.yaml", - "configs_root": "src/rocprof_compute_soc/analysis_configs", - "backups": ".backups", - "hashes": "tools/config_management/.config_hashes.json", - "per_arch_metrics": "tools/per_arch_metric_definitions", - "docs_metrics": "docs/data/metrics_description.yaml", - }, - "validation": {"strict_mode": True, 
"verify_after_changes": True}, - "behavior": {"require_confirmation": True}, -} +TEMPLATE_FILE = ANALYSIS_CONFIGS / "gfx9_config_template.yaml" +HASH_JSON = REPO_ROOT / "src" / "utils" / ".config_hashes.json" +BACKUP_DIR = SCRIPT_DIR / "backups" + +PYTHON = sys.executable + +VERIFY_SCRIPT = TOOLS_DIR / "verify_against_config_template.py" +PARSE_TEMPLATE_SCRIPT = TOOLS_DIR / "parse_config_template.py" +GENERATE_DELTAS_SCRIPT = TOOLS_DIR / "generate_config_deltas.py" +APPLY_DELTAS_SCRIPT = TOOLS_DIR / "apply_config_deltas.py" +HASH_CHECKER_SCRIPT = REPO_ROOT / "src" / "utils" / "hash_checker.py" +HASH_MANAGER_SCRIPT = TOOLS_DIR / "hash_manager.py" -# ============================================================================= -# UTILITIES -# ============================================================================= +def run(cmd): + print("\n$", " ".join(str(c) for c in cmd)) + return subprocess.run(cmd, cwd=str(REPO_ROOT)).returncode -def load_config() -> dict: - """Load config from CONFIG_FILE with a shallow merge onto DEFAULT_CONFIG.""" - p = Path(CONFIG_FILE) - if not p.exists(): - return DEFAULT_CONFIG - with open(p) as f: - user = yaml.safe_load(f) or {} - merged = DEFAULT_CONFIG.copy() - for k, v in user.items(): - if isinstance(v, dict) and isinstance(merged.get(k), dict): - merged[k] = {**merged[k], **v} +def fatal(msg): + print(f"\nFATAL: {msg}") + sys.exit(1) + + +def confirm(prompt): + ans = input(f"{prompt} [y/N]: ").strip().lower() + return ans in ("y", "yes") + + +def backup(paths): + BACKUP_DIR.mkdir(exist_ok=True) + backup_path = BACKUP_DIR / f"backup_{int(time.time())}" + backup_path.mkdir() + + for p in paths: + if not p.exists(): + continue + dest = backup_path / p.name + if p.is_dir(): + shutil.copytree(p, dest) else: - merged[k] = v - return merged + shutil.copy2(p, dest) - -def create_backup(source_paths: list[str], backup_dir: str) -> Path: - """Create a timestamped backup of the provided paths.""" - ts = 
datetime.now().strftime("%Y%m%d_%H%M%S_%f") # add microseconds - base = Path(backup_dir) - base.mkdir(parents=True, exist_ok=True) - backup_path = base / ts - - # Fallback suffix if somehow collides - i = 1 - while backup_path.exists(): - backup_path = base / f"{ts}_{i}" - i += 1 - - print(f"Creating backup: {backup_path}") - for s in source_paths: - sp = Path(s) - dst = backup_path / sp.name - if sp.is_dir(): - shutil.copytree(sp, dst) - elif sp.is_file(): - dst.parent.mkdir(parents=True, exist_ok=True) - shutil.copy2(sp, dst) + print(f"\nBackup created at {backup_path}") return backup_path -def restore_backup(backup_path: Path, target_paths: list[str]) -> None: - """Restore files/dirs from a given backup path.""" - print(f"Restoring from backup: {backup_path}") - for t in target_paths: - tp = Path(t) - bp = backup_path / tp.name - if not bp.exists(): +def restore(backup_path, paths): + print("\nRestoring from backup...") + for p in paths: + src = backup_path / p.name + if not src.exists(): continue - if tp.is_dir(): - shutil.rmtree(tp, ignore_errors=True) - elif tp.exists(): - tp.unlink() - if bp.is_dir(): - shutil.copytree(bp, tp) + if p.exists(): + if p.is_dir(): + shutil.rmtree(p) + else: + p.unlink() + if src.is_dir(): + shutil.copytree(src, p) else: - shutil.copy2(bp, tp) - print("Backup restored") + shutil.copy2(src, p) + print("Restore complete.") -def cleanup_old_backups(backup_dir: str) -> None: - """Keep latest backup, remove older ones.""" - b = Path(backup_dir) - if not b.exists(): - return - dirs = sorted([d for d in b.iterdir() if d.is_dir()]) - for old in dirs[:-1]: - shutil.rmtree(old, ignore_errors=True) - print(f"Removed old backup: {old.name}") - - -def prompt_yes_no(question: str, default: Optional[bool] = None) -> bool: - """Ask a yes/no question in the terminal.""" - if default is None: - prompt = f"{question} (y/n): " - elif default: - prompt = f"{question} [Y/n]: " - else: - prompt = f"{question} [y/N]: " - while True: - ans = 
input(prompt).strip().lower() - if not ans and default is not None: - return default - if ans in ("y", "yes"): - return True - if ans in ("n", "no"): - return False - print("Please answer 'y' or 'n'.") - - -def run_script( - script_name: str, args: list[str], capture_output: bool = True -) -> subprocess.CompletedProcess: - """Run a Python helper script and return CompletedProcess.""" - return subprocess.run( - [sys.executable, script_name] + args, capture_output=capture_output, text=True - ) - - -def get_all_archs(configs_dir: str) -> list[str]: - """Return sorted list of gfx* directories.""" - root = Path(configs_dir) - return sorted([ - d.name for d in root.iterdir() if d.is_dir() and d.name.startswith("gfx") - ]) - - -def get_latest_arch(template_file: str) -> Optional[str]: - """Read 'latest_arch' from template YAML.""" - p = Path(template_file) - if not p.is_file(): - return None - with open(p) as f: - data = yaml.safe_load(f) or {} - return data.get("latest_arch") - - -def validate_delta_structure(delta_file: str) -> tuple[bool, str]: - """Ensure delta YAML contains Addition/Deletion/Modification keys.""" - with open(delta_file) as f: - data = yaml.safe_load(f) or {} - required = {"Addition", "Deletion", "Modification"} - if not isinstance(data, dict) or not required.issubset(data.keys()): - return False, "Delta must have Addition, Deletion, Modification keys" - return True, "" - - -# ============================================================================= -# VALIDATION / SYNC -# ============================================================================= - - -def validate_all_archs(config: dict) -> tuple[bool, str]: - """Validate all archs against the template.""" - print("Validating all architectures against template...") - res = run_script( - "tools/config_management/verify_against_config_template.py", - [config["paths"]["configs_root"], config["paths"]["template"]], - capture_output=True, - ) - if res.stdout: - print(res.stdout) - if res.returncode 
!= 0: - if res.stderr: - print(res.stderr) - return False, "Validation failed" - return True, "Validation passed" - - -def validate_arch_against_template(arch_name: str, config: dict) -> tuple[bool, str]: - """Validate one arch (best-effort: rely on script output mentioning arch).""" - print(f"Validating {arch_name} against template...") - res = run_script( - "tools/config_management/verify_against_config_template.py", - [config["paths"]["configs_root"], config["paths"]["template"]], - capture_output=True, - ) - if res.returncode != 0 and arch_name in (res.stdout or ""): - print(res.stdout) - return False, f"Validation failed for {arch_name}" - return True, f"Validation passed for {arch_name}" - - -# ============================================================================= -# CHANGE DETECTION -# ============================================================================= - - -def detect_changes(config: dict) -> dict: - print("Detecting changes...") - return hash_manager.detect_changes( - config["paths"]["configs_root"], config["paths"]["hashes"] - ) - - -def display_change_summary(changes: dict) -> bool: - print("\n" + "=" * 80) - print("CHANGE SUMMARY") - print("=" * 80) - - has_changes = any([ - changes.get("new_archs"), - changes.get("modified_archs"), - changes.get("delta_files"), - changes.get("deleted_archs"), - ]) - - if changes.get("new_archs"): - print("\nNew Architecture Directories:") - for a in changes["new_archs"]: - print(f" • {a}") - - if changes.get("modified_archs"): - print("\nModified Architectures:") - for a, files in changes["modified_archs"].items(): - print(f" • {a}:") - for f in files[:5]: - print(f" - {f}") - extra = len(files) - 5 - if extra > 0: - print(f" ... 
and {extra} more files") - - if changes.get("delta_files"): - print("\nDelta Files Detected:") - for a, d in changes["delta_files"].items(): - print(f" • {a}: {Path(d).name}") - - if changes.get("deleted_archs"): - print("\nDeleted Architectures:") - for a in changes["deleted_archs"]: - print(f" • {a}") - - if not has_changes: - print("\nNo changes detected") - - print("=" * 80 + "\n") - return has_changes - - -# ============================================================================= -# CORE WORKFLOW OPS -# ============================================================================= - - -def promote_to_latest( - new_arch: str, config: dict, reuse_backup: Optional[Path] = None -) -> bool: - """ - Original 'promote' that assumes new_arch dir already exists & populated. - (Kept for backward compatibility.) - """ - print(f"\nPROMOTING {new_arch} TO LATEST ARCHITECTURE...") - backup_paths = [config["paths"]["configs_root"], config["paths"]["template"]] - backup_path = reuse_backup or create_backup( - backup_paths, config["paths"]["backups"] - ) - - try: - root = Path(config["paths"]["configs_root"]) - new_dir = root / new_arch - if not new_dir.is_dir(): - raise Exception(f"New arch directory not found: {new_dir}") - - all_archs = get_all_archs(config["paths"]["configs_root"]) - prev_archs = [a for a in all_archs if a != new_arch] - - print(f"\n1. Updating template with new latest arch: {new_arch}") - res = run_script( - "tools/config_management/parse_config_template.py", - [str(new_dir), config["paths"]["template"], "--latest-arch", new_arch], - capture_output=True, - ) - if res.returncode != 0: - raise Exception(f"Failed to update template: {res.stderr}") - - print(f"\n2. 
Generating deltas for {len(prev_archs)} previous architectures") - for p in prev_archs: - prev_dir = root / p - gen = run_script( - "tools/config_management/generate_config_deltas.py", - [str(new_dir), str(prev_dir)], - capture_output=True, - ) - if gen.returncode != 0: - raise Exception(f"Failed to generate delta for {p}: {gen.stderr}") - - print("\n\tUpdating hashes for previous architectures (delta files)") - for p in prev_archs: - hash_manager.update_hashes( - p, config["paths"]["configs_root"], config["paths"]["hashes"] - ) - - print("\n3. Validating all architectures") - ok, msg = validate_all_archs(config) - if not ok: - raise Exception(msg) - - print("\n4. Syncing metric descriptions") - ok = metric_description_manager.sync_arch( - new_arch, - config["paths"]["configs_root"], - config["paths"]["per_arch_metrics"], - config["paths"]["docs_metrics"], - is_latest=True, - ) - if not ok: - raise Exception("Failed to sync metric descriptions") - - print("\n5. Updating hashes") - hash_manager.update_hashes( - new_arch, config["paths"]["configs_root"], config["paths"]["hashes"] - ) - - print(f"\nSuccessfully promoted {new_arch} to latest architecture!") - return True - - except Exception as e: - print(f"\nERROR: {e}\nRestoring from backup...") - restore_backup(backup_path, backup_paths) - return False - - -def update_latest_arch_from_delta( - delta_file: str, arch_name: str, config: dict -) -> bool: - """Apply a delta in-place to the latest arch (legacy flow).""" - print(f"\nUPDATING LATEST ARCH {arch_name} FROM DELTA...") - backup_paths = [config["paths"]["configs_root"], config["paths"]["template"]] - backup_path = create_backup(backup_paths, config["paths"]["backups"]) - - try: - root = Path(config["paths"]["configs_root"]) - arch_dir = root / arch_name - tmp = root / f"{arch_name}_tmp" - - print(f"\n1. 
Applying delta to {arch_name}") - res = run_script( - "tools/config_management/apply_config_deltas.py", - [str(arch_dir), delta_file, str(tmp)], - capture_output=True, - ) - if res.returncode != 0: - raise Exception(f"Failed to apply delta: {res.stderr}") - - shutil.rmtree(arch_dir) - shutil.move(str(tmp), str(arch_dir)) - - print("\n2. Updating template") - res = run_script( - "tools/config_management/parse_config_template.py", - [str(arch_dir), config["paths"]["template"], "--latest-arch", arch_name], - capture_output=True, - ) - if res.returncode != 0: - raise Exception(f"Failed to update template: {res.stderr}") - - print("\n3. Regenerating deltas for previous architectures") - all_archs = get_all_archs(config["paths"]["configs_root"]) - for prev in [a for a in all_archs if a != arch_name]: - prev_dir = root / prev - gen = run_script( - "tools/config_management/generate_config_deltas.py", - [str(arch_dir), str(prev_dir)], - capture_output=True, - ) - if gen.returncode != 0: - raise Exception(f"Failed to generate delta for {prev}") - - for prev in [a for a in all_archs if a != arch_name]: - hash_manager.update_hashes( - prev, config["paths"]["configs_root"], config["paths"]["hashes"] - ) - - print("\n4. Validating all architectures") - ok, msg = validate_all_archs(config) - if not ok: - raise Exception(msg) - - print("\n5. Syncing metric descriptions") - ok = metric_description_manager.sync_arch( - arch_name, - config["paths"]["configs_root"], - config["paths"]["per_arch_metrics"], - config["paths"]["docs_metrics"], - is_latest=True, - ) - if not ok: - raise Exception("Failed to sync metric descriptions") - - print("\n6. 
Updating hashes") - hash_manager.update_hashes( - arch_name, config["paths"]["configs_root"], config["paths"]["hashes"] - ) - - print(f"\nSuccessfully updated latest arch {arch_name}!") - return True - - except Exception as e: - print(f"\nERROR: {e}\nRestoring from backup...") - restore_backup(backup_path, backup_paths) - return False - - -def update_older_arch_from_delta(delta_file: str, arch_name: str, config: dict) -> bool: - """Apply a delta in-place to an older arch (legacy flow).""" - print(f"\nUPDATING OLDER ARCH {arch_name} FROM DELTA...") - root = Path(config["paths"]["configs_root"]) - arch_dir = root / arch_name - backup_path = create_backup([str(arch_dir)], config["paths"]["backups"]) - - try: - tmp = root / f"{arch_name}_tmp" - - print(f"\n1. Applying delta to {arch_name}") - res = run_script( - "tools/config_management/apply_config_deltas.py", - [str(arch_dir), delta_file, str(tmp)], - capture_output=True, - ) - if res.returncode != 0: - raise Exception(f"Failed to apply delta: {res.stderr}") - - shutil.rmtree(arch_dir) - shutil.move(str(tmp), str(arch_dir)) - - print("\n2. Validating against template") - ok, msg = validate_arch_against_template(arch_name, config) - if not ok: - raise Exception(msg) - - print("\n3. Syncing metric descriptions") - ok = metric_description_manager.sync_arch( - arch_name, - config["paths"]["configs_root"], - config["paths"]["per_arch_metrics"], - config["paths"]["docs_metrics"], - is_latest=False, - ) - if not ok: - raise Exception("Failed to sync metric descriptions") - - print("\n4. 
Updating hashes") - hash_manager.update_hashes( - arch_name, config["paths"]["configs_root"], config["paths"]["hashes"] - ) - - print(f"\nSuccessfully updated older arch {arch_name}!") - return True - - except Exception as e: - print(f"\nERROR: {e}\nRestoring from backup...") - restore_backup(backup_path, [str(arch_dir)]) - return False - - -def update_latest_arch_from_edits(arch_name: str, config: dict) -> bool: - """Re-derive template/deltas from direct edits to latest (legacy in-place).""" - print(f"\nUPDATING LATEST ARCH {arch_name} FROM DIRECT EDITS...") - backup_paths = [config["paths"]["configs_root"], config["paths"]["template"]] - backup_path = create_backup(backup_paths, config["paths"]["backups"]) - - try: - root = Path(config["paths"]["configs_root"]) - arch_dir = root / arch_name - - print("\n1. Updating template") - res = run_script( - "tools/config_management/parse_config_template.py", - [str(arch_dir), config["paths"]["template"], "--latest-arch", arch_name], - capture_output=True, - ) - if res.returncode != 0: - raise Exception(f"Failed to update template: {res.stderr}") - - print("\n2. Regenerating deltas for previous architectures") - for prev in [ - a for a in get_all_archs(config["paths"]["configs_root"]) if a != arch_name - ]: - prev_dir = root / prev - gen = run_script( - "tools/config_management/generate_config_deltas.py", - [str(arch_dir), str(prev_dir)], - capture_output=True, - ) - if gen.returncode != 0: - raise Exception(f"Failed to generate delta for {prev}") - - for prev in [ - a for a in get_all_archs(config["paths"]["configs_root"]) if a != arch_name - ]: - hash_manager.update_hashes( - prev, config["paths"]["configs_root"], config["paths"]["hashes"] - ) - - print("\n3. Validating all architectures") - ok, msg = validate_all_archs(config) - if not ok: - raise Exception(msg) - - print("\n4. 
Syncing metric descriptions") - ok = metric_description_manager.sync_arch( - arch_name, - config["paths"]["configs_root"], - config["paths"]["per_arch_metrics"], - config["paths"]["docs_metrics"], - is_latest=True, - ) - if not ok: - raise Exception("Failed to sync metric descriptions") - - print("\n5. Updating hashes") - hash_manager.update_hashes( - arch_name, config["paths"]["configs_root"], config["paths"]["hashes"] - ) - - print(f"\nSuccessfully updated latest arch {arch_name}!") - return True - - except Exception as e: - print(f"\nERROR: {e}\nRestoring from backup...") - restore_backup(backup_path, backup_paths) - return False - - -def update_older_arch_from_edits(arch_name: str, config: dict) -> bool: - """Re-validate/sync/hash older arch after direct edits (legacy in-place).""" - print(f"\nUPDATING OLDER ARCH {arch_name} FROM DIRECT EDITS...") - root = Path(config["paths"]["configs_root"]) - arch_dir = root / arch_name - backup_path = create_backup([str(arch_dir)], config["paths"]["backups"]) - - try: - print("\n1. Validating against template") - ok, msg = validate_arch_against_template(arch_name, config) - if not ok: - raise Exception(msg) - - print("\n2. Syncing metric descriptions") - ok = metric_description_manager.sync_arch( - arch_name, - config["paths"]["configs_root"], - config["paths"]["per_arch_metrics"], - config["paths"]["docs_metrics"], - is_latest=False, - ) - if not ok: - raise Exception("Failed to sync metric descriptions") - - print("\n3. 
Updating hashes") - hash_manager.update_hashes( - arch_name, config["paths"]["configs_root"], config["paths"]["hashes"] - ) - - print(f"\nSuccessfully updated older arch {arch_name}!") - return True - - except Exception as e: - print(f"\nERROR: {e}\nRestoring from backup...") - restore_backup(backup_path, [str(arch_dir)]) - return False - - -# ============================================================================= -# NEW: PROMOTE NEW ARCH FROM (A) EDITS or (B) DELTA -# ============================================================================= - - -def _git_restore_pristine(path: Path) -> None: - """ - Best-effort restore of a directory to HEAD using Git. - No-op if not in a Git repo. Raises on checkout failure when in a repo. - """ - chk = subprocess.run( - ["git", "rev-parse", "--is-inside-work-tree"], capture_output=True, text=True - ) - if chk.returncode != 0 or chk.stdout.strip() != "true": - return - res = subprocess.run( - ["git", "checkout", "--", str(path)], capture_output=True, text=True - ) - if res.returncode != 0: - raise Exception(f"Failed to restore pristine state from Git for {path}") - - -def promote_new_arch_from_latest_edits( - latest_arch: str, new_arch: str, config: dict -) -> bool: - """ - Flow (A): Direct edits were made to the current latest arch. 
- 1) Snapshot edited latest to temp - 2) Restore pristine latest (via Git) - 3) Copy pristine latest → new arch - 4) Generate delta (edited_tmp vs pristine_latest) → write under latest/config_delta/ - 5) Apply delta to new arch - 6) Update template latest=new_arch, regen deltas, validate, sync, hash - """ - print(f"\nPROMOTING {new_arch} FROM EDITS IN {latest_arch}...") - root = Path(config["paths"]["configs_root"]) - latest_dir = root / latest_arch - new_dir = root / new_arch - edited_tmp = root / f"_{latest_arch}_edited_tmp" - new_tmp = root / f"_{new_arch}_tmp" - - backup_paths = [config["paths"]["configs_root"], config["paths"]["template"]] - backup_path = create_backup(backup_paths, config["paths"]["backups"]) - - try: - # 1) Snapshot edited latest - if edited_tmp.exists(): - shutil.rmtree(edited_tmp) - shutil.copytree(latest_dir, edited_tmp) - - # 2) Restore pristine latest - _git_restore_pristine(latest_dir) - - # 3) Copy pristine latest → new arch - if new_dir.exists(): - raise Exception(f"Target new arch directory already exists: {new_dir}") - shutil.copytree(latest_dir, new_dir) - - # 4) Generate delta: edited (curr) vs pristine latest (prev) - print("\nGenerating delta (edited latest → pristine latest)") - gen = run_script( - "tools/config_management/generate_config_deltas.py", - [str(edited_tmp), str(latest_dir)], - capture_output=True, - ) - if gen.returncode != 0: - raise Exception(f"Failed to generate delta: {gen.stderr}") - - delta_dir = latest_dir / "config_delta" - # Prefer the file named for edited_tmp; otherwise take the latest *_diff.yaml - candidates = sorted(delta_dir.glob(f"{edited_tmp.name}_diff.yaml")) or sorted( - delta_dir.glob("*_diff.yaml") - ) - if not candidates: - raise Exception("Delta file not found after generation.") - delta_file = candidates[-1] - - # 5) Apply delta onto new arch - if new_tmp.exists(): - shutil.rmtree(new_tmp) - print(f"\nApplying delta to {new_arch}: {delta_file.name}") - app = run_script( - 
"tools/config_management/apply_config_deltas.py", - [str(new_dir), str(delta_file), str(new_tmp)], - capture_output=True, - ) - if app.returncode != 0: - raise Exception(f"Failed to apply delta: {app.stderr}") - shutil.rmtree(new_dir) - shutil.move(str(new_tmp), str(new_dir)) - - # 6) Promote to latest, regen deltas, validate, sync, hash - return promote_to_latest(new_arch, config, reuse_backup=backup_path) - - except Exception as e: - print(f"\nERROR: {e}\nRestoring from backup...") - restore_backup(backup_path, backup_paths) - return False - finally: - if edited_tmp.exists(): - shutil.rmtree(edited_tmp, ignore_errors=True) - if new_tmp.exists(): - shutil.rmtree(new_tmp, ignore_errors=True) - - -def promote_new_arch_from_delta( - latest_arch: str, new_arch: str, delta_file: str, config: dict -) -> bool: - """ - Flow (B): Developer added a delta YAML targeting the latest arch. - 1) Copy pristine latest → new arch - 2) Apply the provided delta to new arch - 3) Promote to latest, regen deltas, validate, sync, hash - """ - print(f"\nPROMOTING {new_arch} FROM DELTA ON {latest_arch}...") - root = Path(config["paths"]["configs_root"]) - latest_dir = root / latest_arch - new_dir = root / new_arch - new_tmp = root / f"_{new_arch}_tmp" - - backup_paths = [config["paths"]["configs_root"], config["paths"]["template"]] - backup_path = create_backup(backup_paths, config["paths"]["backups"]) - - try: - if not Path(delta_file).is_file(): - raise Exception(f"Delta file does not exist: {delta_file}") - if not latest_dir.is_dir(): - raise Exception(f"Latest arch not found: {latest_dir}") - if new_dir.exists(): - raise Exception(f"Target new arch directory already exists: {new_dir}") - - # Start from pristine latest - _git_restore_pristine(latest_dir) - - # 1) Copy pristine latest → new arch - shutil.copytree(latest_dir, new_dir) - - # 2) Apply delta onto the new arch - if new_tmp.exists(): - shutil.rmtree(new_tmp) - print(f"\nApplying delta to {new_arch}: {Path(delta_file).name}") - 
app = run_script( - "tools/config_management/apply_config_deltas.py", - [str(new_dir), str(delta_file), str(new_tmp)], - capture_output=True, - ) - if app.returncode != 0: - raise Exception(f"Failed to apply delta: {app.stderr}") - shutil.rmtree(new_dir) - shutil.move(str(new_tmp), str(new_dir)) - - # 3) Promote to latest, regen deltas, validate, sync, hash - return promote_to_latest(new_arch, config, reuse_backup=backup_path) - - except Exception as e: - print(f"\nERROR: {e}\nRestoring from backup...") - restore_backup(backup_path, backup_paths) - return False - finally: - if new_tmp.exists(): - shutil.rmtree(new_tmp, ignore_errors=True) - - -# ============================================================================= -# USER-FACING SCENARIO HANDLERS -# ============================================================================= - - -def handle_new_arch(arch_name: str, config: dict, dry_run: bool = False) -> bool: - print(f"\n{'=' * 80}\nNEW ARCHITECTURE DETECTED: {arch_name}\n{'=' * 80}") - if not prompt_yes_no(f"Is {arch_name} the new latest architecture?"): - print( - "ERROR: New arch detected but not marked as latest.\n " - "Only the latest arch should be added as a new directory." 
- ) - return False - if dry_run: - print(f"[DRY RUN] Would promote {arch_name} to latest") - return True - return promote_to_latest(arch_name, config) - - -def handle_delta_file( - delta_file: str, arch_name: str, config: dict, dry_run: bool = False -) -> bool: - print( - f"\n{'=' * 80}\nDELTA FILE DETECTED: {Path(delta_file).name}\n " - f"Target architecture: {arch_name}\n{'=' * 80}" - ) - - valid, err = validate_delta_structure(delta_file) - if not valid: - print(f"ERROR: Invalid delta structure - {err}") - return False - - latest = ( - get_latest_arch(config["paths"]["template"]) - or (get_all_archs(config["paths"]["configs_root"]) or [None])[-1] - ) - - if arch_name == latest: - print(f"\nDelta targets the current latest arch: {latest}") - print("Choose how to apply this delta:") - print(" 1. Update the existing latest arch in-place") - print( - " 2. Create a NEW architecture from latest and apply " - "the delta there (promote to latest)" - ) - - while True: - choice = input("Enter choice (1 or 2): ").strip() - if choice == "1": - if dry_run: - print(f"[DRY RUN] Would update latest arch {latest} from delta") - return True - return update_latest_arch_from_delta(delta_file, latest, config) - if choice == "2": - new_arch_name = input( - "Enter new architecture name (e.g., gfx955): " - ).strip() - if not new_arch_name: - print("New architecture name cannot be empty.") - continue - if not prompt_yes_no( - f"Promote {new_arch_name} to new latest architecture?" - ): - print("Operation cancelled.") - return False - if dry_run: - print( - "[DRY RUN] Would create " - f"{new_arch_name} from {latest} and apply delta" - ) - return True - return promote_new_arch_from_delta( - latest, new_arch_name, delta_file, config - ) - print("Invalid choice. 
Please enter 1 or 2.") - else: - if not prompt_yes_no(f"Apply delta to older arch ({arch_name}) in-place?"): - return False - if dry_run: - print(f"[DRY RUN] Would update older arch {arch_name} from delta") - return True - return update_older_arch_from_delta(delta_file, arch_name, config) - - -def handle_direct_edits( - arch_name: str, modified_files: list[str], config: dict, dry_run: bool = False -) -> bool: - print(f"\n{'=' * 80}\nDIRECT EDITS DETECTED: {arch_name}\n{'=' * 80}") - print("Modified files:") - for f in modified_files: - print(f" • {f}") - - latest = ( - get_latest_arch(config["paths"]["template"]) - or (get_all_archs(config["paths"]["configs_root"]) or [None])[-1] - ) - - if arch_name == latest: - print(f"\nThis is the current latest architecture ({latest}).") - print("Are you:") - print(" 1. Updating the existing latest arch") - print(" 2. Creating a new architecture (this will become the new latest)") - - while True: - choice = input("Enter choice (1 or 2): ").strip() - if choice == "1": - if dry_run: - print( - f"[DRY RUN] Would update latest arch {latest} from direct edits" - ) - return True - return update_latest_arch_from_edits(arch_name, config) - if choice == "2": - new_arch_name = ( - input( - "Enter new architecture name " - f"(currently detected as {arch_name}): " - ).strip() - or arch_name - ) - if not prompt_yes_no( - f"Promote {new_arch_name} to new latest architecture?" - ): - print("Operation cancelled.") - return False - if dry_run: - print( - "[DRY RUN] Would promote " - f"{new_arch_name} from edits in {arch_name}" - ) - return True - return promote_new_arch_from_latest_edits( - arch_name, new_arch_name, config - ) - print("Invalid choice. Please enter 1 or 2.") - else: - if not prompt_yes_no( - f"These are edits to older arch ({arch_name}). Continue (in-place)?" 
- ): - return False - if dry_run: - print(f"[DRY RUN] Would update older arch {arch_name} from direct edits") - return True - return update_older_arch_from_edits(arch_name, config) - - -# ============================================================================= -# MAIN -# ============================================================================= - - -def main() -> int: - parser = argparse.ArgumentParser( - description="Master workflow for managing architecture configurations" - ) - parser.add_argument( - "--dry-run", - action="store_true", - help="Show what would be done without making changes", - ) +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--ci", action="store_true") + parser.add_argument("--hash-only", action="store_true") + parser.add_argument("--validate-only", action="store_true") + parser.add_argument("--edit-existing", action="store_true") + parser.add_argument("--promote", metavar="ARCH") args = parser.parse_args() - print("=" * 80) - print("ARCHITECTURE CONFIG WORKFLOW") - print("=" * 80) + # -------------------------------------------------------- + # CI / HASH-ONLY PATH (non-mutating) + # -------------------------------------------------------- + if args.ci or args.hash_only: + if not HASH_CHECKER_SCRIPT.exists(): + fatal("hash_checker.py not found") + sys.exit(run([PYTHON, HASH_CHECKER_SCRIPT])) - config = load_config() + # -------------------------------------------------------- + # HARD PREFLIGHT (STRUCTURAL VALIDATION) for all non-hash paths + # -------------------------------------------------------- + if not VERIFY_SCRIPT.exists(): + fatal("verify_against_config_template.py not found") - if args.dry_run: - print("\nDRY RUN MODE - No changes will be made\n") + rc = run([PYTHON, VERIFY_SCRIPT, ANALYSIS_CONFIGS, TEMPLATE_FILE]) + if rc != 0: + fatal("Template / architecture verification failed") - changes = detect_changes(config) - has_changes = display_change_summary(changes) - if not has_changes: - return 0 + # 
-------------------------------------------------------- + # VALIDATE-ONLY + # -------------------------------------------------------- + if args.validate_only: + print("\nValidation successful.") + sys.exit(0) - latest_arch = ( - get_latest_arch(config["paths"]["template"]) - or (get_all_archs(config["paths"]["configs_root"]) or [None])[-1] - ) - latest_has_edits = latest_arch in (changes.get("modified_archs") or {}) + # -------------------------------------------------------- + # EDIT EXISTING ARCHITECTURE (helpers only; no template/hash updates) + # -------------------------------------------------------- + if args.edit_existing: + print("\nEdit existing architecture mode.") - # New arch directories that appeared on disk - for new_arch in changes.get("new_archs", []): - if not handle_new_arch(new_arch, config, args.dry_run): - return 1 + choice = input( + "\nChoose:\n 1) Generate delta\n 2) Apply delta\n 3) Exit\nSelect: " + ).strip() - # If latest was directly edited, prioritize resolving that path - # (user will choose in-place vs new arch) - if latest_has_edits: - if not handle_direct_edits( - latest_arch, changes["modified_archs"][latest_arch], config, args.dry_run + if choice == "1": + base = input("Base arch dir (absolute or relative to repo root): ").strip() + new = input("New arch dir: ").strip() + out = input("Output delta yaml: ").strip() + sys.exit(run([PYTHON, GENERATE_DELTAS_SCRIPT, base, new, out])) + + if choice == "2": + base = input("Base arch dir: ").strip() + delta = input("Delta yaml: ").strip() + out = input("Output dir: ").strip() + + rc = run([PYTHON, APPLY_DELTAS_SCRIPT, base, delta, out]) + if rc != 0: + sys.exit(rc) + + # Re-verify after apply + rc = run([PYTHON, VERIFY_SCRIPT, ANALYSIS_CONFIGS, TEMPLATE_FILE]) + sys.exit(rc) + + sys.exit(0) + + # -------------------------------------------------------- + # PROMOTE NEW LATEST ARCHITECTURE (mutating, with rollback) + # -------------------------------------------------------- + if 
args.promote: + new_latest = args.promote + new_arch_dir = ANALYSIS_CONFIGS / new_latest + + if not new_arch_dir.is_dir(): + fatal(f"Architecture directory not found: {new_arch_dir}") + + if not confirm( + f"Promote {new_latest} to latest? " + "This will update template, regenerate deltas, and update hashes." ): - return 1 - print("\nNote: Delta files for older archs will be regenerated automatically.") - print("Skipping delta file processing for older architectures.\n") - else: - # Process delta files - for arch, delta_file in changes.get("delta_files", {}).items(): - if not handle_delta_file(delta_file, arch, config, args.dry_run): - return 1 + sys.exit(0) - # Remaining direct edits (excluding latest if already processed) - for arch, files in (changes.get("modified_archs") or {}).items(): - if arch == latest_arch and latest_has_edits: - continue - if arch in (changes.get("delta_files") or {}): - continue - if not handle_direct_edits(arch, files, config, args.dry_run): - return 1 + # Back up the things we mutate + backup_path = backup([ANALYSIS_CONFIGS, TEMPLATE_FILE, HASH_JSON]) - if not args.dry_run: - cleanup_old_backups(config["paths"]["backups"]) - print("\n" + "=" * 80) - print("ALL OPERATIONS COMPLETED SUCCESSFULLY!") - print("=" * 80) - else: - print("\n" + "=" * 80) - print("DRY RUN COMPLETE") - print("=" * 80) + try: + # 1) Update template + if not PARSE_TEMPLATE_SCRIPT.exists(): + raise RuntimeError("parse_config_template.py not found") - return 0 + rc = run([ + PYTHON, + PARSE_TEMPLATE_SCRIPT, + new_arch_dir, + TEMPLATE_FILE, + "--latest-arch", + new_latest, + ]) + if rc != 0: + raise RuntimeError("Failed to update template") + + # 2) Regenerate deltas for all other archs + for arch_dir in sorted(ANALYSIS_CONFIGS.iterdir()): + if not arch_dir.is_dir(): + continue + if arch_dir.name == new_latest: + continue + + delta_dir = arch_dir / "config_delta" + delta_dir.mkdir(exist_ok=True) + out_delta = delta_dir / f"{new_latest}_diff.yaml" + + rc = run([ + 
PYTHON, + GENERATE_DELTAS_SCRIPT, + new_arch_dir, + arch_dir, + out_delta, + ]) + if rc != 0: + raise RuntimeError(f"Delta generation failed for {arch_dir.name}") + + for f in delta_dir.glob("*_diff.yaml"): + if f.name != f"{new_latest}_diff.yaml": + f.unlink() + + # 3) Re-verify everything against updated template + rc = run([PYTHON, VERIFY_SCRIPT, ANALYSIS_CONFIGS, TEMPLATE_FILE]) + if rc != 0: + raise RuntimeError("Post-promotion verification failed") + + # 4) Now update the hash DB to the new steady state. + # Promotion touched many delta files, so compute-all is the safest. + if not HASH_MANAGER_SCRIPT.exists(): + raise RuntimeError("hash_manager.py not found") + + rc = run([ + PYTHON, + HASH_MANAGER_SCRIPT, + "--compute-all", + ANALYSIS_CONFIGS, + HASH_JSON, + ]) + if rc != 0: + raise RuntimeError("Hash DB update failed (--compute-all)") + + # 5) run hash_checker + rc = run([PYTHON, HASH_CHECKER_SCRIPT]) + if rc != 0: + raise RuntimeError( + "Hash consistency check failed (after hash DB update)" + ) + + print(f"\nSUCCESS: {new_latest} promoted to latest.") + sys.exit(0) + + except Exception as e: + print(f"\nERROR: {e}") + restore(backup_path, [ANALYSIS_CONFIGS, TEMPLATE_FILE, HASH_JSON]) + sys.exit(1) + + # -------------------------------------------------------- + # NO INTENT PROVIDED + # -------------------------------------------------------- + print( + "\nNo workflow selected.\n" + "Use one of:\n" + " --validate-only\n" + " --edit-existing\n" + " --promote gfxXYZ\n" + " --hash-only / --ci\n" + ) + sys.exit(0) if __name__ == "__main__": - sys.exit(main()) + main() diff --git a/projects/rocprofiler-compute/tools/config_management/metric_description_manager.py b/projects/rocprofiler-compute/tools/config_management/metric_description_manager.py index 3fa75d5122..1dd7c30190 100644 --- a/projects/rocprofiler-compute/tools/config_management/metric_description_manager.py +++ b/projects/rocprofiler-compute/tools/config_management/metric_description_manager.py @@ 
-43,21 +43,11 @@ from typing import Union import yaml -try: - from . import utils as cm_utils -except Exception: - repo_root = Path(__file__).resolve().parents[1] - if str(repo_root) not in sys.path: - sys.path.insert(0, str(repo_root)) - try: - import config_management.utils as cm_utils # type: ignore - except Exception: - import utils as cm_utils # type: ignore +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) -AUTOGEN_TEXT = ( - "# AUTOGENERATED FILE. Only edit for testing purposes, not for development. " - "Generated by tools/config_management/metric_description_manager.py\n" -) +from config_management import utils_ruamel as cm_utils # noqa: E402 # Section to panel ID mapping for organizing descriptions SECTION_PANEL_MAP: dict[str, int] = { @@ -274,7 +264,7 @@ def update_per_arch_metrics_file( entry["unit"] = desc_data["unit"] rst_descriptions[section][metric_name] = entry - cm_utils.save_yaml(rst_descriptions, output_path, AUTOGEN_TEXT) + cm_utils.save_yaml(rst_descriptions, output_path) print(f"Updated: {output_path}") @@ -303,7 +293,7 @@ def update_docs_metrics_file( docs_path.parent.mkdir(parents=True, exist_ok=True) - cm_utils.save_yaml(existing, docs_path, AUTOGEN_TEXT) + cm_utils.save_yaml(existing, docs_path) return True diff --git a/projects/rocprofiler-compute/tools/config_management/parse_config_template.py b/projects/rocprofiler-compute/tools/config_management/parse_config_template.py index 15ca94699a..894a8fb968 100644 --- a/projects/rocprofiler-compute/tools/config_management/parse_config_template.py +++ b/projects/rocprofiler-compute/tools/config_management/parse_config_template.py @@ -1,8 +1,49 @@ #!/usr/bin/env python3 +############################################################################## +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +############################################################################## + """ -Parse panel configuration based on YAML files for an architecture. -Usage: - python parse_config_template.py [output_file.yaml] [--latest-arch ARCH] +parse_config_template.py + +Parse panel configuration based on YAML files for an architecture and, optionally, +generate a lightweight template describing panel IDs, titles, aliases, and +data-source ordering. 
+ +Usage +----- +Generate a template from an architecture directory: + + python tools/config_management/parse_config_template.py \ + analysis_configs/gfx950 \ + analysis_configs/config_template.yaml \ + --latest-arch gfx950 + +Inspect an architecture (no template written): + + python tools/config_management/parse_config_template.py \ + analysis_configs/gfx950 """ from __future__ import annotations @@ -12,45 +53,93 @@ import sys from pathlib import Path from typing import Any, Optional -try: - from . import utils as cm_utils -except Exception: - repo_root = Path(__file__).resolve().parents[1] - if str(repo_root) not in sys.path: - sys.path.insert(0, str(repo_root)) - try: - import config_management.utils as cm_utils # type: ignore - except Exception: - import utils as cm_utils # type: ignore +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) -AUTOGEN_TEXT = ( - "# AUTOGENERATED FILE. Only edit for testing purposes, not for development. " - "Generated by tools/config_management/parse_config_template.py\n" -) +from config_management import utils_ruamel as cm_utils # noqa: E402 -def parse_panel_config(yaml_file: Path) -> Optional[dict]: - """Parse a single YAML file and extract panel and data source info.""" +def normalize_panel_id(panel_id: Optional[int]) -> Optional[int]: + """Normalize panel ID by dividing by 100 if needed.""" + if panel_id is None: + return None + return panel_id // 100 if panel_id >= 100 else panel_id + + +def normalize_table_id(table_id: Optional[int]) -> Optional[int]: + """Normalize table ID using modulo 100.""" + if table_id is None: + return None + return table_id % 100 + + +def parse_panel_config(yaml_file: Path) -> Optional[dict[str, Any]]: + """ + Parse a single panel YAML file and extract template-relevant info. 
+ + Returns a dict with: + - file: panel filename (without leading numeric prefix) + - panel_id: normalized panel id (id // 100 when >= 100) + - panel_title: Panel Config.title + - panel_alias: Panel Config.alias (optional) + - data_sources: ordered list of + {type: , id: , title: } + or None if the file does not contain a valid Panel Config or fails basic checks. + """ data = cm_utils.load_yaml(yaml_file) panel_config = data.get("Panel Config") if not isinstance(panel_config, dict): + print(f"WARNING: {yaml_file} has no valid 'Panel Config' mapping, skipping.") + return None + + # Enforce presence of core panel-level keys + missing_keys: list[str] = [] + for key in ("id", "title", "data source", "metrics_description"): + if key not in panel_config: + missing_keys.append(key) + + if missing_keys: + missing_str = ", ".join(missing_keys) + print( + f"ERROR: {yaml_file} is missing required Panel Config keys: {missing_str}" + ) return None filename = ( yaml_file.name.split("_", 1)[1] if "_" in yaml_file.name else yaml_file.name ) - panel_id = panel_config.get("id") - if panel_id and panel_id >= 100: - panel_id = panel_id // 100 + raw_panel_id = panel_config.get("id") + if not isinstance(raw_panel_id, int): + print( + f"ERROR: {yaml_file} has non-integer or missing Panel Config.id " + f"({raw_panel_id!r})" + ) + return None - data_sources = [] - for ds in panel_config.get("data source", []): + panel_id = normalize_panel_id(raw_panel_id) + + # Extract and normalize data sources + data_sources: list[dict[str, Any]] = [] + ds_list = panel_config.get("data source", []) + if not isinstance(ds_list, list): + print( + f"ERROR: {yaml_file} has non-list 'data source' field " + f"({type(ds_list).__name__})" + ) + return None + + for ds in ds_list: + if not isinstance(ds, dict): + print(f"WARNING: {yaml_file} has non-dict data source entry: {ds!r}") + continue for key, value in ds.items(): if isinstance(value, dict) and "id" in value and "title" in value: + norm_id = 
normalize_table_id(value["id"]) data_sources.append({ "type": key, - "id": value["id"] % 100, + "id": norm_id, "title": value["title"], }) @@ -63,15 +152,65 @@ def parse_panel_config(yaml_file: Path) -> Optional[dict]: } +def build_template_from_directory( + directory: Path, + existing_panels_by_id: Optional[dict[int, dict]], +) -> list[dict]: + panels: list[dict] = [] + errors = 0 + + for yaml_file in sorted(directory.glob("*.yaml")): + info = parse_panel_config(yaml_file) + if info is None: + errors += 1 + continue + + panel_id = info.get("panel_id") + + if ( + existing_panels_by_id + and panel_id is not None + and panel_id in existing_panels_by_id + ): + old_panel = existing_panels_by_id[panel_id] + + # Preserve panel_alias unless explicitly set by panel YAML + if info.get("panel_alias") is None and "panel_alias" in old_panel: + info["panel_alias"] = old_panel["panel_alias"] + + panels.append(info) + + # Deterministic ordering for stable templates + panels.sort(key=lambda p: (p["panel_id"], p["file"])) + + if errors: + print( + f"\nEncountered {errors} panel file(s) with structural errors " + "while building template." + ) + + return panels + + def main() -> None: parser = argparse.ArgumentParser( - description="Parse panel configuration from YAML files" + description=( + "Parse panel YAML files for an architecture and optionally generate " + "a config_template-style YAML describing panel IDs and data sources." + ) + ) + parser.add_argument("directory", help="Directory containing panel YAML files") + parser.add_argument( + "output", + nargs="?", + help="Output YAML file (optional). 
If omitted, only a summary is printed.", ) - parser.add_argument("directory", help="Directory containing YAML files") - parser.add_argument("output", nargs="?", help="Output YAML file (optional)") parser.add_argument( "--latest-arch", - help="Specify this architecture as latest (adds metadata to output)", + help=( + "Specify this architecture as latest (adds 'latest_arch' metadata " + "to the generated template). Only used when an output file is given." + ), ) args = parser.parse_args() @@ -80,33 +219,45 @@ def main() -> None: print(f"Error: '{args.directory}' is not a valid directory") sys.exit(1) - results = [] - for yaml_file in sorted(directory.glob("*.yaml")): - parsed = parse_panel_config(yaml_file) - if parsed: - results.append(parsed) + existing_template = None + if args.output and Path(args.output).exists(): + existing_template = cm_utils.load_yaml(Path(args.output)) - if not results: - print("No valid panel configurations found.") + existing_panels_by_id = {} + if existing_template: + for p in existing_template.get("panels", []): + pid = p.get("panel_id") + if pid is not None: + existing_panels_by_id[pid] = p + + panels = build_template_from_directory( + directory, + existing_panels_by_id=existing_panels_by_id if args.output else None, + ) + + if not panels: + print("No valid panel YAML files found; nothing to do.") sys.exit(1) - for panel in results: - print(f"\n{'=' * 80}") - print(f"File: {panel['file']}") + # Always show a human-readable summary. + print(f"Found {len(panels)} panel(s) in {directory}:") + for panel in panels: + print(f"\nFile: {panel['file']}") print(f"Panel ID: {panel['panel_id']}") print(f"Panel Title: {panel['panel_title']}") - if panel.get("panel_alias"): + if panel["panel_alias"]: print(f"Panel Alias: {panel['panel_alias']}") print(f"\nData Sources ({len(panel['data_sources'])}):") for ds in panel["data_sources"]: print(f" - {ds['type']}: {ds['id']} - {ds['title']}") + # Optionally write a template YAML. 
if args.output: - output_data: Any = results + output_data: Any = {"panels": panels} if args.latest_arch: - output_data = {"latest_arch": args.latest_arch, "panels": results} - cm_utils.save_yaml(output_data, args.output, AUTOGEN_TEXT) - print(f"\nResults saved to: {args.output}") + output_data = {"latest_arch": args.latest_arch, "panels": panels} + cm_utils.save_yaml(output_data, Path(args.output)) + print(f"\nTemplate saved to: {args.output}") if __name__ == "__main__": diff --git a/projects/rocprofiler-compute/tools/config_management/utils.py b/projects/rocprofiler-compute/tools/config_management/utils.py deleted file mode 100644 index d79d65a257..0000000000 --- a/projects/rocprofiler-compute/tools/config_management/utils.py +++ /dev/null @@ -1,52 +0,0 @@ -############################################################################## -# MIT License -# -# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. - -############################################################################## - -from pathlib import Path -from typing import Optional, Union - -import yaml - - -def str_representer(dumper, data): - if "\n" in data: - return dumper.represent_scalar("tag:yaml.org,2002:str", data, style=">") - return dumper.represent_scalar("tag:yaml.org,2002:str", data) - - -yaml.add_representer(str, str_representer) - - -def load_yaml(filepath: Union[str, Path]) -> dict: - with open(filepath) as f: - return yaml.safe_load(f) or {} - - -def save_yaml( - data: dict, filepath: Union[str, Path], header: Optional[str] = None -) -> None: - with open(filepath, "w") as f: - if header: - f.write(header) - yaml.dump(data, f, sort_keys=False) diff --git a/projects/rocprofiler-compute/tools/config_management/utils_ruamel.py b/projects/rocprofiler-compute/tools/config_management/utils_ruamel.py new file mode 100644 index 0000000000..512ad03e4d --- /dev/null +++ b/projects/rocprofiler-compute/tools/config_management/utils_ruamel.py @@ -0,0 +1,92 @@ +############################################################################## +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+ +############################################################################## + +from pathlib import Path +from typing import Any, Union + +from ruamel.yaml import YAML +from ruamel.yaml.comments import CommentedMap + +# --- Round-trip YAML (for writing) --- +RT_YAML = YAML(typ="rt") +RT_YAML.preserve_quotes = True +RT_YAML.width = 4096 # prevent unwanted line wrapping +RT_YAML.indent(mapping=2, sequence=2, offset=0) +RT_YAML.explicit_start = False +RT_YAML.explicit_end = False + +# --- Read-only YAML (safe loader) --- +RO_YAML = YAML(typ="safe") +RO_YAML.width = 4096 + + +def load_yaml( + filepath: Union[str, Path], + *, + round_trip: bool = False, +) -> Any: + path = Path(filepath) + if not path.exists(): + raise FileNotFoundError(f"YAML file not found: {path}") + + yaml = RT_YAML if round_trip else RO_YAML + + with open(path, "r", encoding="utf-8") as f: + return yaml.load(f) or CommentedMap() + + +def save_yaml(data: Any, filepath: Union[str, Path]) -> None: + path = Path(filepath) + path.parent.mkdir(parents=True, exist_ok=True) + + with open(path, "w", encoding="utf-8") as f: + RT_YAML.dump(data, f) + + +def strip_existing_header(yaml_data) -> None: + ca = getattr(yaml_data, "ca", None) + if not ca or not hasattr(ca, "comment") or ca.comment is None: + return + + original = ca.comment + + cleaned = [] + + for block in original: + if block is None: + cleaned.append(None) + continue + + new_block = [token for token in block if "AUTOGENERATED" not in token.value] + + if not new_block: + cleaned.append(None) + else: + cleaned.append(new_block) + + if len(cleaned) < 2: + cleaned.extend([None] * (2 - len(cleaned))) + + ca.comment = cleaned diff --git a/projects/rocprofiler-compute/tools/config_management/verify_against_config_template.py b/projects/rocprofiler-compute/tools/config_management/verify_against_config_template.py index 32b9044edb..0930764c86 100644 --- a/projects/rocprofiler-compute/tools/config_management/verify_against_config_template.py +++ 
b/projects/rocprofiler-compute/tools/config_management/verify_against_config_template.py @@ -23,155 +23,318 @@ # THE SOFTWARE. ############################################################################## + """ -Validate panel YAML files against base template ordering. -Checks that panel configs match expected structure, IDs, titles, and data source order. +verify_against_config_template.py + +Validate per-architecture panel YAMLs against a shared config template. +- Validate structure + ordering only. +- Treat any deviation as an error. +- Collect all errors and report at end. + +Template format (generated by parse_config_template.py): + latest_arch: gfx### (optional) + panels: + - file: <filename without numeric prefix> + panel_id: <normalized panel id> + panel_title: <title> + panel_alias: <optional> + data_sources: + - type: metric_table|raw_csv_table|... + id: <normalized table id> + title: <title> Usage: - python verify_against_config_template.py <analysis_configs_dir> <template_yaml> + python verify_against_config_template.py <analysis_configs_dir> <template_yaml> """ from __future__ import annotations +import argparse import sys +from dataclasses import dataclass from pathlib import Path -from typing import Optional +from typing import Any, Optional -import yaml +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + +from config_management import utils_ruamel as cm_utils # noqa: E402 + +REQUIRED_PANEL_KEYS = ("id", "title", "data source", "metrics_description") +OPTIONAL_PANEL_KEYS = ("alias",) +DEFAULT_ALLOWED_PANEL_KEYS = set(REQUIRED_PANEL_KEYS) | set(OPTIONAL_PANEL_KEYS) def normalize_panel_id(panel_id: int) -> int: - """Normalize panel ID by dividing by 100.""" - return panel_id // 100 if panel_id and panel_id >= 100 else panel_id + return panel_id // 100 if panel_id >= 100 else panel_id -def normalize_table_id(table_id: int) -> Optional[int]: - """Normalize table ID using 
modulo 100.""" - return table_id % 100 if table_id else None +def normalize_table_id(table_id: int) -> int: + return table_id % 100 -def load_template(template_file: Path) -> dict[int, dict]: - """Load template and create lookup by normalized panel ID.""" - with open(template_file) as f: - data = yaml.safe_load(f) or {} - - panels = data.get("panels", []) - lookup: dict[int, dict] = {} - for panel in panels: - pid = normalize_panel_id(panel["panel_id"]) - lookup[pid] = { - "panel_title": panel["panel_title"], - "panel_alias": panel.get("panel_alias"), - "data_sources": [ - {"type": ds["type"], "id": ds["id"], "title": ds["title"]} - for ds in panel.get("data_sources", []) - ], - } - return lookup +@dataclass(frozen=True) +class TemplateDataSource: + type: str + id: int + title: str -def extract_panel_info(yaml_file: Path) -> Optional[dict]: - """Extract panel config info from YAML file.""" - with open(yaml_file) as f: - data = yaml.safe_load(f) or {} - if "Panel Config" not in data: - return None - - panel_config = data["Panel Config"] - data_sources = [] - for ds in panel_config.get("data source", []): - for key, value in ds.items(): - if isinstance(value, dict) and "id" in value and "title" in value: - data_sources.append({ - "type": key, - "id": normalize_table_id(value["id"]), - "title": value["title"], - }) - - return { - "panel_id": normalize_panel_id(panel_config.get("id")), - "panel_title": panel_config.get("title"), - "data_sources": data_sources, - } +@dataclass(frozen=True) +class TemplatePanel: + file: str + panel_id: int + panel_title: str + panel_alias: Any + data_sources: tuple[TemplateDataSource, ...] 
-def validate_panel( - yaml_file: Path, panel_info: dict, template: dict[int, dict], stats: dict -) -> None: - """Validate a single panel YAML against template.""" - panel_id = panel_info["panel_id"] - file_path = f"{yaml_file.parent.name}/{yaml_file.name}" +def _as_str(v: Any) -> str: + return "" if v is None else str(v) - if panel_id not in template: - print(f"WARNING [{file_path}]: Panel ID {panel_id} not found in template") - stats["warnings"] += 1 - return - expected = template[panel_id] +def load_template( + template_file: Path, +) -> tuple[list[TemplatePanel], dict[int, TemplatePanel]]: + data = cm_utils.load_yaml(template_file) or {} + panels_raw = data.get("panels", []) + if not isinstance(panels_raw, list): + raise ValueError("Template YAML must contain a top-level 'panels' list") + + panels: list[TemplatePanel] = [] + by_id: dict[int, TemplatePanel] = {} + + for idx, p in enumerate(panels_raw): + if not isinstance(p, dict): + raise ValueError(f"Template panels[{idx}] must be a mapping") + if "panel_id" not in p or "panel_title" not in p: + raise ValueError( + f"Template panels[{idx}] missing 'panel_id' or 'panel_title'" + ) + + pid_raw = p.get("panel_id") + if not isinstance(pid_raw, int): + raise ValueError( + f"Template panels[{idx}].panel_id must be int, got {pid_raw!r}" + ) + pid = normalize_panel_id(pid_raw) + + ds_list = p.get("data_sources", []) or [] + if not isinstance(ds_list, list): + raise ValueError(f"Template panels[{idx}].data_sources must be list") + + ds_out: list[TemplateDataSource] = [] + for j, ds in enumerate(ds_list): + if not isinstance(ds, dict): + raise ValueError( + f"Template panels[{idx}].data_sources[{j}] must be mapping" + ) + for k in ("type", "id", "title"): + if k not in ds: + raise ValueError( + f"Template panels[{idx}].data_sources[{j}] missing '{k}'" + ) + + ds_id = ds["id"] + if not isinstance(ds_id, int): + raise ValueError( + f"Template panels[{idx}].data_sources[{j}].id must be int, " + f"got {ds_id!r}" + ) + + 
ds_out.append( + TemplateDataSource( + type=_as_str(ds["type"]), + id=normalize_table_id(ds_id), + title=_as_str(ds["title"]), + ) + ) + + panel = TemplatePanel( + file=_as_str(p.get("file", "")), + panel_id=pid, + panel_title=_as_str(p.get("panel_title")), + panel_alias=p.get("panel_alias"), + data_sources=tuple(ds_out), + ) + + if pid in by_id: + raise ValueError(f"Duplicate panel_id {pid} in template") + + panels.append(panel) + by_id[pid] = panel + + return panels, by_id + + +def extract_panel_info( + yaml_file: Path, +) -> tuple[Optional[int], dict[str, Any], list[dict[str, Any]]]: + """Return (panel_id, panel_config, extracted_data_sources).""" + data = cm_utils.load_yaml(yaml_file) or {} + panel_config = data.get("Panel Config") + if not isinstance(panel_config, dict): + return None, {}, [] + + pid_raw = panel_config.get("id") + pid = normalize_panel_id(pid_raw) if isinstance(pid_raw, int) else None + + ds_extracted: list[dict[str, Any]] = [] + ds_list = panel_config.get("data source", []) + if isinstance(ds_list, list): + for item in ds_list: + if not isinstance(item, dict): + continue + for ds_type, value in item.items(): + if ( + isinstance(value, dict) + and isinstance(value.get("id"), int) + and "title" in value + ): + ds_extracted.append({ + "type": str(ds_type), + "id": normalize_table_id(value["id"]), + "title": _as_str(value.get("title")), + }) + + return pid, panel_config, ds_extracted + + +def validate_arch( + arch_dir: Path, + template_panels: list[TemplatePanel], + template_by_id: dict[int, TemplatePanel], + allowed_panel_keys: set[str], +) -> list[str]: + """Validate one architecture directory. 
Returns list of errors.""" errors: list[str] = [] - warnings: list[str] = [] - if panel_info["panel_title"] != expected["panel_title"]: - errors.append( - f"Panel title mismatch: expected '{expected['panel_title']}', " - f"got '{panel_info['panel_title']}'" - ) + panel_files = sorted(arch_dir.glob("*.yaml")) + actual_by_id: dict[int, Path] = {} + actual_order: list[int] = [] - if len(panel_info["data_sources"]) != len(expected["data_sources"]): - errors.append( - f"Data source count mismatch: expected {len(expected['data_sources'])}, " - f"got {len(panel_info['data_sources'])}" - ) + for f in panel_files: + pid, panel_config, ds_actual = extract_panel_info(f) + rel = f"{arch_dir.name}/{f.name}" - for i, actual_ds in enumerate(panel_info["data_sources"]): - matching_idx = next( - ( - j - for j, exp_ds in enumerate(expected["data_sources"]) - if actual_ds["id"] == exp_ds["id"] - and actual_ds["title"] == exp_ds["title"] - and actual_ds["type"] == exp_ds["type"] - ), - None, - ) - if matching_idx is None: + if pid is None: + errors.append(f"ERROR [{rel}]: Missing or non-integer Panel Config.id") + continue + + # required keys + missing = [k for k in REQUIRED_PANEL_KEYS if k not in panel_config] + if missing: errors.append( - f"Data source {i + 1}: No matching entry in template for " - f"{actual_ds['type']} id={actual_ds['id']} title='{actual_ds['title']}'" - ) - elif matching_idx != i: - warnings.append( - f"Data source {i + 1}: Order mismatch - appears at position {i + 1} " - f"but expected at position {matching_idx + 1}" + f"ERROR [{rel}]: Missing required Panel Config keys: " + f"{', '.join(missing)}" ) - if errors: - print(f"ERROR [{file_path}]:") - for error in errors: - print(f" - {error}") - stats["errors"] += len(errors) - stats["failed_files"] += 1 - elif warnings: - print(f"WARNING [{file_path}]:") - for warning in warnings: - print(f" - {warning}") - stats["warnings"] += len(warnings) - stats["passed_files"] += 1 - else: - print(f"PASS [{file_path}]") - 
stats["passed_files"] += 1 + # prohibited keys (unknown keys) + for k in panel_config.keys(): + if k not in allowed_panel_keys: + errors.append( + f"ERROR [{rel}]: Prohibited/unknown Panel Config key '{k}' " + f"(allowed: {sorted(allowed_panel_keys)})" + ) + + # panel must exist in template + if pid not in template_by_id: + errors.append(f"ERROR [{rel}]: Panel ID {pid} not found in template") + else: + expected = template_by_id[pid] + actual_title = _as_str(panel_config.get("title")) + if actual_title != expected.panel_title: + errors.append( + f"ERROR [{rel}]: Panel title mismatch for id {pid}: " + f"expected '{expected.panel_title}', got '{actual_title}'" + ) + + # data sources must match count + order strictly + if len(ds_actual) != len(expected.data_sources): + errors.append( + f"ERROR [{rel}]: Data source count mismatch for panel " + f"{pid}: expected {len(expected.data_sources)}, " + f"got {len(ds_actual)}" + ) + + for i, exp_ds in enumerate(expected.data_sources): + if i >= len(ds_actual): + break + act = ds_actual[i] + if ( + act["type"] != exp_ds.type + or act["id"] != exp_ds.id + or act["title"] != exp_ds.title + ): + errors.append( + f"ERROR [{rel}]: Data source #{i + 1} mismatch " + f"for panel {pid}: expected {exp_ds.type} id={exp_ds.id} " + f"title='{exp_ds.title}', got {act['type']} " + f"id={act['id']} title='{act['title']}'" + ) + + # duplicates + if pid in actual_by_id: + errors.append( + f"ERROR [{rel}]: Duplicate panel id {pid} " + f"(also in {arch_dir.name}/{actual_by_id[pid].name})" + ) + else: + actual_by_id[pid] = f + actual_order.append(pid) + + # missing / extra panels + expected_ids = [p.panel_id for p in template_panels] + actual_ids = set(actual_by_id.keys()) + expected_set = set(expected_ids) + + for pid in expected_ids: + if pid not in actual_ids: + errors.append( + f"ERROR [{arch_dir.name}]: Missing panel id {pid} required by template" + ) + + for pid in sorted(actual_ids - expected_set): + errors.append( + f"ERROR 
[{arch_dir.name}/{actual_by_id[pid].name}]: " + f"Extra panel id {pid} not present in template" + ) + + # panel ordering (based on file sorting) + expected_order = [pid for pid in expected_ids if pid in actual_ids] + if actual_order and expected_order and actual_order != expected_order: + for i, (a, e) in enumerate(zip(actual_order, expected_order)): + if a != e: + errors.append( + f"ERROR [{arch_dir.name}]: Panel file order mismatch at position " + f"{i + 1}: expected panel id {e}, got {a} " + "(files must follow template order)" + ) + break + + return errors def main() -> None: - if len(sys.argv) != 3: - print( - "Usage: python verify_against_config_template.py " - "<analysis_configs_dir> <template_yaml>" - ) - sys.exit(1) + parser = argparse.ArgumentParser( + description="Validate per-arch panel YAMLs against a shared config template." + ) + parser.add_argument( + "analysis_configs_dir", help="Directory containing architecture subdirs" + ) + parser.add_argument("template_yaml", help="Template YAML (config_template.yaml)") + parser.add_argument( + "--allow-panel-key", + action="append", + default=[], + help="Allow an additional key under 'Panel Config' (repeatable)", + ) + args = parser.parse_args() - configs_dir = Path(sys.argv[1]) - template_file = Path(sys.argv[2]) + configs_dir = Path(args.analysis_configs_dir) + template_file = Path(args.template_yaml) if not configs_dir.is_dir(): print(f"Error: {configs_dir} is not a directory") @@ -180,45 +343,40 @@ def main() -> None: print(f"Error: {template_file} is not a file") sys.exit(1) + template_panels, template_by_id = load_template(template_file) + allowed_panel_keys = set(DEFAULT_ALLOWED_PANEL_KEYS) | set(args.allow_panel_key) print(f"Loading template from {template_file}") - template = load_template(template_file) - print(f"Template loaded: {len(template)} panels\n") + print(f"Template loaded: {len(template_panels)} panels\n") - stats = { - "total_files": 0, - "passed_files": 0, - "failed_files": 0, - 
"errors": 0, - "warnings": 0, - } + all_errors: list[str] = [] + total_arches = 0 for arch_dir in sorted(configs_dir.iterdir()): if not arch_dir.is_dir(): continue + total_arches += 1 print(f"{'=' * 80}\nValidating architecture: {arch_dir.name}\n{'=' * 80}") - for yaml_file in sorted(arch_dir.glob("*.yaml")): - stats["total_files"] += 1 - panel_info = extract_panel_info(yaml_file) - if panel_info: - validate_panel(yaml_file, panel_info, template, stats) - else: - print(f"ERROR [{arch_dir.name}/{yaml_file.name}]: Invalid panel config") - stats["errors"] += 1 - stats["failed_files"] += 1 + arch_errors = validate_arch( + arch_dir=arch_dir, + template_panels=template_panels, + template_by_id=template_by_id, + allowed_panel_keys=allowed_panel_keys, + ) + if arch_errors: + for e in arch_errors: + print(e) + all_errors.extend(arch_errors) + else: + print(f"PASS [{arch_dir.name}]: All panel YAMLs match template") print() print(f"{'=' * 80}\nVALIDATION SUMMARY\n{'=' * 80}") - print(f"Total files checked: {stats['total_files']}") - print(f"Passed: {stats['passed_files']}") - print(f"Failed: {stats['failed_files']}") - print(f"Total errors: {stats['errors']}") - print(f"Total warnings: {stats['warnings']}") + print(f"Architectures checked: {total_arches}") + print(f"Total errors: {len(all_errors)}") - if stats["failed_files"] > 0: + if all_errors: print("\nValidation FAILED") sys.exit(1) - elif stats["warnings"] > 0: - print("\nValidation PASSED with warnings") else: print("\nValidation PASSED") diff --git a/projects/rocprofiler-compute/tools/split_config.py b/projects/rocprofiler-compute/tools/split_config.py deleted file mode 100644 index d1f0a55ca3..0000000000 --- a/projects/rocprofiler-compute/tools/split_config.py +++ /dev/null @@ -1,307 +0,0 @@ -############################################################################## -# MIT License -# -# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. 
-# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. - -############################################################################## - -# NOTES -# -# Read tools/unified_config.yaml and split it into per gfx architecture per panel -# config files. WARNING: This script will overwrite existing files under per gfx -# architecture folders under src/rocprof_compute_soc/analysis_configs. -# -# Read tools/unified_config.yaml and split it into metric tables per documentation -# section. -# WARNING: This script will overwrite existing docs/data/metrics_description.yaml. 
- -import copy -import hashlib -import re -from pathlib import Path - -import yaml - -# Get root directory of the project -ROOT_DIR = Path(__file__).parent.parent -SOURCE_DIR = ROOT_DIR / "tools" -TARGET_DIR = ROOT_DIR / "src" / "rocprof_compute_soc" / "analysis_configs" -SETS_TARGET_DIR = ROOT_DIR / "src" / "rocprof_compute_soc" / "profile_configs" / "sets" -DOC_TARGET_DIR = ROOT_DIR / "docs" / "data" -AUTOGEN_TEXT = ( - "# AUTOGENERATED FILE. Only edit for testing purposes, not for development. " - "Generated from tools/unified_config.yaml. Generated by tools/split_config.py\n" -) -HASH_FILE = ROOT_DIR / "tools" / "autogen_hash.yaml" -HASH_FILE_MAP = {} -GFX_VERSIONS = ["gfx908", "gfx90a", "gfx940", "gfx941", "gfx942", "gfx950"] -METRIC_ID_TO_NAME_MAP = {gfx_version: {} for gfx_version in GFX_VERSIONS} - - -def str_representer(dumper, data): - if "\n" in data: - return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") - return dumper.represent_scalar("tag:yaml.org,2002:str", data) - - -yaml.add_representer(str, str_representer) - - -def get_autogen_text(config_file="tools/unified_config.yaml"): - return ( - f"# AUTOGENERATED FILE. Only edit for testing purposes, " - f"not for development. Generated from {config_file}. 
" - f"Generated by tools/split_config.py\n" - ) - - -def update_analysis_config(): - global METRIC_ID_TO_NAME_MAP - - # Read the unified config file - with open(SOURCE_DIR / "unified_config.yaml") as file: - unified_config = yaml.safe_load(file) - - # Create per panel config file - for panel_config in unified_config["panels"]: - new_panel_config = {"Panel Config": {}} - new_panel_config["Panel Config"]["id"] = panel_config["id"] - new_panel_config["Panel Config"]["title"] = panel_config["title"] - - panel_id_int = panel_config["id"] - # Convert int into str with 4 digits - panel_id = str(panel_config["id"]).zfill(4) - # Replace parentehsis, hyphen, slash and space with underscore - # Remove duplicate underscore - # Convert to lower case - panel_title = re.sub(r"[()\-/ ]+", "_", panel_config["title"]) - panel_title = "_".join(filter(None, panel_title.split("_"))) - panel_title = panel_title.lower() - - for gfx_version in GFX_VERSIONS: - # Create per gfx architecture folder - gfx_dir = TARGET_DIR / gfx_version - # Create directory if it doesn't exist - if not gfx_dir.exists(): - gfx_dir.mkdir() - print(f"Created directory: {gfx_dir}") - - # Collect metrics for this gfx_version - gfx_metrics = [] - - # Select metrics from current gfx arch - new_panel_config["Panel Config"]["data source"] = [] - for data_source_index, data_source_config in enumerate( - panel_config["data source"] - ): - data_source_config = copy.deepcopy(data_source_config) - if "metric_table" in data_source_config: - data_source_config["metric_table"]["metric"] = data_source_config[ - "metric_table" - ]["metric"][gfx_version] - - # Collect metric names for this gfx version (preserve order) - for metric_name in data_source_config["metric_table"][ - "metric" - ].keys(): - if metric_name not in gfx_metrics: - gfx_metrics.append(metric_name) - - build_metric_id_mapping( - panel_id_int, - data_source_index, - data_source_config["metric_table"]["metric"], - gfx_version, - ) - new_panel_config["Panel 
Config"]["data source"].append( - data_source_config - ) - - # Only include metric descriptions for metrics that exist in this gfx - new_panel_config["Panel Config"]["metrics_description"] = { - key: value["plain"].strip() - for key, value in panel_config.get("metrics_description", {}).items() - if key in gfx_metrics - } - - # Write panel config to file - filename = TARGET_DIR / gfx_version / f"{panel_id}_{panel_title}.yaml" - with open(filename, "w") as file: - file.write(get_autogen_text()) - yaml.dump(new_panel_config, file, sort_keys=False) - print(f"File write: {filename}") - # Calculate hash of filename - HASH_FILE_MAP[str(filename.relative_to(ROOT_DIR))] = hashlib.sha256( - filename.read_bytes() - ).hexdigest() - - -def build_metric_id_mapping(panel_id, data_source_index, metrics, gfx_version): - # Build metric id to metric name mapping - global METRIC_ID_TO_NAME_MAP - for metric_index, metric_name in enumerate(metrics.keys()): - metric_id = f"{panel_id // 100}.{data_source_index + 1}.{metric_index}" - METRIC_ID_TO_NAME_MAP[gfx_version][str(metric_id)] = metric_name - - -def update_sets_config(): - # Create directory if it doesn't exist - if not SETS_TARGET_DIR.exists(): - SETS_TARGET_DIR.mkdir() - print(f"Created directory: {SETS_TARGET_DIR}") - - # Read the unified config file - with open(SOURCE_DIR / "unified_sets.yaml") as file: - unified_sets = yaml.safe_load(file) - - # Create per gfx version file - for gfx_version in GFX_VERSIONS: - new_sets = {"sets": []} - - for sets in unified_sets["sets"]: - # Create new set object for each set - current_set = { - "title": sets["title"], - "set_option": sets["set_option"], - "description": sets["description"], - "metric": [], - } - - for metric_id in sets["metric"][gfx_version]: - current_set["metric"].append({ - metric_id: METRIC_ID_TO_NAME_MAP[gfx_version][str(metric_id)] - }) - - new_sets["sets"].append(current_set) - - # Write gfx version sets to file - filename = SETS_TARGET_DIR / f"{gfx_version}_sets.yaml" - 
with open(filename, "w") as file: - file.write(get_autogen_text("tools/unified_sets.yaml")) - yaml.dump(new_sets, file, sort_keys=False) - print(f"File write: {filename}") - # Calculate hash of filename - HASH_FILE_MAP[str(filename.relative_to(ROOT_DIR))] = hashlib.sha256( - filename.read_bytes() - ).hexdigest() - - -def update_documentation(): - # Documentation sections - section_panel_map = { - "Wavefront launch stats": 701, - "Wavefront runtime stats": 702, - "Overall instruction mix": 1001, - "VALU arithmetic instruction mix": 1002, - "MFMA instruction mix": 1004, - "Compute Speed-of-Light": 1101, - "Pipeline statistics": 1102, - "Arithmetic operations": 1103, - "LDS Speed-of-Light": 1201, - "LDS Statistics": 1202, - "vL1D Speed-of-Light": 1601, - "Busy / stall metrics": 1501, - "Instruction counts": 1502, - "Spill / stack metrics": 1503, - "L1 Unified Translation Cache (UTCL1)": 1605, - "vL1D cache stall metrics": 1602, - "vL1D cache access metrics": 1603, - "Vector L1 data-return path or Texture Data (TD)": 1504, - "L2 Speed-of-Light": 1701, - "L2 cache accesses": 1703, - "L2-Fabric interface metrics": 1702, - "L2 - Fabric interface detailed metrics": 1706, - "L2 - Fabric Interface stalls": 1705, - "Scalar L1D Speed-of-Light": 1401, - "Scalar L1D cache accesses": 1402, - "Scalar L1D Cache - L2 Interface": 1403, - "L1I Speed-of-Light": 1301, - "L1I cache accesses": 1302, - "L1I <-> L2 interface": 1303, - "Workgroup manager utilizations": 601, - "Workgroup Manager - Resource Allocation": 602, - "Command processor fetcher (CPF)": 501, - "Command processor packet processor (CPC)": 502, - "System Speed-of-Light": 201, - } - - # Read the unified config file - with open(SOURCE_DIR / "unified_config.yaml") as file: - unified_config = yaml.safe_load(file) - - panel_metric_map = {} - for panel_config in unified_config["panels"]: - for data_source in panel_config["data source"]: - if "metric_table" in data_source: - metrics_info = {} - # Metric names from data source - 
metric_names = { - metric - for _, gfx_data in data_source["metric_table"]["metric"].items() - for metric in gfx_data - } - # Select metrics with descriptions available - metric_names = metric_names.intersection( - panel_config["metrics_description"].keys() - ) - # Add metrics info - for metric_name in sorted(list(metric_names)): - metrics_info[metric_name] = { - "rst": panel_config["metrics_description"][metric_name][ - "rst" - ].strip(), - "unit": panel_config["metrics_description"][metric_name][ - "unit" - ], - } - panel_metric_map[data_source["metric_table"]["id"]] = metrics_info - - # Merge panel_metric_map with section_panel_map - section_metric_map = {} - for section, panel_id in section_panel_map.items(): - if panel_id in panel_metric_map: - section_metric_map[section] = panel_metric_map[panel_id] - - # Write documentation metrics description file - filename = DOC_TARGET_DIR / "metrics_description.yaml" - with open(filename, "w") as file: - file.write(get_autogen_text()) - yaml.dump(section_metric_map, file, sort_keys=False) - print(f"File write: {filename}") - # Calculate hash of filename - HASH_FILE_MAP[str(filename.relative_to(ROOT_DIR))] = hashlib.sha256( - filename.read_bytes() - ).hexdigest() - - -def update_hash(): - # Write hash file - with open(HASH_FILE, "w") as file: - file.write(get_autogen_text()) - yaml.dump(HASH_FILE_MAP, file, sort_keys=False) - print(f"File write: {HASH_FILE}") - - -if __name__ == "__main__": - update_analysis_config() - update_sets_config() - update_documentation() - update_hash() diff --git a/projects/rocprofiler-compute/tools/unified_config.yaml b/projects/rocprofiler-compute/tools/unified_config.yaml deleted file mode 100644 index d157b14ac1..0000000000 --- a/projects/rocprofiler-compute/tools/unified_config.yaml +++ /dev/null @@ -1,17736 +0,0 @@ -# NOTE: Please run tools/split_config.py after making changes to this file to auto-generate configs -panels: -- id: 0 - title: Top Stats - data source: - - raw_csv_table: - 
id: 1 - title: Top Kernels - source: pmc_kernel_top.csv - - raw_csv_table: - id: 2 - title: Dispatch List - source: pmc_dispatch_info.csv -- id: 100 - title: System Info - data source: - - raw_csv_table: - id: 101 - title: System Info - source: sysinfo.csv - columnwise: true -- id: 200 - title: System Speed-of-Light - data source: - - metric_table: - id: 201 - title: System Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - peak: Peak - pop: Pct of Peak - metric: - gfx90a: - VALU FLOPs: - value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) - / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) - VALU IOPs: - value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - - Start_Timestamp))) - unit: GIOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / - (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) - * 64) * 2) / 1000)) - MFMA FLOPs (BF16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 1024) / 
1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000)) - MFMA FLOPs (F16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000)) - MFMA FLOPs (F32): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - MFMA FLOPs (F64): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - MFMA IOPs (Int8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GIOP/s - peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000)) - Active CUs: - value: $numActiveCUs - unit: CUs - peak: $cu_per_gpu - pop: ((100 * $numActiveCUs) / $cu_per_gpu) - SALU Utilization: - value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - peak: 100 - pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - VALU Utilization: - value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu))) - unit: pct - peak: 100 - pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - MFMA Utilization: - value: 
AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu) * 4))) - unit: pct - peak: 100 - pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu) * 4))) - VMEM Utilization: - value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) - / $cu_per_gpu)) - unit: pct - peak: 100 - pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) - / $cu_per_gpu)) - Branch Utilization: - value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / - $cu_per_gpu)) - unit: pct - peak: 100 - pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - VALU Active Threads: - value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - unit: Threads - peak: 64 - pop: (AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) * 1.5625) - IPC: - value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - unit: Instr/cycle - peak: 5 - pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5) - Wavefront Occupancy: - value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - unit: Wavefronts - peak: ($max_waves_per_cu * $cu_per_gpu) - pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu - * $cu_per_gpu)))) - coll_level: SQ_LEVEL_WAVES - Theoretical LDS Bandwidth: - value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: (($max_sclk * $cu_per_gpu) * 0.128) - pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * - 0.00128))) - LDS Bank Conflicts/Access: - value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - unit: Conflicts/access - peak: 32 - 
pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) / - 32) - vL1D Cache Hit Rate: - value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None)) - unit: pct - peak: 100 - pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None)) - vL1D Cache BW: - value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: ((($max_sclk / 1000) * 64) * $cu_per_gpu) - pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk / 1000) * 64) * $cu_per_gpu)) - L2 Cache Hit Rate: - value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - unit: pct - peak: 100 - pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - L2 Cache BW: - value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan)) - pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan))) - L2-Fabric Read BW: - value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) - * 64)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: $hbmBandwidth - pop: ((100 * AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - - TCC_EA_RDREQ_32B_sum) * 64)) / (End_Timestamp - Start_Timestamp)))) - / $hbmBandwidth) - L2-Fabric Write BW: - value: AVG((((TCC_EA_WRREQ_64B_sum * 64) 
+ ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: $hbmBandwidth - pop: ((100 * AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - - TCC_EA_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) - / $hbmBandwidth) - L2-Fabric Read Latency: - value: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum - != 0) else None)) - unit: Cycles - peak: None - pop: None - L2-Fabric Write Latency: - value: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum - != 0) else None)) - unit: Cycles - peak: None - pop: None - sL1D Cache Hit Rate: - value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - unit: pct - peak: 100 - pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - sL1D Cache BW: - value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) - unit: GB/s - peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) - pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) - * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) - L1I Hit Rate: - value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - unit: pct - peak: 100 - pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - L1I BW: - value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) - unit: GB/s - peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) - pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) - * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) - L1I Fetch Latency: - value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - unit: Cycles - peak: None - pop: None - coll_level: SQ_IFETCH_LEVEL - gfx941: - VALU FLOPs: - value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (2 * 
SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) - / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) - VALU IOPs: - value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - - Start_Timestamp))) - unit: GIOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / - (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) - * 64) * 2) / 1000)) - MFMA FLOPs (F8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - MFMA FLOPs (BF16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) - MFMA FLOPs (F16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * 
$cu_per_gpu) * 2048) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) - MFMA FLOPs (F32): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - MFMA FLOPs (F64): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - MFMA IOPs (Int8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GIOP/s - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - Active CUs: - value: $numActiveCUs - unit: CUs - peak: $cu_per_gpu - pop: ((100 * $numActiveCUs) / $cu_per_gpu) - SALU Utilization: - value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - peak: 100 - pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - VALU Utilization: - value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu))) - unit: pct - peak: 100 - pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - MFMA Utilization: - value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu) * 4))) - unit: pct - peak: 100 - pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu) * 4))) - VMEM Utilization: - value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) 
/ $GRBM_GUI_ACTIVE_PER_XCD) - / $cu_per_gpu)) - unit: pct - peak: 100 - pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) - / $cu_per_gpu)) - Branch Utilization: - value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / - $cu_per_gpu)) - unit: pct - peak: 100 - pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - VALU Active Threads: - value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - unit: Threads - peak: $wave_size - pop: (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size) - if (SQ_ACTIVE_INST_VALU != 0) else None)) - IPC: - value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - unit: Instr/cycle - peak: 5 - pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5) - Wavefront Occupancy: - value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - unit: Wavefronts - peak: ($max_waves_per_cu * $cu_per_gpu) - pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu - * $cu_per_gpu)))) - coll_level: SQ_LEVEL_WAVES - Theoretical LDS Bandwidth: - value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: (($max_sclk * $cu_per_gpu) * 0.128) - pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * - 0.00128))) - LDS Bank Conflicts/Access: - value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - unit: Conflicts/access - peak: 32 - pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) / - 32) - vL1D Cache Hit Rate: - value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + 
TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None)) - unit: pct - peak: 100 - pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None)) - vL1D Cache BW: - value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu) - pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) - L2 Cache Hit Rate: - value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - unit: pct - peak: 100 - pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - L2 Cache BW: - value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)) - pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) - L2-Fabric Read BW: - value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - - Start_Timestamp)) - unit: GB/s - peak: $hbmBandwidth - pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - - Start_Timestamp)))) / $hbmBandwidth) - L2-Fabric Write BW: - value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: $hbmBandwidth - pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + 
((TCC_EA0_WRREQ_sum - - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) - / $hbmBandwidth) - L2-Fabric Read Latency: - value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - unit: Cycles - peak: None - pop: None - L2-Fabric Write Latency: - value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - unit: Cycles - peak: None - pop: None - sL1D Cache Hit Rate: - value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - unit: pct - peak: 100 - pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - sL1D Cache BW: - value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) - unit: GB/s - peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) - pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) - * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) - L1I Hit Rate: - value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - unit: pct - peak: 100 - pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - L1I BW: - value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) - unit: GB/s - peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) - pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) - * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) - L1I Fetch Latency: - value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - unit: Cycles - peak: None - pop: None - coll_level: SQ_IFETCH_LEVEL - gfx940: - VALU FLOPs: - value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + 
SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) - / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) - VALU IOPs: - value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - - Start_Timestamp))) - unit: GIOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / - (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) - * 64) * 2) / 1000)) - MFMA FLOPs (F8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - MFMA FLOPs (BF16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) - MFMA FLOPs (F16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) - MFMA FLOPs 
(F32): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - MFMA FLOPs (F64): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - MFMA IOPs (Int8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GIOP/s - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - Active CUs: - value: $numActiveCUs - unit: CUs - peak: $cu_per_gpu - pop: ((100 * $numActiveCUs) / $cu_per_gpu) - SALU Utilization: - value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - peak: 100 - pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - VALU Utilization: - value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu))) - unit: pct - peak: 100 - pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - MFMA Utilization: - value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu) * 4))) - unit: pct - peak: 100 - pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu) * 4))) - VMEM Utilization: - value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) - / $cu_per_gpu)) - unit: pct - peak: 100 - pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) - / $cu_per_gpu)) - 
Branch Utilization: - value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / - $cu_per_gpu)) - unit: pct - peak: 100 - pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - VALU Active Threads: - value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - unit: Threads - peak: $wave_size - pop: (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size) - if (SQ_ACTIVE_INST_VALU != 0) else None)) - IPC: - value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - unit: Instr/cycle - peak: 5 - pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5) - Wavefront Occupancy: - value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - unit: Wavefronts - peak: ($max_waves_per_cu * $cu_per_gpu) - pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu - * $cu_per_gpu)))) - coll_level: SQ_LEVEL_WAVES - Theoretical LDS Bandwidth: - value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: (($max_sclk * $cu_per_gpu) * 0.128) - pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * - 0.00128))) - LDS Bank Conflicts/Access: - value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - unit: Conflicts/access - peak: 32 - pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) / - 32) - vL1D Cache Hit Rate: - value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None)) - unit: pct - peak: 100 - pop: 
AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None)) - vL1D Cache BW: - value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu) - pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) - L2 Cache Hit Rate: - value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - unit: pct - peak: 100 - pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - L2 Cache BW: - value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)) - pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) - L2-Fabric Read BW: - value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - - Start_Timestamp)) - unit: GB/s - peak: $hbmBandwidth - pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - - Start_Timestamp)))) / $hbmBandwidth) - L2-Fabric Write BW: - value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: $hbmBandwidth - pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) - / $hbmBandwidth) - L2-Fabric Read Latency: - value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / 
TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - unit: Cycles - peak: None - pop: None - L2-Fabric Write Latency: - value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - unit: Cycles - peak: None - pop: None - sL1D Cache Hit Rate: - value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - unit: pct - peak: 100 - pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - sL1D Cache BW: - value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) - unit: GB/s - peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) - pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) - * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) - L1I Hit Rate: - value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - unit: pct - peak: 100 - pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - L1I BW: - value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) - unit: GB/s - peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) - pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) - * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) - L1I Fetch Latency: - value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - unit: Cycles - peak: None - pop: None - coll_level: SQ_IFETCH_LEVEL - gfx942: - VALU FLOPs: - value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 
1000) - pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) - / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) - VALU IOPs: - value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - - Start_Timestamp))) - unit: GIOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / - (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) - * 64) * 2) / 1000)) - MFMA FLOPs (F8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - MFMA FLOPs (BF16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) - MFMA FLOPs (F16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) - MFMA FLOPs (F32): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * 
AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - MFMA FLOPs (F64): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - MFMA IOPs (Int8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GIOP/s - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - Active CUs: - value: $numActiveCUs - unit: CUs - peak: $cu_per_gpu - pop: ((100 * $numActiveCUs) / $cu_per_gpu) - SALU Utilization: - value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - peak: 100 - pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - VALU Utilization: - value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu))) - unit: pct - peak: 100 - pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - MFMA Utilization: - value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu) * 4))) - unit: pct - peak: 100 - pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu) * 4))) - VMEM Utilization: - value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) - / $cu_per_gpu)) - unit: pct - peak: 100 - pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) - / $cu_per_gpu)) - Branch Utilization: - value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / - $cu_per_gpu)) - unit: pct - peak: 100 - pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / 
$GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - VALU Active Threads: - value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - unit: Threads - peak: $wave_size - pop: (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size) - if (SQ_ACTIVE_INST_VALU != 0) else None)) - IPC: - value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - unit: Instr/cycle - peak: 5 - pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5) - Wavefront Occupancy: - value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - unit: Wavefronts - peak: ($max_waves_per_cu * $cu_per_gpu) - pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu - * $cu_per_gpu)))) - coll_level: SQ_LEVEL_WAVES - Theoretical LDS Bandwidth: - value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: (($max_sclk * $cu_per_gpu) * 0.128) - pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * - 0.00128))) - LDS Bank Conflicts/Access: - value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - unit: Conflicts/access - peak: 32 - pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) / - 32) - vL1D Cache Hit Rate: - value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None)) - unit: pct - peak: 100 - pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if 
(TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None)) - vL1D Cache BW: - value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu) - pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) - L2 Cache Hit Rate: - value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - unit: pct - peak: 100 - pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - L2 Cache BW: - value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)) - pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) - L2-Fabric Read BW: - value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - - Start_Timestamp)) - unit: GB/s - peak: $hbmBandwidth - pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - - Start_Timestamp)))) / $hbmBandwidth) - L2-Fabric Write BW: - value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: $hbmBandwidth - pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) - / $hbmBandwidth) - L2-Fabric Read Latency: - value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - unit: Cycles - peak: None - pop: None - L2-Fabric Write Latency: - value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if 
(TCC_EA0_WRREQ_sum - != 0) else None)) - unit: Cycles - peak: None - pop: None - sL1D Cache Hit Rate: - value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - unit: pct - peak: 100 - pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - sL1D Cache BW: - value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) - unit: GB/s - peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) - pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) - * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) - L1I Hit Rate: - value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - unit: pct - peak: 100 - pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - L1I BW: - value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) - unit: GB/s - peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) - pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) - * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) - L1I Fetch Latency: - value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - unit: Cycles - peak: None - pop: None - coll_level: SQ_IFETCH_LEVEL - gfx950: - VALU FLOPs: - value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + 
SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) - / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) - VALU IOPs: - value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - - Start_Timestamp))) - unit: GIOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / - (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) - * 64) * 2) / 1000)) - MFMA FLOPs (F8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) - MFMA FLOPs (BF16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - MFMA FLOPs (F16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - MFMA FLOPs (F32): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - MFMA FLOPs (F64): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) 
/ (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000)) - MFMA FLOPs (F6F4): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 16834) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 16834) / 1000)) - MFMA IOPs (Int8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GIOP/s - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) - Active CUs: - value: $numActiveCUs - unit: CUs - peak: $cu_per_gpu - pop: ((100 * $numActiveCUs) / $cu_per_gpu) - SALU Utilization: - value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - peak: 100 - pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - VALU Utilization: - value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu))) - unit: pct - peak: 100 - pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - MFMA Utilization: - value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu) * 4))) - unit: pct - peak: 100 - pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu) * 4))) - VMEM Utilization: - value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) - / $cu_per_gpu)) - unit: pct - peak: 100 - pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) - / $cu_per_gpu)) - Branch Utilization: - value: AVG((((100 * 
SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / - $cu_per_gpu)) - unit: pct - peak: 100 - pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - VALU Active Threads: - value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - unit: Threads - peak: $wave_size - pop: (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size) - if (SQ_ACTIVE_INST_VALU != 0) else None)) - IPC: - value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - unit: Instr/cycle - peak: 5 - pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5) - Wavefront Occupancy: - value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - unit: Wavefronts - peak: ($max_waves_per_cu * $cu_per_gpu) - pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu - * $cu_per_gpu)))) - coll_level: SQ_LEVEL_WAVES - Theoretical LDS Bandwidth: - value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: (($max_sclk * $cu_per_gpu) * 0.128) - pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * - 0.00128))) - LDS Bank Conflicts/Access: - value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - unit: Conflicts/access - peak: 32 - pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) / - 32) - vL1D Cache Hit Rate: - value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None)) - unit: pct - peak: 100 - pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + 
TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None)) - vL1D Cache BW: - value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu) - pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) - L2 Cache Hit Rate: - value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - unit: pct - peak: 100 - pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - L2 Cache BW: - value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)) - pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) - L2-Fabric Read BW: - value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - - Start_Timestamp)) - unit: GB/s - peak: $hbmBandwidth - pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - - Start_Timestamp)))) / $hbmBandwidth) - L2-Fabric Write BW: - value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: $hbmBandwidth - pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) - / $hbmBandwidth) - L2-Fabric Read Latency: - value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else 
None)) - unit: Cycles - peak: None - pop: None - L2-Fabric Write Latency: - value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - unit: Cycles - peak: None - pop: None - sL1D Cache Hit Rate: - value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - unit: pct - peak: 100 - pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - sL1D Cache BW: - value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) - unit: GB/s - peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) - pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) - * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) - L1I Hit Rate: - value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - unit: pct - peak: 100 - pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - L1I BW: - value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) - unit: GB/s - peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) - pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) - * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) - L1I Fetch Latency: - value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - unit: Cycles - peak: None - pop: None - coll_level: SQ_IFETCH_LEVEL - gfx908: - VALU FLOPs: - value: None - unit: GFLOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: None - VALU IOPs: - value: None - unit: GIOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: None - MFMA FLOPs (BF16): - value: None - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 512) / 1000) - pop: None - MFMA FLOPs (F16): - value: None - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) - pop: None - MFMA FLOPs (F32): - value: None - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 256) / 
1000) - pop: None - MFMA FLOPs (F64): - value: None - unit: GFLOP/s - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: None - MFMA IOPs (Int8): - value: None - unit: GIOP/s - peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) - pop: None - Active CUs: - value: $numActiveCUs - unit: CUs - peak: $cu_per_gpu - pop: ((100 * $numActiveCUs) / $cu_per_gpu) - SALU Utilization: - value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - peak: 100 - pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - VALU Utilization: - value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu))) - unit: pct - peak: 100 - pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - MFMA Utilization: - value: None - unit: pct - peak: 100 - pop: None - VMEM Utilization: - value: None - unit: pct - peak: 100 - pop: None - Branch Utilization: - value: None - unit: pct - peak: 100 - pop: None - VALU Active Threads: - value: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - unit: Threads - peak: $wave_size - pop: (100 * AVG((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU / $wave_size) - if (SQ_ACTIVE_INST_VALU != 0) else None)) - IPC: - value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - unit: Instr/cycle - peak: 5 - pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5) - Wavefront Occupancy: - value: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - unit: Wavefronts - peak: ($max_waves_per_cu * $cu_per_gpu) - pop: (100 * AVG(((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / ($max_waves_per_cu - * $cu_per_gpu)))) - coll_level: SQ_LEVEL_WAVES - Theoretical LDS Bandwidth: - value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: (($max_sclk * $cu_per_gpu) * 0.128) - pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * 
TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * - 0.00128))) - LDS Bank Conflicts/Access: - value: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - unit: Conflicts/access - peak: 32 - pop: ((100 * AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None))) / - 32) - vL1D Cache Hit Rate: - value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None)) - unit: pct - peak: 100 - pop: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None)) - vL1D Cache BW: - value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: ((($max_sclk / 1000) * 64) * $cu_per_gpu) - pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk / 1000) * 64) * $cu_per_gpu)) - L2 Cache Hit Rate: - value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - unit: pct - peak: 100 - pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - L2 Cache BW: - value: AVG(((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan)) - pop: ((100 * AVG(((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan))) - L2-Fabric Read BW: - value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - 
TCC_EA0_RDREQ_32B_sum) - * 64)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: $hbmBandwidth - pop: ((100 * AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - - TCC_EA0_RDREQ_32B_sum) * 64)) / (End_Timestamp - Start_Timestamp)))) - / $hbmBandwidth) - L2-Fabric Write BW: - value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - peak: $hbmBandwidth - pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - - TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) - / $hbmBandwidth) - L2-Fabric Read Latency: - value: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - unit: Cycles - peak: None - pop: None - L2-Fabric Write Latency: - value: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - unit: Cycles - peak: None - pop: None - sL1D Cache Hit Rate: - value: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - unit: pct - peak: 100 - pop: AVG((((100 * SQC_DCACHE_HITS) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES)) - if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) != 0) else None)) - sL1D Cache BW: - value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) - unit: GB/s - peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) - pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) - * 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) - L1I Hit Rate: - value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - unit: pct - peak: 100 - pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES))) - L1I BW: - value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64)) - unit: GB/s - peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu) - pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) - 
* 64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu)) - L1I Fetch Latency: - value: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - unit: Cycles - peak: None - pop: None - coll_level: SQ_IFETCH_LEVEL - metrics_description: - VALU FLOPs: - plain: |- - The total floating-point operations executed per second on the VALU. - This is also presented as a percent of the peak theoretical FLOPs achievable - on the specific accelerator. Note: this does not include any floating-point - operations from MFMA instructions. - rst: |- - The total floating-point operations executed per second on the :ref:`VALU - <desc-valu>`. This is also presented as a percent of the peak theoretical - FLOPs achievable on the specific accelerator. Note: this does not include - any floating-point operations from :ref:`MFMA <desc-mfma>` instructions. - unit: GFLOPs - VALU IOPs: - plain: |- - The total integer operations executed per second on the VALU. This is - also presented as a percent of the peak theoretical IOPs achievable on the - specific accelerator. Note: this does not include any integer operations from - MFMA instructions. - rst: |- - The total integer operations executed per second on the :ref:`VALU <desc-valu>`. - This is also presented as a percent of the peak theoretical IOPs achievable - on the specific accelerator. Note: this does not include any integer operations - from :ref:`MFMA <desc-mfma>` instructions. - unit: GOIPs - MFMA FLOPs (F8): - plain: The total number of 8-bit brain floating point MFMA operations executed - per second. This does not include any 16-bit brain floating point operations - from VALU instructions. This is also presented as a percent of the peak theoretical - F8 MFMA operations achievable on the specific accelerator. It is supported - on AMD Instinct MI300 series and later only. - rst: |- - The total number of 8-bit brain floating point :ref:`MFMA <desc-mfma>` - operations executed per second. 
Note: this does not include any 16-bit brain - floating point operations from :ref:`VALU <desc-valu>` instructions. This - is also presented as a percent of the peak theoretical F8 MFMA operations - achievable on the specific accelerator. It is supported on AMD Instinct MI300 - series and later only. - unit: GFLOPs - MFMA FLOPs (BF16): - plain: |- - The total number of 16-bit brain floating point MFMA operations executed - per second. Note: this does not include any 16-bit brain floating point operations - from VALU instructions. This is also presented as a percent of the peak theoretical - BF16 MFMA operations achievable on the specific accelerator. - rst: |- - The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>` - operations executed per second. Note: this does not include any 16-bit brain - floating point operations from :ref:`VALU <desc-valu>` instructions. This - is also presented as a percent of the peak theoretical BF16 MFMA operations - achievable on the specific accelerator. - unit: GFLOPs - MFMA FLOPs (F16): - plain: |- - The total number of 16-bit floating point MFMA operations executed per - second. Note: this does not include any 16-bit floating point operations from - VALU instructions. This is also presented as a percent of the peak theoretical - F16 MFMA operations achievable on the specific accelerator. - rst: |- - The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations - executed per second. Note: this does not include any 16-bit floating point - operations from :ref:`VALU <desc-valu>` instructions. This is also presented - as a percent of the peak theoretical F16 MFMA operations achievable on the - specific accelerator. - unit: GFLOPs - MFMA FLOPs (F32): - plain: |- - The total number of 32-bit floating point MFMA operations executed per - second. Note: this does not include any 32-bit floating point operations from - VALU instructions. 
This is also presented as a percent of the peak theoretical - F32 MFMA operations achievable on the specific accelerator. - rst: |- - The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations - executed per second. Note: this does not include any 32-bit floating point - operations from :ref:`VALU <desc-valu>` instructions. This is also presented - as a percent of the peak theoretical F32 MFMA operations achievable on the - specific accelerator. - unit: GFLOPs - MFMA FLOPs (F64): - plain: |- - The total number of 64-bit floating point MFMA operations executed per - second. Note: this does not include any 64-bit floating point operations from - VALU instructions. This is also presented as a percent of the peak theoretical - F64 MFMA operations achievable on the specific accelerator. - rst: |- - The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations - executed per second. Note: this does not include any 64-bit floating point - operations from :ref:`VALU <desc-valu>` instructions. This is also presented - as a percent of the peak theoretical F64 MFMA operations achievable on the - specific accelerator. - unit: GFLOPs - MFMA IOPs (Int8): - plain: |- - The total number of 8-bit integer MFMA operations executed per second. - Note: this does not include any 8-bit integer operations from VALU instructions. - This is also presented as a percent of the peak theoretical INT8 MFMA operations - achievable on the specific accelerator. - rst: |- - The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed - per second. Note: this does not include any 8-bit integer operations from - :ref:`VALU <desc-valu>` instructions. This is also presented as a percent - of the peak theoretical INT8 MFMA operations achievable on the specific accelerator. - unit: GIOPs - Active CUs: - plain: Total number of active compute units (CUs) on the accelerator during - the kernel execution. 
- unit: Number - rst: Total number of active compute units (CUs) on the accelerator during the - kernel execution. - SALU Utilization: - plain: Indicates what percent of the kernel's duration the SALU was busy executing - instructions. Computed as the ratio of the total number of cycles spent by - the scheduler issuing SALU or SMEM instructions over the total CU cycles. - rst: Indicates what percent of the kernel's duration the :ref:`SALU <desc-salu>` - was busy executing instructions. Computed as the ratio of the total number - of cycles spent by the :ref:`scheduler <desc-scheduler>` issuing SALU / :ref:`SMEM - <desc-smem>` instructions over the :ref:`total CU cycles <total-cu-cycles>`. - unit: Percent - VALU Utilization: - plain: Indicates what percent of the kernel's duration the VALU was busy executing - instructions. Does not include VMEM operations. Computed as the ratio of the - total number of cycles spent by the scheduler issuing VALU instructions over - the total CU cycles. - rst: Indicates what percent of the kernel's duration the :ref:`VALU <desc-valu>` - was busy executing instructions. Does not include :ref:`VMEM <desc-vmem>` - operations. Computed as the ratio of the total number of cycles spent by the - :ref:`scheduler <desc-scheduler>` issuing VALU instructions over the :ref:`total - CU cycles <total-cu-cycles>`. - unit: Percent - MFMA Utilization: - plain: Indicates what percent of the kernel's duration the MFMA unit was busy - executing instructions. Computed as the ratio of the total number of cycles - the MFMA was busy over the total CU cycles. - rst: Indicates what percent of the kernel's duration the :ref:`MFMA <desc-mfma>` - unit was busy executing instructions. Computed as the ratio of the total number - of cycles spent by the :ref:`MFMA <desc-salu>` was busy over the :ref:`total - CU cycles <total-cu-cycles>`. 
- unit: Percent - VMEM Utilization: - plain: Indicates what percent of the kernel's duration the VMEM unit was busy - executing instructions, including both global/generic and spill/scratch operations - (see the VMEM instruction count metrics) for more detail). Does not include - VALU operations. Computed as the ratio of the total number of cycles spent - by the scheduler issuing VMEM instructions over the total CU cycles. - rst: Indicates what percent of the kernel's duration the :ref:`VMEM <desc-vmem>` - unit was busy executing instructions, including both global/generic and spill/scratch - operations (see the :ref:`VMEM instruction count metrics <ta-instruction-counts>` - for more detail). Does not include :ref:`VALU <desc-valu>` operations. Computed - as the ratio of the total number of cycles spent by the :ref:`scheduler <desc-scheduler>` - issuing VMEM instructions over the :ref:`total CU cycles <total-cu-cycles>`. - unit: Percent - Branch Utilization: - plain: Indicates what percent of the kernel's duration the branch unit was busy - executing instructions. Computed as the ratio of the total number of cycles - spent by the scheduler issuing branch instructions over the total CU cycles - rst: Indicates what percent of the kernel's duration the :ref:`branch <desc-branch>` - unit was busy executing instructions. Computed as the ratio of the total number - of cycles spent by the :ref:`scheduler <desc-scheduler>` issuing branch instructions - over the :ref:`total CU cycles <total-cu-cycles>`. - unit: Percent - VALU Active Threads: - plain: Indicates the average level of divergence within a wavefront over the - lifetime of the kernel. The number of work-items that were active in a wavefront - during execution of each VALU instruction, time-averaged over all VALU instructions - run on all wavefronts in the kernel. - rst: Indicates the average level of :ref:`divergence <desc-divergence>` within - a wavefront over the lifetime of the kernel. 
The number of work-items that - were active in a wavefront during execution of each :ref:`VALU <desc-valu>` - instruction, time-averaged over all VALU instructions run on all wavefronts - in the kernel. - unit: Work-items - IPC: - plain: The ratio of the total number of instructions executed on the CU over - the total active CU cycles. This is also presented as a percent of the peak - theoretical bandwidth achievable on the specific accelerator. - rst: The ratio of the total number of instructions executed on the :doc:`CU - <compute-unit>` over the :ref:`total active CU cycles <total-active-cu-cycles>`. - unit: Instructions per-cycle - Wavefront Occupancy: - plain: |- - The time-averaged number of wavefronts resident on the accelerator over - the lifetime of the kernel. Note: this metric may be inaccurate for short-running - kernels (less than 1ms). This is also presented as a percent of the peak theoretical - occupancy achievable on the specific accelerator. - rst: |- - The time-averaged number of wavefronts resident on the accelerator over - the lifetime of the kernel. Note: this metric may be inaccurate for short-running - kernels (less than 1ms). This is also presented as a percent of the peak theoretical - occupancy achievable on the specific accelerator. - unit: Wavefronts - Theoretical LDS Bandwidth: - plain: Indicates the maximum amount of bytes that could have been loaded from, - stored to, or atomically updated in the LDS per unit time (see LDS Bandwidth - example for more detail). This is also presented as a percent of the peak - theoretical F64 MFMA operations achievable on the specific accelerator. - rst: Indicates the maximum amount of bytes that could have been loaded from, - stored to, or atomically updated in the LDS per unit time (see :ref:`LDS Bandwidth - <lds-bandwidth>` example for more detail). This is also presented as a percent - of the peak theoretical F64 MFMA operations achievable on the specific accelerator. 
- unit: GB/s - LDS Bank Conflicts/Access: - plain: The ratio of the number of cycles spent in the LDS scheduler due to bank - conflicts (as determined by the conflict resolution hardware) to the base - number of cycles that would be spent in the LDS scheduler in a completely - uncontended case. This is also presented in normalized form (i.e., the Bank - Conflict Rate). - rst: The ratio of the number of cycles spent in the :doc:`LDS scheduler <local-data-share>` - due to bank conflicts (as determined by the conflict resolution hardware) - to the base number of cycles that would be spent in the LDS scheduler in - a completely uncontended case. This is also presented in normalized form - (i.e., the Bank Conflict Rate). - unit: Conflicts/Access - vL1D Cache Hit Rate: - plain: The ratio of the number of vL1D cache line requests that hit in vL1D - cache over the total number of cache line requests to the vL1D cache RAM. - rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache - over the total number of cache line requests to the :ref:`vL1D cache RAM - <desc-tc>`. - unit: Percent - vL1D Cache BW: - plain: The number of bytes looked up in the vL1D cache as a result of VMEM instructions - per unit time. The number of bytes is calculated as the number of cache lines - requested multiplied by the cache line size. This value does not consider - partial requests, so e.g., if only a single value is requested in a cache - line, the data movement will still be counted as a full cache line. This is - also presented as a percent of the peak theoretical bandwidth achievable on - the specific accelerator. - rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM - <desc-vmem>` instructions per unit time. The number of bytes is calculated - as the number of cache lines requested multiplied by the cache line size. 
- This value does not consider partial requests, so e.g., if only a single - value is requested in a cache line, the data movement will still be counted - as a full cache line. This is also presented as a percent of the peak theoretical - bandwidth achievable on the specific accelerator. - unit: GB/s - L2 Cache Hit Rate: - plain: The ratio of the number of L2 cache line requests that hit in the L2 - cache over the total number of incoming cache line requests to the L2 cache. - rst: The ratio of the number of L2 cache line requests that hit in the L2 cache - over the total number of incoming cache line requests to the L2 cache. - unit: Percent - L2 Cache BW: - plain: The number of bytes looked up in the L2 cache per unit time. The number - of bytes is calculated as the number of cache lines requested multiplied by - the cache line size. This value does not consider partial requests, so e.g., - if only a single value is requested in a cache line, the data movement will - still be counted as a full cache line. This is also presented as a percent - of the peak theoretical bandwidth achievable on the specific accelerator. - rst: The number of bytes looked up in the L2 cache per unit time. The number of - bytes is calculated as the number of cache lines requested multiplied by - the cache line size. This value does not consider partial requests, so e.g., - if only a single value is requested in a cache line, the data movement will - still be counted as a full cache line. This is also presented as a percent - of the peak theoretical bandwidth achievable on the specific accelerator. - unit: GB/s - L2-Fabric Read BW: - plain: |- - The number of bytes read by the L2 over the Infinity Fabric\u2122 interface - per unit time. This is also presented as a percent of the peak theoretical - bandwidth achievable on the specific accelerator. - rst: |- - The number of bytes read by the L2 over the :ref:`Infinity Fabric\u2122 - interface <l2-fabric>` per unit time. 
This is also presented as a percent - of the peak theoretical bandwidth achievable on the specific accelerator. - unit: GB/s - L2-Fabric Write BW: - plain: The number of bytes sent by the L2 over the Infinity Fabric interface - by write and atomic operations per unit time. This is also presented as a - percent of the peak theoretical bandwidth achievable on the specific accelerator. - rst: The number of bytes sent by the L2 over the :ref:`Infinity Fabric interface - <l2-fabric>` by write and atomic operations per unit time. This is also presented - as a percent of the peak theoretical bandwidth achievable on the specific - accelerator. - unit: GB/s - L2-Fabric Read Latency: - plain: The time-averaged number of cycles read requests spent in Infinity Fabric - before data was returned to the L2. - rst: The time-averaged number of cycles read requests spent in Infinity Fabric before - data was returned to the L2. - unit: Cycles - L2-Fabric Write Latency: - plain: The time-averaged number of cycles write requests spent in Infinity Fabric - before a completion acknowledgement was returned to the L2. - rst: The time-averaged number of cycles write requests spent in Infinity Fabric - before a completion acknowledgement was returned to the L2. - unit: Cycles - sL1D Cache Hit Rate: - plain: The percent of sL1D requests that hit on a previously loaded line the - cache. Calculated as the ratio of the number of sL1D requests that hit over - the number of all sL1D requests. - rst: The percent of sL1D requests that hit on a previously loaded line the cache. - Calculated as the ratio of the number of sL1D requests that hit over the - number of all sL1D requests. - unit: Percent - sL1D Cache BW: - plain: The number of bytes looked up in the sL1D cache per unit time. This is - also presented as a percent of the peak theoretical bandwidth achievable on - the specific accelerator. - rst: The number of bytes looked up in the sL1D cache per unit time. 
This is also - presented as a percent of the peak theoretical bandwidth achievable on the - specific accelerator. - unit: GB/s - L1I Hit Rate: - plain: The number of bytes looked up in the L1I cache per unit time. This is - also presented as a percent of the peak theoretical bandwidth achievable on - the specific accelerator. - rst: The percent of L1I requests that hit on a previously loaded line the cache. - Calculated as the ratio of the number of L1I requests that hit over the number - of all L1I requests. - unit: GB/s - L1I BW: - plain: The percent of L1I requests that hit on a previously loaded line the - cache. Calculated as the ratio of the number of L1I requests that hit over - the number of all L1I requests. - rst: The number of bytes looked up in the L1I cache per unit time. This is also - presented as a percent of the peak theoretical bandwidth achievable on the - specific accelerator. - unit: Percent - L1I Fetch Latency: - plain: The average number of cycles spent to fetch instructions to a CU. - rst: The average number of cycles spent to fetch instructions to a :doc:`CU - <compute-unit>`. 
- unit: Cycles -- id: 300 - title: Memory Chart - data source: - - metric_table: - id: 301 - title: Memory Chart - header: - metric: Metric - value: Value - metric: - gfx90a: - Wavefront Occupancy: - value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs), - 0) - coll_level: SQ_LEVEL_WAVES - Wave Life: - value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0) - else 0)), 0) - SALU: - value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0) - SMEM: - value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0) - VALU: - value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0) - MFMA: - value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0) - VMEM: - value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0) - LDS: - value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) - GWS: - value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0) - BR: - value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0) - Active CUs: - value: $numActiveCUs - Num CUs: - value: $cu_per_gpu - VGPR: - value: ROUND(AVG(Arch_VGPR), 0) - SGPR: - value: ROUND(AVG(SGPR), 0) - LDS Allocation: - value: ROUND(AVG(LDS_Per_Workgroup), 0) - Scratch Allocation: - value: ROUND(AVG(Scratch_Per_Workitem), 0) - Wavefronts: - value: ROUND(AVG(SPI_CSN_WAVE), 0) - Workgroups: - value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0) - LDS Req: - value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) - LDS Util: - value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))), 0) - LDS Latency: - value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS - != 0) else None)),0) - coll_level: SQ_INST_LEVEL_LDS - VL1 Rd: - value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0) - VL1 Wr: - value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0) - VL1 Atomic: - value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)), 0) - VL1 Hit: - value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / 
TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None )), 0) - VL1 Lat: - value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if - (TCP_TA_TCP_STATE_READ_sum != 0) else None)), 0) - VL1 Coalesce: - value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum - * 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0) - VL1 Stall: - value: ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None)), 0) - VL1_L2 Rd: - value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0) - VL1_L2 Wr: - value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0) - VL1_L2 Atomic: - value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)), 0) - sL1D Rd: - value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0) - sL1D Hit: - value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ - != 0) else None)) * 100), 0) - sL1D Lat: - value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ - != 0) else None)) * 100), 0) - coll_level: SQC_DCACHE_INFLIGHT_LEVEL - sL1D_L2 Rd: - value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0) - sL1D_L2 Wr: - value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0) - sL1D_L2 Atomic: - value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0) - IL1 Fetch: - value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0) - IL1 Hit: - value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0) - IL1 Lat: - value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ - != 0) else None)) * 100), 0) - coll_level: SQC_ICACHE_INFLIGHT_LEVEL - IL1_L2 Rd: - value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0) - L2 Rd: - value: ROUND(AVG((TCC_READ_sum / $denom)), 0) - L2 Wr: - value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0) - L2 Atomic: - value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0) - L2 Hit: - value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) - if ((TCC_HIT_sum + 
TCC_MISS_sum) != 0) else 0)), 0) - L2 Rd Lat: - value: ROUND(AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) - != 0) else None)), 0) - L2 Wr Lat: - value: ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum - + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum + - TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else None)), 0) - Fabric_L2 Rd: - value: ROUND(AVG((TCC_EA_RDREQ_sum / $denom)), 0) - Fabric_L2 Wr: - value: ROUND(AVG((TCC_EA_WRREQ_sum / $denom)), 0) - Fabric_L2 Atomic: - value: ROUND(AVG((TCC_EA_ATOMIC_sum / $denom)), 0) - Fabric Rd Lat: - value: ROUND(AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum - != 0) else 0)), 0) - Fabric Wr Lat: - value: ROUND(AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum - != 0) else 0)), 0) - Fabric Atomic Lat: - value: ROUND(AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum - != 0) else 0)), 0) - HBM Rd: - value: ROUND(AVG((TCC_EA_RDREQ_DRAM_sum / $denom)), 0) - HBM Wr: - value: ROUND(AVG((TCC_EA_WRREQ_DRAM_sum / $denom)), 0) - gfx941: - Wavefront Occupancy: - value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs), - 0) - coll_level: SQ_LEVEL_WAVES - Wave Life: - value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0) - else 0)), 0) - SALU: - value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0) - SMEM: - value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0) - VALU: - value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0) - MFMA: - value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0) - VMEM: - value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0) - LDS: - value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) - GWS: - value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0) - BR: - value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0) - Active CUs: - value: $numActiveCUs - Num CUs: - value: $cu_per_gpu - VGPR: - value: ROUND(AVG(Arch_VGPR), 0) - 
SGPR: - value: ROUND(AVG(SGPR), 0) - LDS Allocation: - value: ROUND(AVG(LDS_Per_Workgroup), 0) - Scratch Allocation: - value: ROUND(AVG(Scratch_Per_Workitem), 0) - Wavefronts: - value: ROUND(AVG(SPI_CSN_WAVE), 0) - Workgroups: - value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0) - LDS Req: - value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) - LDS Util: - value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))), 0) - LDS Latency: - value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS - != 0) else None)),0) - coll_level: SQ_INST_LEVEL_LDS - VL1 Rd: - value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0) - VL1 Wr: - value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0) - VL1 Atomic: - value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)), 0) - VL1 Hit: - value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None )), 0) - VL1 Lat: - value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if - (TCP_TA_TCP_STATE_READ_sum != 0) else None)), 0) - VL1 Coalesce: - value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum - * 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0) - VL1 Stall: - value: ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None)), 0) - VL1_L2 Rd: - value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0) - VL1_L2 Wr: - value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0) - VL1_L2 Atomic: - value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)), 0) - sL1D Rd: - value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0) - sL1D Hit: - value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ - != 0) else None)) * 100), 0) - sL1D Lat: - value: 
ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ - != 0) else None)) * 100), 0) - coll_level: SQC_DCACHE_INFLIGHT_LEVEL - sL1D_L2 Rd: - value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0) - sL1D_L2 Wr: - value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0) - sL1D_L2 Atomic: - value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0) - IL1 Fetch: - value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0) - IL1 Hit: - value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0) - IL1 Lat: - value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ - != 0) else None)) * 100), 0) - coll_level: SQC_ICACHE_INFLIGHT_LEVEL - IL1_L2 Rd: - value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0) - L2 Rd: - value: ROUND(AVG((TCC_READ_sum / $denom)), 0) - L2 Wr: - value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0) - L2 Atomic: - value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0) - L2 Hit: - value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) - if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)), 0) - Fabric_L2 Rd: - value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0) - Fabric_L2 Wr: - value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0) - Fabric_L2 Atomic: - value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0) - Fabric Rd Lat: - value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else 0)), 0) - Fabric Wr Lat: - value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else 0)), 0) - Fabric Atomic Lat: - value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) else 0)), 0) - HBM Rd: - value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0) - HBM Wr: - value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0) - gfx940: - Wavefront Occupancy: - value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs), - 0) - coll_level: SQ_LEVEL_WAVES - Wave Life: - value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if 
(SQ_WAVES != 0) - else 0)), 0) - SALU: - value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0) - SMEM: - value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0) - VALU: - value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0) - MFMA: - value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0) - VMEM: - value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0) - LDS: - value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) - GWS: - value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0) - BR: - value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0) - Active CUs: - value: $numActiveCUs - Num CUs: - value: $cu_per_gpu - VGPR: - value: ROUND(AVG(Arch_VGPR), 0) - SGPR: - value: ROUND(AVG(SGPR), 0) - LDS Allocation: - value: ROUND(AVG(LDS_Per_Workgroup), 0) - Scratch Allocation: - value: ROUND(AVG(Scratch_Per_Workitem), 0) - Wavefronts: - value: ROUND(AVG(SPI_CSN_WAVE), 0) - Workgroups: - value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0) - LDS Req: - value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) - LDS Util: - value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))), 0) - LDS Latency: - value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS - != 0) else None)),0) - coll_level: SQ_INST_LEVEL_LDS - VL1 Rd: - value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0) - VL1 Wr: - value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0) - VL1 Atomic: - value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)), 0) - VL1 Hit: - value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None )), 0) - VL1 Lat: - value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if - (TCP_TA_TCP_STATE_READ_sum != 0) else None)), 0) - VL1 Coalesce: - value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum - * 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0) - VL1 
Stall: - value: ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None)), 0) - VL1_L2 Rd: - value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0) - VL1_L2 Wr: - value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0) - VL1_L2 Atomic: - value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)), 0) - sL1D Rd: - value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0) - sL1D Hit: - value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ - != 0) else None)) * 100), 0) - sL1D Lat: - value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ - != 0) else None)) * 100), 0) - coll_level: SQC_DCACHE_INFLIGHT_LEVEL - sL1D_L2 Rd: - value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0) - sL1D_L2 Wr: - value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0) - sL1D_L2 Atomic: - value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0) - IL1 Fetch: - value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0) - IL1 Hit: - value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0) - IL1 Lat: - value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ - != 0) else None)) * 100), 0) - coll_level: SQC_ICACHE_INFLIGHT_LEVEL - IL1_L2 Rd: - value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0) - L2 Rd: - value: ROUND(AVG((TCC_READ_sum / $denom)), 0) - L2 Wr: - value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0) - L2 Atomic: - value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0) - L2 Hit: - value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) - if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)), 0) - Fabric_L2 Rd: - value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0) - Fabric_L2 Wr: - value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0) - Fabric_L2 Atomic: - value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0) - Fabric Rd Lat: - value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else 0)), 0) - Fabric Wr Lat: - value: 
ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else 0)), 0) - Fabric Atomic Lat: - value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) else 0)), 0) - HBM Rd: - value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0) - HBM Wr: - value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0) - gfx942: - Wavefront Occupancy: - value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs), - 0) - coll_level: SQ_LEVEL_WAVES - Wave Life: - value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0) - else 0)), 0) - SALU: - value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0) - SMEM: - value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0) - VALU: - value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0) - MFMA: - value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0) - VMEM: - value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0) - LDS: - value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) - GWS: - value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0) - BR: - value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0) - Active CUs: - value: $numActiveCUs - Num CUs: - value: $cu_per_gpu - VGPR: - value: ROUND(AVG(Arch_VGPR), 0) - SGPR: - value: ROUND(AVG(SGPR), 0) - LDS Allocation: - value: ROUND(AVG(LDS_Per_Workgroup), 0) - Scratch Allocation: - value: ROUND(AVG(Scratch_Per_Workitem), 0) - Wavefronts: - value: ROUND(AVG(SPI_CSN_WAVE), 0) - Workgroups: - value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0) - LDS Req: - value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) - LDS Util: - value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))), 0) - LDS Latency: - value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS - != 0) else None)),0) - coll_level: SQ_INST_LEVEL_LDS - VL1 Rd: - value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0) - VL1 Wr: - value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0) - VL1 Atomic: - value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + 
TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)), 0) - VL1 Hit: - value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None )), 0) - VL1 Lat: - value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if - (TCP_TA_TCP_STATE_READ_sum != 0) else None)), 0) - VL1 Coalesce: - value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum - * 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0) - VL1 Stall: - value: ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None)), 0) - VL1_L2 Rd: - value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0) - VL1_L2 Wr: - value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0) - VL1_L2 Atomic: - value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)), 0) - sL1D Rd: - value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0) - sL1D Hit: - value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ - != 0) else None)) * 100), 0) - sL1D Lat: - value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ - != 0) else None)) * 100), 0) - coll_level: SQC_DCACHE_INFLIGHT_LEVEL - sL1D_L2 Rd: - value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0) - sL1D_L2 Wr: - value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0) - sL1D_L2 Atomic: - value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0) - IL1 Fetch: - value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0) - IL1 Hit: - value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0) - IL1 Lat: - value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ - != 0) else None)) * 100), 0) - coll_level: SQC_ICACHE_INFLIGHT_LEVEL - IL1_L2 Rd: - value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0) - L2 Rd: - value: ROUND(AVG((TCC_READ_sum / $denom)), 0) - L2 Wr: 
- value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0) - L2 Atomic: - value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0) - L2 Hit: - value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) - if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)), 0) - Fabric_L2 Rd: - value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0) - Fabric_L2 Wr: - value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0) - Fabric_L2 Atomic: - value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0) - Fabric Rd Lat: - value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else 0)), 0) - Fabric Wr Lat: - value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else 0)), 0) - Fabric Atomic Lat: - value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) else 0)), 0) - HBM Rd: - value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0) - HBM Wr: - value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0) - gfx950: - Wavefront Occupancy: - value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs), - 0) - coll_level: SQ_LEVEL_WAVES - Wave Life: - value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0) - else 0)), 0) - SALU: - value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0) - SMEM: - value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0) - VALU: - value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0) - MFMA: - value: ROUND(AVG((SQ_INSTS_MFMA / $denom)), 0) - VMEM: - value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0) - LDS: - value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) - GWS: - value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0) - BR: - value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0) - Active CUs: - value: $numActiveCUs - Num CUs: - value: $cu_per_gpu - VGPR: - value: ROUND(AVG(Arch_VGPR), 0) - SGPR: - value: ROUND(AVG(SGPR), 0) - LDS Allocation: - value: ROUND(AVG(LDS_Per_Workgroup), 0) - Scratch Allocation: - value: ROUND(AVG(Scratch_Per_Workitem), 0) - Wavefronts: - value: 
ROUND(AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE), - 0) - Workgroups: - value: ROUND(AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + - SPI_CS2_NUM_THREADGROUPS + SPI_CS3_NUM_THREADGROUPS), 0) - LDS Req: - value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) - LDS Util: - value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))), 0) - LDS Latency: - value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS - != 0) else None)),0) - coll_level: SQ_INST_LEVEL_LDS - VL1 Rd: - value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0) - VL1 Wr: - value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0) - VL1 Atomic: - value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)), 0) - VL1 Hit: - value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None )), 0) - VL1 Lat: - value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if - (TCP_TA_TCP_STATE_READ_sum != 0) else None)), 0) - VL1 Coalesce: - value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum - * 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0) - VL1 Stall: - value: ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None)), 0) - VL1_L2 Rd: - value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0) - VL1_L2 Wr: - value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0) - VL1_L2 Atomic: - value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)), 0) - sL1D Rd: - value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0) - sL1D Hit: - value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ - != 0) else None)) * 100), 0) - sL1D Lat: - value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if 
(SQC_DCACHE_REQ - != 0) else None)) * 100), 0) - coll_level: SQC_DCACHE_INFLIGHT_LEVEL - sL1D_L2 Rd: - value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0) - sL1D_L2 Wr: - value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0) - sL1D_L2 Atomic: - value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0) - IL1 Fetch: - value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0) - IL1 Hit: - value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0) - IL1 Lat: - value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ - != 0) else None)) * 100), 0) - coll_level: SQC_ICACHE_INFLIGHT_LEVEL - IL1_L2 Rd: - value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0) - L2 Rd: - value: ROUND(AVG((TCC_READ_sum / $denom)), 0) - L2 Wr: - value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0) - L2 Atomic: - value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0) - L2 Hit: - value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) - if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)), 0) - L2 Rd Lat: - value: ROUND(AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) - != 0) else None)), 0) - L2 Wr Lat: - value: ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum - + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum + - TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else None)), 0) - Fabric_L2 Rd: - value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0) - Fabric_L2 Wr: - value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0) - Fabric_L2 Atomic: - value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0) - Fabric Rd Lat: - value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else 0)), 0) - Fabric Wr Lat: - value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else 0)), 0) - Fabric Atomic Lat: - value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) 
else 0)), 0) - HBM Rd: - value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0) - HBM Wr: - value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0) - gfx908: - Wavefront Occupancy: - value: ROUND(AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD) / $numActiveCUs), - 0) - coll_level: SQ_LEVEL_WAVES - Wave Life: - value: ROUND(AVG(((4 * (SQ_WAVE_CYCLES / SQ_WAVES)) if (SQ_WAVES != 0) - else 0)), 0) - SALU: - value: ROUND(AVG((SQ_INSTS_SALU / $denom)), 0) - SMEM: - value: ROUND(AVG((SQ_INSTS_SMEM / $denom)), 0) - VALU: - value: ROUND(AVG((SQ_INSTS_VALU / $denom)), 0) - MFMA: - value: None - VMEM: - value: ROUND(AVG((SQ_INSTS_VMEM / $denom)), 0) - LDS: - value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) - GWS: - value: ROUND(AVG((SQ_INSTS_GDS / $denom)), 0) - BR: - value: ROUND(AVG((SQ_INSTS_BRANCH / $denom)), 0) - Active CUs: - value: $numActiveCUs - Num CUs: - value: $cu_per_gpu - VGPR: - value: ROUND(AVG(Arch_VGPR), 0) - SGPR: - value: ROUND(AVG(SGPR), 0) - LDS Allocation: - value: ROUND(AVG(LDS_Per_Workgroup), 0) - Scratch Allocation: - value: ROUND(AVG(Scratch_Per_Workitem), 0) - Wavefronts: - value: ROUND(AVG(SPI_CSN_WAVE), 0) - Workgroups: - value: ROUND(AVG(SPI_CSN_NUM_THREADGROUPS), 0) - LDS Req: - value: ROUND(AVG((SQ_INSTS_LDS / $denom)), 0) - LDS Util: - value: ROUND(AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))), 0) - LDS Latency: - value: ROUND(AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS - != 0) else None)),0) - coll_level: SQ_INST_LEVEL_LDS - VL1 Rd: - value: ROUND(AVG((TCP_TOTAL_READ_sum / $denom)), 0) - VL1 Wr: - value: ROUND(AVG((TCP_TOTAL_WRITE_sum / $denom)), 0) - VL1 Atomic: - value: ROUND(AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)), 0) - VL1 Hit: - value: ROUND(AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if 
(TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None )), 0) - VL1 Lat: - value: ROUND(AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if - (TCP_TA_TCP_STATE_READ_sum != 0) else None)), 0) - VL1 Coalesce: - value: ROUND(AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum - * 4)) if (TCP_TOTAL_ACCESSES_sum != None) else 0)), 0) - VL1 Stall: - value: ROUND(AVG((((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None)), 0) - VL1_L2 Rd: - value: ROUND(AVG((TCP_TCC_READ_REQ_sum / $denom)), 0) - VL1_L2 Wr: - value: ROUND(AVG((TCP_TCC_WRITE_REQ_sum / $denom)), 0) - VL1_L2 Atomic: - value: ROUND(AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)), 0) - sL1D Rd: - value: ROUND(AVG((SQC_DCACHE_REQ / $denom)), 0) - sL1D Hit: - value: ROUND((AVG(((SQC_DCACHE_HITS / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ - != 0) else None)) * 100), 0) - sL1D Lat: - value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_DCACHE_REQ) if (SQC_DCACHE_REQ - != 0) else None)) * 100), 0) - coll_level: SQC_DCACHE_INFLIGHT_LEVEL - sL1D_L2 Rd: - value: ROUND(AVG((SQC_TC_DATA_READ_REQ / $denom)), 0) - sL1D_L2 Wr: - value: ROUND(AVG((SQC_TC_DATA_WRITE_REQ / $denom)), 0) - sL1D_L2 Atomic: - value: ROUND(AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)), 0) - IL1 Fetch: - value: ROUND(AVG((SQC_ICACHE_REQ / $denom)), 0) - IL1 Hit: - value: ROUND((AVG((SQC_ICACHE_HITS / SQC_ICACHE_REQ)) * 100), 0) - IL1 Lat: - value: ROUND((AVG(((SQ_ACCUM_PREV_HIRES / SQC_ICACHE_REQ) if (SQC_ICACHE_REQ - != 0) else None)) * 100), 0) - coll_level: SQC_ICACHE_INFLIGHT_LEVEL - IL1_L2 Rd: - value: ROUND(AVG((SQC_TC_INST_REQ / $denom)), 0) - L2 Rd: - value: ROUND(AVG((TCC_READ_sum / $denom)), 0) - L2 Wr: - value: ROUND(AVG((TCC_WRITE_sum / $denom)), 0) - L2 Atomic: - value: ROUND(AVG((TCC_ATOMIC_sum / $denom)), 0) - L2 Hit: - value: ROUND(AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) - if ((TCC_HIT_sum + TCC_MISS_sum) != 0) else 0)), 0) - L2 Rd 
Lat: - value: ROUND(AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) - != 0) else None)), 0) - L2 Wr Lat: - value: ROUND(AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum - + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) if ((TCP_TCC_WRITE_REQ_sum + - TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != 0) else None)), 0) - Fabric_L2 Rd: - value: ROUND(AVG((TCC_EA0_RDREQ_sum / $denom)), 0) - Fabric_L2 Wr: - value: ROUND(AVG((TCC_EA0_WRREQ_sum / $denom)), 0) - Fabric_L2 Atomic: - value: ROUND(AVG((TCC_EA0_ATOMIC_sum / $denom)), 0) - Fabric Rd Lat: - value: ROUND(AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else 0)), 0) - Fabric Wr Lat: - value: ROUND(AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else 0)), 0) - Fabric Atomic Lat: - value: ROUND(AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) else 0)), 0) - HBM Rd: - value: ROUND(AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)), 0) - HBM Wr: - value: ROUND(AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)), 0) - comparable: false - cli_style: mem_chart - tui_style: mem_chart - metrics_description: - Wavefront Occupancy: - plain: Wavefronts per active CU. - rst: Wavefronts per active CU. - unit: Wavefronts - Wave Life: - plain: Average number of cycles executing a wave. - rst: Average number of cycles executing a wave. - unit: Cycles per wave - SALU: - plain: Total Number of SALU (Scalar ALU) instructions issued per normalization - unit. - rst: Total Number of SALU (Scalar ALU) instructions issued per normalization - unit. - unit: Instructions per normalization unit - SMEM: - plain: Total number of SMEM (Scalar Memory Read) instructions issued normalization - unit. - rst: Total number of SMEM (Scalar Memory Read) instructions issued normalization - unit. 
- unit: Instructions per normalization unit - VALU: - plain: The number of VALU (Vector ALU) instructions issued per normalization - unit. - rst: The number of VALU (Vector ALU) instructions issued per normalization unit. - unit: Instructions per normalization unit - MFMA: - plain: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued - per normalization unit. - rst: Total number of MFMA (Matrix-Fused-Multiply-Add) instructions issued per - normalization unit. - unit: Instructions per normalization unit - VMEM: - plain: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch - memory) per normalization unit. - rst: The number of VMEM (GPU Memory) read instructions issued (including FLAT/scratch - memory) per normalization unit. - unit: Instructions per normalization unit - LDS: - plain: The total number of LDS instructions (including, but not limited to, - read/write/atomics and HIP's __shfl instructions) executed per normalization - unit. - rst: The total number of LDS instructions (including, but not limited to, read/write/atomics - and HIP's __shfl instructions) executed per normalization unit. - unit: Instructions per normalization unit - GWS: - plain: Total number of GDS (global data sync) instructions issued per normalization - unit. - rst: Total number of GDS (global data sync) instructions issued per normalization - unit. - unit: Instructions per normalization unit - BR: - plain: Total number of BRANCH instructions issued per normalization unit. - rst: Total number of BRANCH instructions issued per normalization unit. - unit: Instructions per normalization unit - Active CUs: - plain: Total number of active compute units (CUs) on the accelerator during - the kernel execution. - rst: Total number of active compute units (CUs) on the accelerator during the - kernel execution. - unit: CUs - Num CUs: - plain: Total number of compute units (CUs) on the accelerator. 
- rst: Total number of compute units (CUs) on the accelerator. - unit: CUs - VGPR: - plain: |- - The number of architected vector general-purpose registers allocated - for the kernel, see VALU. Note: this may not exactly match the number of VGPRs - requested by the compiler due to allocation granularity. - rst: |- - The number of architected vector general-purpose registers allocated for the - kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the - number of VGPRs requested by the compiler due to allocation granularity. - unit: VGPRs - SGPR: - plain: |- - The number of scalar general-purpose registers allocated for the kernel, - see SALU. Note: this may not exactly match the number of SGPRs requested by - the compiler due to allocation granularity. - rst: |- - The number of scalar general-purpose registers allocated for the kernel, see - :ref:`SALU <desc-salu>`. Note: this may not exactly match the number of - SGPRs requested by the compiler due to allocation granularity. - unit: SGPRs - LDS Allocation: - plain: |- - The number of bytes of LDS memory (or, shared memory) allocated for - this kernel. Note: This may also be larger than what was requested at compile - time due to both allocation granularity and dynamic per-dispatch LDS allocations. - rst: |- - The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory) - allocated for this kernel. Note: This may also be larger than what was requested - at compile time due to both allocation granularity and dynamic per-dispatch - LDS allocations. - unit: Bytes per workgroup - Scratch Allocation: - plain: The number of bytes of scratch memory requested per work-item for this - kernel. Scratch memory is used for stack memory on the accelerator, as well - as for register spills and restores. - rst: The number of bytes of :ref:`scratch memory <memory-spaces>` requested per - work-item for this kernel. 
Scratch memory is used for stack memory on the - accelerator, as well as for register spills and restores. - unit: Bytes per workgroup - Wavefronts: - plain: The total number of wavefronts, summed over all workgroups, forming this - kernel launch. - rst: The total number of wavefronts, summed over all workgroups, forming this - kernel launch. - unit: Wavefronts - Workgroups: - plain: The total number of workgroups forming this kernel launch. - rst: The total number of workgroups forming this kernel launch. - unit: Workgroups - LDS Req: - plain: The total number of LDS instructions (including, but not limited to, - read/write/atomics and HIP's __shfl instructions) executed per normalization - unit. - rst: The total number of LDS instructions (including, but not limited to, - read/write/atomics and HIP's ``__shfl`` instructions) executed - per :ref:`normalization unit <normalization-units>`. - unit: Instructions per normalization unit - LDS Util: - plain: Indicates what percent of the kernel's duration the LDS was actively - executing instructions (including, but not limited to, load, store, atomic - and HIP's __shfl operations). Calculated as the ratio of the total number - of cycles LDS was active over the total CU cycles. - rst: Indicates what percent of the kernel's duration the :ref:`LDS <desc-lds>` was - actively executing instructions (including, but not limited to, load, store, - atomic and HIP's ``__shfl`` operations). Calculated as the ratio of the - total number of cycles LDS was active over the :ref:`total CU cycles <total-cu-cycles>`. - unit: Percent - LDS Latency: - plain: The average number of round-trip cycles (i.e., from issue to data-return - / acknowledgment) required for an LDS instruction to complete. - rst: The average number of round-trip cycles (i.e., from issue to data-return / - acknowledgment) required for an LDS instruction to complete. 
- unit: Cycles - VL1 Rd: - plain: The total number of incoming read requests from the address processing - unit after coalescing per normalization unit - rst: The total number of incoming read requests from the :ref:`address processing - unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>` - unit: Requests per normalization unit - VL1 Wr: - plain: The total number of incoming write requests from the address processing - unit after coalescing per normalization unit - rst: The total number of incoming write requests from the :ref:`address processing - unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>` - unit: Requests per normalization unit - VL1 Atomic: - plain: The total number of incoming atomic requests from the address processing - unit after coalescing per normalization unit - rst: The total number of incoming atomic requests from the :ref:`address processing - unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>` - unit: Requests per normalization unit - VL1 Hit: - plain: The ratio of the number of vL1D cache line requests that hit in vL1D - cache over the total number of cache line requests to the vL1D Cache RAM. - rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache - over the total number of cache line requests to the :ref:`vL1D Cache RAM - <desc-tc>`. - unit: Percent - VL1 Lat: - plain: Calculated as the average number of cycles that a vL1D cache line request - spent in the vL1D cache pipeline. - rst: Calculated as the average number of cycles that a vL1D cache line request - spent in the vL1D cache pipeline. - unit: Cycles - VL1 Coalesce: - plain: Indicates how well memory instructions were coalesced by the address - processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). - Calculated as the average number of thread-requests generated per instruction - divided by the ideal number of thread-requests per instruction. 
- rst: Indicates how well memory instructions were coalesced by the :ref:`address - processing unit <desc-ta>`, ranging from uncoalesced (25%) to fully coalesced - (100%). Calculated as the average number of :ref:`thread-requests <thread-requests>` - generated per instruction divided by the ideal number of thread-requests per - instruction. - unit: Percent - VL1 Stall: - plain: The ratio of the number of cycles where the vL1D is stalled waiting to - issue a request for data to the L2 cache divided by the number of cycles where - the vL1D is active. - rst: The ratio of the number of cycles where the vL1D is stalled waiting to issue - a request for data to the :doc:`L2 cache <l2-cache>` divided by the number - of cycles where the vL1D is active [#vl1d-activity]_. - unit: Percent - VL1_L2 Rd: - plain: The number of read requests for a vL1D cache line that were not satisfied - by the vL1D and must be retrieved from the to the L2 Cache per normalization - unit. - rst: The number of read requests for a vL1D cache line that were not satisfied by - the vL1D and must be retrieved from the to the :doc:`L2 Cache <l2-cache>` - per :ref:`normalization unit <normalization-units>`. - unit: Requests per normalization unit - VL1_L2 Wr: - plain: The number of write requests to a vL1D cache line that were sent through - the vL1D to the L2 cache, per normalization unit. - rst: The number of write requests to a vL1D cache line that were sent through the - vL1D to the :doc:`L2 cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`. - unit: Requests per normalization unit - VL1_L2 Atomic: - plain: The number of atomic requests that are sent through the vL1D to the L2 - cache, per normalization unit. This includes requests for atomics with, and - without return. - rst: The number of atomic requests that are sent through the vL1D to the :doc:`L2 - cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`. 
This - includes requests for atomics with, and without return. - unit: Requests per normalization unit - sL1D Rd: - plain: The total number of requests, of any size or type, made to the sL1D per - normalization unit. - rst: The total number of requests, of any size or type, made to the sL1D per :ref:`normalization - unit <normalization-units>`. - unit: Requests per normalization unit - sL1D Hit: - plain: The total number of sL1D requests that hit on a previously loaded cache - line, per normalization unit. - rst: The total number of sL1D requests that hit on a previously loaded cache line, - per :ref:`normalization unit <normalization-units>`. - unit: Requests per normalization unit - sL1D_L2 Rd: - plain: The total number of read requests from sL1D to the L2, per normalization - unit. - rst: The total number of read requests from sL1D to the :doc:`L2 <l2-cache>`, per - :ref:`normalization unit <normalization-units>`. - unit: Requests per normalization unit - sL1D_L2 Wr: - plain: The total number of write requests from sL1D to the L2, per normalization - unit. Typically unused on current CDNA accelerators. - rst: The total number of write requests from sL1D to the :doc:`L2 <l2-cache>`, per - :ref:`normalization unit <normalization-units>`. Typically unused on current - CDNA accelerators. - unit: Requests per normalization unit - sL1D_L2 Atomic: - plain: The total number of atomic requests from sL1D to the L2, per normalization - unit. Typically unused on current CDNA accelerators. - rst: The total number of atomic requests from sL1D to the :doc:`L2 <l2-cache>`, - per :ref:`normalization unit <normalization-units>`. Typically unused on current - CDNA accelerators. - unit: Requests per normalization unit - IL1 Fetch: - plain: The total number of requests made to the L1I per normalization-unit. - rst: The total number of requests made to the L1I per :ref:`normalization-unit - <normalization-units>`. 
- unit: Requests per normalization unit - IL1 Hit: - plain: The percent of L1I requests that hit on a previously loaded line the - cache. Calculated as the ratio of the number of L1I requests that hit over - the number of all L1I requests. - rst: The total number of L1I requests that hit on a previously loaded cache line, - per :ref:`normalization-unit <normalization-units>`. - unit: Percent - IL1 Lat: - plain: The average number of cycles spent to fetch instructions to a CU. - rst: The average number of cycles spent to fetch instructions to a :doc:`CU - <compute-unit>`. - unit: Cycles - IL1_L2 Rd: - plain: The total number of requests across the L1I - L2 interface per normalization-unit. - rst: The total number of requests across the L1I - L2 interface per normalization-unit. - unit: Requests per normalization unit - L2 Rd: - plain: The total number of read requests to the L2 from all clients. - rst: The total number of read requests to the L2 from all clients. - unit: Requests per normalization unit - L2 Wr: - plain: The total number of write requests to the L2 from all clients. - rst: The total number of write requests to the L2 from all clients. - unit: Requests per normalization unit - L2 Atomic: - plain: The total number of atomic requests (with and without return) to the - L2 from all clients. - rst: The total number of atomic requests (with and without return) to the L2 from - all clients. - unit: Requests per normalization unit - L2 Hit: - plain: The ratio of the number of L2 cache line requests that hit in the L2 - cache over the total number of incoming cache line requests to the L2 cache. - rst: The ratio of the number of L2 cache line requests that hit in the L2 cache - over the total number of incoming cache line requests to the L2 cache. - unit: Percent - L2 Rd Lat: - plain: Calculated as the average number of cycles that the vL1D cache took to - issue and receive read requests from the L2 Cache. 
This number also includes - requests for atomics with return values. - rst: Calculated as the average number of cycles that the vL1D cache took to issue - and receive read requests from the :doc:`L2 Cache <l2-cache>`. This number - also includes requests for atomics with return values. - unit: Cycles - L2 Wr Lat: - plain: Calculated as the average number of cycles that the vL1D cache took to - issue and receive acknowledgement of a write request to the L2 Cache. This - number also includes requests for atomics without return values. - rst: Calculated as the average number of cycles that the vL1D cache took to issue - and receive acknowledgement of a write request to the :doc:`L2 Cache <l2-cache>`. - This number also includes requests for atomics without return values. - unit: Cycles - Fabric_L2 Rd: - plain: Number of L2 cache - Infinity Fabric read requests (either 32-byte or - 64-byte) summed over TCC instances per normalization unit. - rst: Number of L2 cache - Infinity Fabric read requests (either 32-byte or 64-byte) - summed over TCC instances per normalization unit. - unit: Requests per normalization unit - Fabric_L2 Wr: - plain: Number of L2 cache - Infinity Fabric write requests (either 32-byte or - 64-byte) summed over TCC instances per normalization unit. - rst: Number of L2 cache - Infinity Fabric write requests (either 32-byte or - 64-byte) summed over TCC instances per normalization unit. - unit: Requests per normalization unit - Fabric_L2 Atomic: - plain: Number of L2 cache - Infinity Fabric write requests (either 32-byte or - 64-byte) that are actually atomic requests summed over TCC instances per normalization - unit. - rst: Number of L2 cache - Infinity Fabric write requests (either 32-byte or - 64-byte) that are actually atomic requests summed over TCC instances per normalization - unit. 
- unit: Requests per normalization unit - Fabric Rd Lat: - plain: The time-averaged number of cycles read requests spent in Infinity Fabric - before data was returned to the L2. - rst: The time-averaged number of cycles read requests spent in Infinity Fabric - before data was returned to the L2. - unit: Cycles - Fabric Wr Lat: - plain: The time-averaged number of cycles write requests spent in Infinity Fabric - before a completion acknowledgement was returned to the L2. - rst: The time-averaged number of cycles write requests spent in Infinity Fabric - before a completion acknowledgement was returned to the L2. - unit: Cycles - Fabric Atomic Lat: - plain: The time-averaged number of cycles atomic requests spent in Infinity - Fabric before a completion acknowledgement (atomic without return value) or - data (atomic with return value) was returned to the L2. - rst: The time-averaged number of cycles atomic requests spent in Infinity Fabric - before a completion acknowledgement (atomic without return value) or data - (atomic with return value) was returned to the L2. - unit: Cycles - HBM Rd: - plain: The total number of L2 requests to Infinity Fabric to read 32B or 64B - of data from the accelerator's local HBM, per normalization unit. - rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of data - from the accelerator's local HBM, per :ref:`normalization unit <normalization-units>`. - See :ref:`l2-request-flow` for more detail. - unit: Requests per normalization unit - HBM Wr: - plain: |- - The total number of L2 requests to Infinity Fabric to write or atomically - update 32B or 64B of data in the accelerator's local HBM, per normalization - unit. - rst: The total number of L2 requests to Infinity Fabric to write 32B or 64B - of data from the accelerator's local HBM, per :ref:`normalization unit <normalization-units>`. - See :ref:`l2-request-flow` for more detail. 
- unit: Requests per normalization unit -- id: 400 - title: Roofline - data source: - - metric_table: - id: 401 - title: Roofline Performance Rates - cli_style: Roofline - tui_style: Roofline - header: - metric: Metric - value: Value - unit: Unit - peak: Peak (Empirical) - metric: - gfx90a: - VALU FLOPs (F16): - value: AVG((($wave_size * ( - SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16 - )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $FP16Flops_empirical_peak - VALU FLOPs (F32): - value: AVG((($wave_size * ( - SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32 - )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $FP32Flops_empirical_peak - VALU FLOPs (F64): - value: AVG((($wave_size * ( - SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64 - )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $FP64Flops_empirical_peak - MFMA FLOPs (F64): - value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $MFMAF64Flops_empirical_peak - MFMA FLOPs (F32): - value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $MFMAF32Flops_empirical_peak - MFMA FLOPs (F16): - value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $MFMAF16Flops_empirical_peak - MFMA FLOPs (BF16): - value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $MFMABF16Flops_empirical_peak - MFMA IOPs (Int8): - value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GIOP/s - peak: $MFMAI8Ops_empirical_peak - HBM Bandwidth: - value: AVG((( - 
(TCC_EA_RDREQ_32B_sum * 32) + - ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64) + - (TCC_EA_WRREQ_64B_sum * 64) + - ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32) - ) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GB/s - peak: $HBMBw_empirical_peak - L2 Cache Bandwidth: - value: AVG(((((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + - TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * - 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GB/s - peak: $L2Bw_empirical_peak - L1 Cache Bandwidth: - value: AVG((((TCP_TOTAL_CACHE_ACCESSES_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GB/s - peak: $L1Bw_empirical_peak - LDS Bandwidth: - value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * - 4 * $lds_banks_per_cu)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GB/s - peak: $LDSBw_empirical_peak - gfx908: - VALU FLOPs (F16): - value: AVG((($wave_size * ( - SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16 - )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $FP16Flops_empirical_peak - VALU FLOPs (F32): - value: AVG((($wave_size * ( - SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32 - )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $FP32Flops_empirical_peak - VALU FLOPs (F64): - value: AVG((($wave_size * ( - SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64 - )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $FP64Flops_empirical_peak - MFMA FLOPs (F64): - value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $MFMAF64Flops_empirical_peak - MFMA FLOPs (F32): - value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: 
GFLOP/s - peak: $MFMAF32Flops_empirical_peak - MFMA FLOPs (F16): - value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $MFMAF16Flops_empirical_peak - MFMA FLOPs (BF16): - value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $MFMABF16Flops_empirical_peak - MFMA IOPs (Int8): - value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GIOP/s - peak: $MFMAI8Ops_empirical_peak - HBM Bandwidth: - value: AVG((( - (TCC_BUBBLE_sum * 128) + - (TCC_EA0_RDREQ_32B_sum * 32) + - ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + - ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + - (TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GB/s - peak: $HBMBw_empirical_peak - L2 Cache Bandwidth: - value: AVG(((((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + - TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * - 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GB/s - peak: $L2Bw_empirical_peak - L1 Cache Bandwidth: - value: AVG((((TCP_TOTAL_CACHE_ACCESSES_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GB/s - peak: $L1Bw_empirical_peak - LDS Bandwidth: - value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * - 4 * $lds_banks_per_cu)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GB/s - peak: $LDSBw_empirical_peak - gfx940: - VALU FLOPs (F16): - value: AVG((($wave_size * ( - SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16 - )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $FP16Flops_empirical_peak - VALU FLOPs (F32): - value: AVG((($wave_size * ( - SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32 - )) / ((End_Timestamp - 
Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $FP32Flops_empirical_peak - VALU FLOPs (F64): - value: AVG((($wave_size * ( - SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64 - )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $FP64Flops_empirical_peak - MFMA FLOPs (F64): - value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $MFMAF64Flops_empirical_peak - MFMA FLOPs (F32): - value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $MFMAF32Flops_empirical_peak - MFMA FLOPs (F16): - value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $MFMAF16Flops_empirical_peak - MFMA FLOPs (BF16): - value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $MFMABF16Flops_empirical_peak - MFMA FLOPs (F8): - value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $MFMAF8Flops_empirical_peak - MFMA IOPs (Int8): - value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GIOP/s - peak: $MFMAI8Ops_empirical_peak - HBM Bandwidth: - value: AVG((( - (TCC_BUBBLE_sum * 128) + - (TCC_EA0_RDREQ_32B_sum * 32) + - ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + - ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + - (TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GB/s - peak: $HBMBw_empirical_peak - L2 Cache Bandwidth: - value: AVG(((((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + - TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * - 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GB/s - peak: 
$L2Bw_empirical_peak - L1 Cache Bandwidth: - value: AVG((((TCP_TOTAL_CACHE_ACCESSES_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GB/s - peak: $L1Bw_empirical_peak - LDS Bandwidth: - value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * - 4 * $lds_banks_per_cu)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GB/s - peak: $LDSBw_empirical_peak - gfx941: - VALU FLOPs (F16): - value: AVG((($wave_size * ( - SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16 - )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $FP16Flops_empirical_peak - VALU FLOPs (F32): - value: AVG((($wave_size * ( - SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32 - )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $FP32Flops_empirical_peak - VALU FLOPs (F64): - value: AVG((($wave_size * ( - SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64 - )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $FP64Flops_empirical_peak - MFMA FLOPs (F64): - value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $MFMAF64Flops_empirical_peak - MFMA FLOPs (F32): - value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $MFMAF32Flops_empirical_peak - MFMA FLOPs (F16): - value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $MFMAF16Flops_empirical_peak - MFMA FLOPs (BF16): - value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $MFMABF16Flops_empirical_peak - MFMA FLOPs (F8): - value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)) / ((End_Timestamp - Start_Timestamp) 
/ 1e9)) / 1e9) - unit: GFLOP/s - peak: $MFMAF8Flops_empirical_peak - MFMA IOPs (Int8): - value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GIOP/s - peak: $MFMAI8Ops_empirical_peak - HBM Bandwidth: - value: AVG((( - (TCC_BUBBLE_sum * 128) + - (TCC_EA0_RDREQ_32B_sum * 32) + - ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + - ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + - (TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GB/s - peak: $HBMBw_empirical_peak - L2 Cache Bandwidth: - value: AVG(((((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + - TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * - 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GB/s - peak: $L2Bw_empirical_peak - L1 Cache Bandwidth: - value: AVG((((TCP_TOTAL_CACHE_ACCESSES_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GB/s - peak: $L1Bw_empirical_peak - LDS Bandwidth: - value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * - 4 * $lds_banks_per_cu)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GB/s - peak: $LDSBw_empirical_peak - gfx942: - VALU FLOPs (F16): - value: AVG((($wave_size * ( - SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16 - )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $FP16Flops_empirical_peak - VALU FLOPs (F32): - value: AVG((($wave_size * ( - SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32 - )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $FP32Flops_empirical_peak - VALU FLOPs (F64): - value: AVG((($wave_size * ( - SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64 - )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: 
$FP64Flops_empirical_peak - MFMA FLOPs (F64): - value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $MFMAF64Flops_empirical_peak - MFMA FLOPs (F32): - value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $MFMAF32Flops_empirical_peak - MFMA FLOPs (F16): - value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $MFMAF16Flops_empirical_peak - MFMA FLOPs (BF16): - value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $MFMABF16Flops_empirical_peak - MFMA FLOPs (F8): - value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $MFMAF8Flops_empirical_peak - MFMA IOPs (Int8): - value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GIOP/s - peak: $MFMAI8Ops_empirical_peak - HBM Bandwidth: - value: AVG((( - (TCC_BUBBLE_sum * 128) + - (TCC_EA0_RDREQ_32B_sum * 32) + - ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + - ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + - (TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GB/s - peak: $HBMBw_empirical_peak - L2 Cache Bandwidth: - value: AVG(((((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + - TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * - 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GB/s - peak: $L2Bw_empirical_peak - L1 Cache Bandwidth: - value: AVG((((TCP_TOTAL_CACHE_ACCESSES_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GB/s - peak: $L1Bw_empirical_peak - LDS Bandwidth: - value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * - 4 * $lds_banks_per_cu)) / ((End_Timestamp - Start_Timestamp) 
/ 1e9)) / 1e9) - unit: GB/s - peak: $LDSBw_empirical_peak - gfx950: - VALU FLOPs (F16): - value: AVG((($wave_size * ( - SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16 - )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $FP16Flops_empirical_peak - VALU FLOPs (F32): - value: AVG((($wave_size * ( - SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32 - )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $FP32Flops_empirical_peak - VALU FLOPs (F64): - value: AVG((($wave_size * ( - SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64 - )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $FP64Flops_empirical_peak - MFMA FLOPs (F64): - value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $MFMAF64Flops_empirical_peak - MFMA FLOPs (F32): - value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $MFMAF32Flops_empirical_peak - MFMA FLOPs (F16): - value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $MFMAF16Flops_empirical_peak - MFMA FLOPs (BF16): - value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $MFMABF16Flops_empirical_peak - MFMA FLOPs (F8): - value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $MFMAF8Flops_empirical_peak - MFMA FLOPs (F6F4): - value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GFLOP/s - peak: $MFMA_FLOPs_F6F4_empirical_peak - MFMA IOPs (Int8): - value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512)) / 
((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GIOP/s - peak: $MFMAI8Ops_empirical_peak - HBM Bandwidth: - value: AVG((( - (TCC_BUBBLE_sum * 128) + - (TCC_EA0_RDREQ_32B_sum * 32) + - ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + - ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + - (TCC_EA0_WRREQ_64B_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GB/s - peak: $HBMBw_empirical_peak - L2 Cache Bandwidth: - value: AVG(((((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + - TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * - 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GB/s - peak: $L2Bw_empirical_peak - L1 Cache Bandwidth: - value: AVG((((TCP_TOTAL_CACHE_ACCESSES_sum * 64)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GB/s - peak: $L1Bw_empirical_peak - LDS Bandwidth: - value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * - 4 * $lds_banks_per_cu)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) - unit: GB/s - peak: $LDSBw_empirical_peak - - metric_table: - id: 402 - title: Roofline Plot Points - cli_style: Roofline - tui_style: Roofline - header: - metric: Metric - value: Value - unit: Unit - metric: - gfx90a: - AI HBM: - value: ( - SUM( - ($wave_size * ( - (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + - (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + - (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) - )) + - (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) - ) / - SUM( - (TCC_EA_RDREQ_32B_sum * 32) + - ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) * 64) + - (TCC_EA_WRREQ_64B_sum * 64) + - ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) * 32) - ) - ) - unit: FLOPs/Byte - AI 
L2: - value: ( - SUM( - ($wave_size * ( - (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + - (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + - (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) - )) + - (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) - ) / - SUM( - (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + - TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64 - ) - ) - unit: FLOPs/Byte - AI L1: - value: ( - SUM( - ($wave_size * ( - (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + - (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + - (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) - )) + - (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) - ) / - SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64) - ) - unit: FLOPs/Byte - Performance (GFLOPs): - value: ( - SUM( - ($wave_size * ( - (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + - (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + - (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) - )) + - (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) - ) / - (SUM(End_Timestamp - Start_Timestamp) / 1e9) - ) / 1e9 - unit: GFLOP/s - gfx908: - AI HBM: - value: ( - SUM( - ($wave_size * ( - 
(SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + - (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + - (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) - )) + - (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) - ) / - SUM( - (TCC_BUBBLE_sum * 128) + - (TCC_EA0_RDREQ_32B_sum * 32) + - ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + - ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + - (TCC_EA0_WRREQ_64B_sum * 64) - ) - ) - unit: FLOPs/Byte - AI L2: - value: ( - SUM( - ($wave_size * ( - (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + - (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + - (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) - )) + - (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) - ) / - SUM( - (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + - TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64 - ) - ) - unit: FLOPs/Byte - AI L1: - value: ( - SUM( - ($wave_size * ( - (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + - (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + - (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) - )) + - (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) - ) / - 
SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64) - ) - unit: FLOPs/Byte - Performance (GFLOPs): - value: ( - SUM( - ($wave_size * ( - (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + - (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + - (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) - )) + - (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) - ) / - (SUM(End_Timestamp - Start_Timestamp) / 1e9) - ) / 1e9 - unit: GFLOP/s - gfx940: - AI HBM: - value: ( - SUM( - ($wave_size * ( - (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + - (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + - (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) - )) + - (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) - ) / - SUM( - (TCC_BUBBLE_sum * 128) + - (TCC_EA0_RDREQ_32B_sum * 32) + - ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + - ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + - (TCC_EA0_WRREQ_64B_sum * 64) - ) - ) - unit: FLOPs/Byte - AI L2: - value: ( - SUM( - ($wave_size * ( - (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + - (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + - (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) - )) + - (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + - 
(SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + - ) / - SUM( - (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + - TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64 - ) - ) - unit: FLOPs/Byte - AI L1: - value: ( - SUM( - ($wave_size * ( - (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + - (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + - (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) - )) + - (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + - ) / - SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64) - ) - unit: FLOPs/Byte - Performance (GFLOPs): - value: ( - SUM( - ($wave_size * ( - (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + - (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + - (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) - )) + - (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) - ) / - (SUM(End_Timestamp - Start_Timestamp) / 1e9) - ) / 1e9 - unit: GFLOP/s - gfx941: - AI HBM: - value: ( - SUM( - ($wave_size * ( - (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + - (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + - (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) - )) + - (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + - 
(SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) - ) / - SUM( - (TCC_BUBBLE_sum * 128) + - (TCC_EA0_RDREQ_32B_sum * 32) + - ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + - ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + - (TCC_EA0_WRREQ_64B_sum * 64) - ) - ) - unit: FLOPs/Byte - AI L2: - value: ( - SUM( - ($wave_size * ( - (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + - (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + - (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) - )) + - (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) - ) / - SUM( - (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + - TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64 - ) - ) - unit: FLOPs/Byte - AI L1: - value: ( - SUM( - ($wave_size * ( - (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + - (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + - (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) - )) + - (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) - ) / - SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64) - ) - unit: FLOPs/Byte - Performance (GFLOPs): - value: ( - SUM( - ($wave_size * ( - (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + - (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * 
SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + - (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) - )) + - (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) - ) / - (SUM(End_Timestamp - Start_Timestamp) / 1e9) - ) / 1e9 - unit: GFLOP/s - gfx942: - AI HBM: - value: ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 - + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) - + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * - 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 - * 512) ) / SUM( (TCC_BUBBLE_sum * 128) + (TCC_EA0_RDREQ_32B_sum * 32) - + ((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) - + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + (TCC_EA0_WRREQ_64B_sum - * 64) ) ) - unit: FLOPs/Byte - AI L2: - value: ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 - + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) - + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * - 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 - * 512) ) / SUM( (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + - TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64 ) ) - unit: FLOPs/Byte - AI L1: - 
value: ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 - + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) - + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * - 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 - * 512) ) / SUM( TCP_TOTAL_CACHE_ACCESSES_sum * 64 ) ) - unit: FLOPs/Byte - Performance (GFLOPs): - value: ( SUM( ($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 - + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) - + SQ_INSTS_VALU_TRANS_F64) )) + (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F32 * - 512) + (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + (SQ_INSTS_VALU_MFMA_MOPS_F8 - * 512) ) / (SUM(End_Timestamp - Start_Timestamp) / 1e9) ) / 1e9 - unit: GFLOP/s - gfx950: - AI HBM: - value: ( - SUM( - ($wave_size * ( - (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + - (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + - (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) - )) + - (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) - ) / - SUM( - (TCC_BUBBLE_sum * 128) + - (TCC_EA0_RDREQ_32B_sum * 32) + - ((TCC_EA0_RDREQ_sum - 
TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) * 64) + - ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) * 32) + - (TCC_EA0_WRREQ_64B_sum * 64) - ) - ) - unit: FLOPs/Byte - AI L2: - value: ( - SUM( - ($wave_size * ( - (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + - (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + - (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) - )) + - (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) - ) / - SUM( - (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum + - TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum + TCP_TCC_READ_REQ_sum) * 64 - ) - ) - unit: FLOPs/Byte - AI L1: - value: ( - SUM( - ($wave_size * ( - (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + - (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + - (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) - )) + - (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) - ) / - SUM(TCP_TOTAL_CACHE_ACCESSES_sum * 64) - ) - unit: FLOPs/Byte - Performance (GFLOPs): - value: ( - SUM( - ($wave_size * ( - (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + - (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + - (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + 
SQ_INSTS_VALU_TRANS_F64) - )) + - (SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) + - (SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) - ) / - (SUM(End_Timestamp - Start_Timestamp) / 1e9) - ) / 1e9 - unit: GFLOP/s - metrics_description: - VALU FLOPs (F16): - plain: |- - The total 16-bit floating-point operations executed per second on the VALU. - This is presented with the value of the peak empirical F16 FLOPs achievable - on the specific accelerator. Note: this does not include any F16 operations - from MFMA instructions. - rst: |- - The total 16-bit floating-point operations executed per second on the :ref:`VALU - <desc-valu>`. This is presented with the value of the peak empirical F16 FLOPs achievable - on the specific accelerator. Note: this does not include any F16 operations - from :ref:`MFMA <desc-mfma>` instructions. - unit: GFLOPs - VALU FLOPs (F32): - plain: |- - The total 32-bit floating-point operations executed per second on the VALU. - This is presented with the value of the peak empirical F32 FLOPs achievable - on the specific accelerator. Note: this does not include any F32 operations - from MFMA instructions. - rst: |- - The total 32-bit floating-point operations executed per second on the :ref:`VALU - <desc-valu>`. This is presented with the value of the peak empirical F32 FLOPs achievable - on the specific accelerator. Note: this does not include any F32 operations - from :ref:`MFMA <desc-mfma>` instructions. - unit: GFLOPs - VALU FLOPs (F64): - plain: |- - The total 64-bit floating-point operations executed per second on the VALU. - This is presented with the value of the peak empirical F64 FLOPs achievable - on the specific accelerator. Note: this does not include any F64 operations - from MFMA instructions. 
- rst: |- - The total 64-bit floating-point operations executed per second on the :ref:`VALU - <desc-valu>`. This is presented with the value of the peak empirical F64 FLOPs achievable - on the specific accelerator. Note: this does not include any F64 operations - from :ref:`MFMA <desc-mfma>` instructions. - unit: GFLOPs - MFMA FLOPs (F8): - plain: The total number of 8-bit brain floating point MFMA operations executed - per second. This does not include any 16-bit brain floating point operations - from VALU instructions. The peak empirically measured F8 MFMA operations - achievable on the specific accelerator is displayed alongside for comparison. - It is supported on AMD Instinct MI300 series and later only. - rst: |- - The total number of 8-bit brain floating point :ref:`MFMA <desc-mfma>` - operations executed per second. Note: this does not include any 16-bit brain - floating point operations from :ref:`VALU <desc-valu>` instructions. The - peak empirically measured F8 MFMA operations achievable on the specific - accelerator is displayed alongside for comparison. It is supported on AMD - Instinct MI300 series and later only. - unit: GFLOPs - MFMA FLOPs (BF16): - plain: |- - The total number of 16-bit brain floating point MFMA operations executed - per second. Note: this does not include any 16-bit brain floating point - operations from VALU instructions. The peak empirically measured BF16 MFMA - operations achievable on the specific accelerator is displayed alongside - for comparison. - rst: |- - The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>` - operations executed per second. Note: this does not include any 16-bit brain - floating point operations from :ref:`VALU <desc-valu>` instructions. The - peak empirically measured BF16 MFMA operations achievable on the specific - accelerator is displayed alongside for comparison. 
- unit: GFLOPs - MFMA FLOPs (F16): - plain: |- - The total number of 16-bit floating point MFMA operations executed per - second. Note: this does not include any 16-bit floating point operations from - VALU instructions. The peak empirically measured F16 MFMA operations - achievable on the specific accelerator is displayed alongside for comparison. - rst: |- - The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations - executed per second. Note: this does not include any 16-bit floating point - operations from :ref:`VALU <desc-valu>` instructions. The peak empirically - measured F16 MFMA operations achievable on the specific accelerator is - displayed alongside for comparison. - unit: GFLOPs - MFMA FLOPs (F32): - plain: |- - The total number of 32-bit floating point MFMA operations executed per - second. Note: this does not include any 32-bit floating point operations from - VALU instructions. The peak empirically measured F32 MFMA operations - achievable on the specific accelerator is displayed alongside for comparison. - rst: |- - The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations - executed per second. Note: this does not include any 32-bit floating point - operations from :ref:`VALU <desc-valu>` instructions. The peak empirically - measured F32 MFMA operations achievable on the specific accelerator is - displayed alongside for comparison. - unit: GFLOPs - MFMA FLOPs (F64): - plain: |- - The total number of 64-bit floating point MFMA operations executed per - second. Note: this does not include any 64-bit floating point operations from - VALU instructions. The peak empirically measured F64 MFMA operations - achievable on the specific accelerator is displayed alongside for comparison. - rst: |- - The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations - executed per second. Note: this does not include any 64-bit floating point - operations from :ref:`VALU <desc-valu>` instructions. 
The peak empirically - measured F64 MFMA operations achievable on the specific accelerator is - displayed alongside for comparison. - unit: GFLOPs - MFMA FLOPs (F6F4): - plain: |- - The total number of 4-bit and 6-bit floating point MFMA operations executed - per second. Note: this does not include any floating point operations from - VALU instructions. The peak empirically measured F6F4 MFMA operations - achievable on the specific accelerator is displayed alongside for comparison. - It is supported on AMD Instinct MI350 series (gfx950) and later only. - rst: |- - The total number of 4-bit and 6-bit floating point :ref:`MFMA <desc-mfma>` - operations executed per second. Note: this does not include any floating point - operations from :ref:`VALU <desc-valu>` instructions. The peak empirically - measured F6F4 MFMA operations achievable on the specific accelerator is - displayed alongside for comparison. It is supported on AMD Instinct MI350 - series (gfx950) and later only. - unit: GFLOPs - MFMA IOPs (Int8): - plain: |- - The total number of 8-bit integer MFMA operations executed per second. - Note: this does not include any 8-bit integer operations from VALU instructions. - The peak empirically measured INT8 MFMA operations achievable on the specific - accelerator is displayed alongside for comparison. - rst: |- - The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed - per second. Note: this does not include any 8-bit integer operations from - :ref:`VALU <desc-valu>` instructions. The peak empirically measured INT8 MFMA - operations achievable on the specific accelerator is displayed alongside - for comparison. - unit: GIOPs - HBM Bandwidth: - plain: |- - The total number of bytes read from and written to High-Bandwidth - Memory (HBM) per second. The peak empirically measured bandwidth achievable - on the specific accelerator is displayed alongside for comparison. 
- rst: |- - The total number of bytes read from and written to High-Bandwidth - Memory (HBM) per second. The peak empirically measured bandwidth achievable - on the specific accelerator is displayed alongside for comparison. - unit: GB/s - L2 Cache Bandwidth: - plain: The number of bytes looked up in the L2 cache per unit time. The number - of bytes is calculated as the number of cache lines requested multiplied by - the cache line size. This value does not consider partial requests, so e.g., - if only a single value is requested in a cache line, the data movement will - still be counted as a full cache line. The peak empirically measured bandwidth - achievable on the specific accelerator is displayed alongside for comparison. - rst: The number of bytes looked up in the L2 cache per unit time. The number of - bytes is calculated as the number of cache lines requested multiplied by - the cache line size. This value does not consider partial requests, so e.g., - if only a single value is requested in a cache line, the data movement will - still be counted as a full cache line. The peak empirically measured - bandwidth achievable on the specific accelerator is displayed alongside - for comparison. - unit: GB/s - L1 Cache Bandwidth: - plain: The number of bytes looked up in the vL1D cache as a result of VMEM - instructions per unit time. The number of bytes is calculated as the number - of cache lines requested multiplied by the cache line size. This value does - not consider partial requests, so e.g., if only a single value is requested - in a cache line, the data movement will still be counted as a full cache line. - The peak empirically measured bandwidth achievable on the specific accelerator - is displayed alongside for comparison. - rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM - <desc-vmem>` instructions per unit time. The number of bytes is calculated - as the number of cache lines requested multiplied by the cache line size. 
- This value does not consider partial requests, so e.g., if only a single - value is requested in a cache line, the data movement will still be counted - as a full cache line. The peak empirically measured bandwidth achievable on - the specific accelerator is displayed alongside for comparison. - unit: GB/s - LDS Bandwidth: - plain: Indicates the maximum amount of bytes that could have been loaded from, - stored to, or atomically updated in the LDS per unit time (see LDS Bandwidth - example for more detail). The peak empirically measured LDS bandwidth - achievable on the specific accelerator is displayed alongside for comparison. - rst: Indicates the maximum amount of bytes that could have been loaded from, - stored to, or atomically updated in the LDS per unit time (see :ref:`LDS - Bandwidth <lds-bandwidth>` example for more detail). The peak empirically - measured LDS bandwidth achievable on the specific accelerator is displayed - alongside for comparison. - unit: GB/s - AI L1: - plain: |- - The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio - of total floating-point operations (FLOPs) to total bytes transferred between - the L1 cache and the processing units. This value is used as the x-coordinate - for the L1 roofline. - rst: |- - The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio - of total floating-point operations (FLOPs) to total bytes transferred between - the L1 cache and the processing units. This value is used as the x-coordinate - for the L1 roofline. - unit: FLOPs/Byte - AI L2: - plain: |- - The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio - of total floating-point operations (FLOPs) to total bytes transferred between - the L2 cache and the L1 cache. This value is used as the x-coordinate for - the L2 roofline. - rst: |- - The Arithmetic Intensity (AI) relative to the L2 Cache. 
It is the ratio - of total floating-point operations (FLOPs) to total bytes transferred between - the L2 cache and the L1 cache. This value is used as the x-coordinate for - the L2 roofline. - unit: FLOPs/Byte - AI HBM: - plain: |- - The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM). - It is the ratio of total floating-point operations (FLOPs) to total bytes - transferred between HBM and the L2 cache. This value is used as the x-coordinate - for the HBM roofline. - rst: |- - The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM). - It is the ratio of total floating-point operations (FLOPs) to total bytes - transferred between HBM and the L2 cache. This value is used as the x-coordinate - for the HBM roofline. - unit: FLOPs/Byte - Performance (GFLOPs): - plain: |- - The overall achieved performance, measured in GigaFLOPs - per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point - operations divided by the total execution time. This value is used as the y-coordinate - for the kernel's point on the Roofline plot. - rst: |- - The overall achieved performance, measured in GigaFLOPs - per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point - operations divided by the total execution time. This value is used as the y-coordinate - for the kernel's point on the Roofline plot. 
- unit: GFLOP/s -- id: 500 - title: Command Processor (CPC/CPF) - data source: - - metric_table: - id: 501 - title: Command processor fetcher (CPF) - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - metric: - gfx90a: - CPF Utilization: - avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - unit: pct - CPF Stall: - avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - unit: pct - CPF-L2 Utilization: - avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - unit: pct - CPF-L2 Stall: - avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - unit: pct - CPF-UTCL1 Stall: - avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) - if (CPF_CPF_STAT_BUSY != 0) else None) - min: 
MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) - if (CPF_CPF_STAT_BUSY != 0) else None) - max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) - if (CPF_CPF_STAT_BUSY != 0) else None) - unit: pct - gfx941: - CPF Utilization: - avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - unit: pct - CPF Stall: - avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - unit: pct - CPF-L2 Utilization: - avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - unit: pct - CPF-L2 Stall: - avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - unit: pct - CPF-UTCL1 Stall: - avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) - if (CPF_CPF_STAT_BUSY != 0) else None) - 
min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) - if (CPF_CPF_STAT_BUSY != 0) else None) - max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) - if (CPF_CPF_STAT_BUSY != 0) else None) - unit: pct - gfx940: - CPF Utilization: - avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - unit: pct - CPF Stall: - avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - unit: pct - CPF-L2 Utilization: - avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - unit: pct - CPF-L2 Stall: - avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - unit: pct - CPF-UTCL1 Stall: - avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) - if (CPF_CPF_STAT_BUSY != 0) else 
None) - min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) - if (CPF_CPF_STAT_BUSY != 0) else None) - max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) - if (CPF_CPF_STAT_BUSY != 0) else None) - unit: pct - gfx942: - CPF Utilization: - avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - unit: pct - CPF Stall: - avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - unit: pct - CPF-L2 Utilization: - avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - unit: pct - CPF-L2 Stall: - avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - unit: pct - CPF-UTCL1 Stall: - avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) - if (CPF_CPF_STAT_BUSY != 0) 
else None) - min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) - if (CPF_CPF_STAT_BUSY != 0) else None) - max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) - if (CPF_CPF_STAT_BUSY != 0) else None) - unit: pct - gfx950: - CPF Utilization: - avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - unit: pct - CPF Stall: - avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - unit: pct - CPF-L2 Utilization: - avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - unit: pct - CPF-L2 Stall: - avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - unit: pct - CPF-UTCL1 Stall: - avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) - if (CPF_CPF_STAT_BUSY 
!= 0) else None) - min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) - if (CPF_CPF_STAT_BUSY != 0) else None) - max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) - if (CPF_CPF_STAT_BUSY != 0) else None) - unit: pct - gfx908: - CPF Utilization: - avg: AVG((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - min: MIN((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - max: MAX((((100 * CPF_CPF_STAT_BUSY) / (CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE)) - if ((CPF_CPF_STAT_BUSY + CPF_CPF_STAT_IDLE) != 0) else None)) - unit: pct - CPF Stall: - avg: AVG((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - min: MIN((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - max: MAX((((100 * CPF_CPF_STAT_STALL) / CPF_CPF_STAT_BUSY) if (CPF_CPF_STAT_BUSY - != 0) else None)) - unit: pct - CPF-L2 Utilization: - avg: AVG((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - min: MIN((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - max: MAX((((100 * CPF_CPF_TCIU_BUSY) / (CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE)) - if ((CPF_CPF_TCIU_BUSY + CPF_CPF_TCIU_IDLE) != 0) else None)) - unit: pct - CPF-L2 Stall: - avg: AVG((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - min: MIN((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - max: MAX((((100 * CPF_CPF_TCIU_STALL) / CPF_CPF_TCIU_BUSY) if (CPF_CPF_TCIU_BUSY - != 0) else None)) - unit: pct - CPF-UTCL1 Stall: - avg: AVG(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) - if 
(CPF_CPF_STAT_BUSY != 0) else None) - min: MIN(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) - if (CPF_CPF_STAT_BUSY != 0) else None) - max: MAX(((100 * CPF_CMP_UTCL1_STALL_ON_TRANSLATION) / CPF_CPF_STAT_BUSY) - if (CPF_CPF_STAT_BUSY != 0) else None) - unit: pct - - metric_table: - id: 502 - title: Command processor packet processor (CPC) - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - metric: - gfx90a: - CPC Utilization: - avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - unit: pct - CPC Stall Rate: - avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - unit: pct - CPC Packet Decoding Utilization: - avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if - (CPC_CPC_STAT_BUSY != 0) else None) - min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if - (CPC_CPC_STAT_BUSY != 0) else None) - max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if - (CPC_CPC_STAT_BUSY != 0) else None) - unit: pct - CPC-Workgroup Manager Utilization: - avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY - != 0) else None) - min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY - != 0) else None) - max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY - != 0) else None) - unit: Pct - 
CPC-L2 Utilization: - avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) - min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) - max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) - unit: pct - CPC-UTCL1 Stall: - avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) - if (CPC_CPC_STAT_BUSY != 0) else None) - min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) - if (CPC_CPC_STAT_BUSY != 0) else None) - max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) - if (CPC_CPC_STAT_BUSY != 0) else None) - unit: pct - CPC-UTCL2 Utilization: - avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - unit: pct - gfx941: - CPC Utilization: - avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - unit: pct - CPC Stall Rate: - avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - min: MIN((((100 * CPC_CPC_STAT_STALL) 
/ CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - unit: pct - CPC Packet Decoding Utilization: - avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if - (CPC_CPC_STAT_BUSY != 0) else None) - min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if - (CPC_CPC_STAT_BUSY != 0) else None) - max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if - (CPC_CPC_STAT_BUSY != 0) else None) - unit: pct - CPC-Workgroup Manager Utilization: - avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY - != 0) else None) - min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY - != 0) else None) - max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY - != 0) else None) - unit: Pct - CPC-L2 Utilization: - avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) - min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) - max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) - unit: pct - CPC-UTCL1 Stall: - avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) - if (CPC_CPC_STAT_BUSY != 0) else None) - min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) - if (CPC_CPC_STAT_BUSY != 0) else None) - max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) - if (CPC_CPC_STAT_BUSY != 0) else None) - unit: pct - CPC-UTCL2 Utilization: - avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / 
(CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - unit: pct - gfx940: - CPC Utilization: - avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - unit: pct - CPC Stall Rate: - avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - unit: pct - CPC Packet Decoding Utilization: - avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if - (CPC_CPC_STAT_BUSY != 0) else None) - min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if - (CPC_CPC_STAT_BUSY != 0) else None) - max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if - (CPC_CPC_STAT_BUSY != 0) else None) - unit: pct - CPC-Workgroup Manager Utilization: - avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY - != 0) else None) - min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY - != 0) else None) - max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY - != 0) else None) - unit: Pct - CPC-L2 Utilization: - avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) 
else None)) - min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) - max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) - unit: pct - CPC-UTCL1 Stall: - avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) - if (CPC_CPC_STAT_BUSY != 0) else None) - min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) - if (CPC_CPC_STAT_BUSY != 0) else None) - max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) - if (CPC_CPC_STAT_BUSY != 0) else None) - unit: pct - CPC-UTCL2 Utilization: - avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - unit: pct - gfx942: - CPC Utilization: - avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - unit: pct - CPC Stall Rate: - avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) 
else None)) - unit: pct - CPC Packet Decoding Utilization: - avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if - (CPC_CPC_STAT_BUSY != 0) else None) - min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if - (CPC_CPC_STAT_BUSY != 0) else None) - max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if - (CPC_CPC_STAT_BUSY != 0) else None) - unit: pct - CPC-Workgroup Manager Utilization: - avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY - != 0) else None) - min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY - != 0) else None) - max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY - != 0) else None) - unit: Pct - CPC-L2 Utilization: - avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) - min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) - max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) - unit: pct - CPC-UTCL1 Stall: - avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) - if (CPC_CPC_STAT_BUSY != 0) else None) - min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) - if (CPC_CPC_STAT_BUSY != 0) else None) - max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) - if (CPC_CPC_STAT_BUSY != 0) else None) - unit: pct - CPC-UTCL2 Utilization: - avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / 
(CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - unit: pct - gfx950: - CPC SYNC FIFO Full Rate: - avg: AVG((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY - != 0) else None) - min: MIN((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY - != 0) else None) - max: MAX((100 * CPC_SYNC_FIFO_FULL) / CPC_SYNC_WRREQ_FIFO_BUSY if (CPC_SYNC_WRREQ_FIFO_BUSY - != 0) else None) - unit: pct - CPC CANE Stall Rate: - avg: AVG((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) - else None) - min: MIN((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) - else None) - max: MAX((100 * CPC_CANE_STALL) / CPC_CANE_BUSY if (CPC_CANE_BUSY != 0) - else None) - unit: pct - CPC ADC Utilization: - avg: AVG((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else - None) - min: MIN((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else - None) - max: MAX((100 * CPC_TG_SEND) / CPC_GD_BUSY if (CPC_GD_BUSY != 0) else - None) - unit: pct - CPC Utilization: - avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - unit: pct - CPC Stall Rate: - avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - unit: pct - CPC Packet Decoding Utilization: - avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / 
CPC_CPC_STAT_BUSY if - (CPC_CPC_STAT_BUSY != 0) else None) - min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if - (CPC_CPC_STAT_BUSY != 0) else None) - max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if - (CPC_CPC_STAT_BUSY != 0) else None) - unit: pct - CPC-Workgroup Manager Utilization: - avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY - != 0) else None) - min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY - != 0) else None) - max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY - != 0) else None) - unit: Pct - CPC-L2 Utilization: - avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) - min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) - max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) - unit: pct - CPC-UTCL1 Stall: - avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) - if (CPC_CPC_STAT_BUSY != 0) else None) - min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) - if (CPC_CPC_STAT_BUSY != 0) else None) - max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) - if (CPC_CPC_STAT_BUSY != 0) else None) - unit: pct - CPC-UTCL2 Utilization: - avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - 
unit: pct - gfx908: - CPC Utilization: - avg: AVG((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - min: MIN((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - max: MAX((((100 * CPC_CPC_STAT_BUSY) / (CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE)) - if ((CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) != 0) else None)) - unit: pct - CPC Stall Rate: - avg: AVG((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - min: MIN((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - max: MAX((((100 * CPC_CPC_STAT_STALL) / CPC_CPC_STAT_BUSY) if (CPC_CPC_STAT_BUSY - != 0) else None)) - unit: pct - CPC Packet Decoding Utilization: - avg: AVG((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if - (CPC_CPC_STAT_BUSY != 0) else None) - min: MIN((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if - (CPC_CPC_STAT_BUSY != 0) else None) - max: MAX((100 * CPC_ME1_BUSY_FOR_PACKET_DECODE) / CPC_CPC_STAT_BUSY if - (CPC_CPC_STAT_BUSY != 0) else None) - unit: pct - CPC-Workgroup Manager Utilization: - avg: AVG((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY - != 0) else None) - min: MIN((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY - != 0) else None) - max: MAX((100 * CPC_ME1_DC0_SPI_BUSY) / CPC_CPC_STAT_BUSY if (CPC_CPC_STAT_BUSY - != 0) else None) - unit: Pct - CPC-L2 Utilization: - avg: AVG((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) - min: MIN((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE) != 0) else None)) - max: MAX((((100 * CPC_CPC_TCIU_BUSY) / (CPC_CPC_TCIU_BUSY + CPC_CPC_TCIU_IDLE)) - if ((CPC_CPC_TCIU_BUSY + 
CPC_CPC_TCIU_IDLE) != 0) else None)) - unit: pct - CPC-UTCL1 Stall: - avg: AVG(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) - if (CPC_CPC_STAT_BUSY != 0) else None) - min: MIN(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) - if (CPC_CPC_STAT_BUSY != 0) else None) - max: MAX(((100 * CPC_UTCL1_STALL_ON_TRANSLATION) / CPC_CPC_STAT_BUSY) - if (CPC_CPC_STAT_BUSY != 0) else None) - unit: pct - CPC-UTCL2 Utilization: - avg: AVG((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - min: MIN((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - max: MAX((((100 * CPC_CPC_UTCL2IU_BUSY) / (CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE)) - if ((CPC_CPC_UTCL2IU_BUSY + CPC_CPC_UTCL2IU_IDLE) != 0) else None)) - unit: pct - metrics_description: - CPF Utilization: - plain: Percent of total cycles where the CPF was busy actively doing any work. - The ratio of CPF busy cycles over total cycles counted by the CPF. - rst: Percent of total cycles where the CPF was busy actively doing any work. - The ratio of CPF busy cycles over total cycles counted by the CPF. - unit: Percent - CPF Stall: - plain: Percent of CPF busy cycles where the CPF was stalled for any reason. - rst: Percent of CPF busy cycles where the CPF was stalled for any reason. - unit: Percent - CPF-L2 Utilization: - plain: Percent of total cycles counted by the CPF-L2 interface where the CPF-L2 - interface was active doing any work. The ratio of CPF-L2 busy cycles over - total cycles counted by the CPF-L2. - rst: Percent of total cycles counted by the CPF-:doc:`L2 <l2-cache>` interface where - the CPF-L2 interface was active doing any work. The ratio of CPF-L2 busy - cycles over total cycles counted by the CPF-L2. 
- unit: Percent - CPF-L2 Stall: - plain: Percent of CPF-L2 L2 busy cycles where the CPF-L2 interface was stalled - for any reason. - rst: Percent of CPF-:doc:`L2 <l2-cache>` L2 busy cycles where the CPF-L2 interface - was stalled for any reason. - unit: Percent - CPF-UTCL1 Stall: - plain: Percent of CPF busy cycles where the CPF was stalled by address translation. - rst: Percent of CPF busy cycles where the CPF was stalled by address translation. - unit: Percent - CPC Utilization: - plain: Percent of total cycles where the CPC was busy actively doing any work. - The ratio of CPC busy cycles over total cycles counted by the CPC. - rst: Percent of total cycles where the CPC was busy actively doing any work. - The ratio of CPC busy cycles over total cycles counted by the CPC. - unit: Percent - CPC Stall Rate: - plain: Percent of CPC busy cycles where the CPC was stalled for any reason. - rst: Percent of CPC busy cycles where the CPC was stalled for any reason. - unit: Percent - CPC Packet Decoding Utilization: - plain: Percent of CPC busy cycles spent decoding commands for processing. - rst: Percent of CPC busy cycles spent decoding commands for processing. - unit: Percent - CPC-Workgroup Manager Utilization: - plain: Percent of CPC busy cycles spent dispatching workgroups to the workgroup - manager. - rst: Percent of CPC busy cycles spent dispatching workgroups to the :ref:`workgroup - manager <desc-spi>`. - unit: Percent - CPC-L2 Utilization: - plain: Percent of total cycles counted by the CPC-L2 interface where the CPC-L2 - interface was active doing any work. - rst: Percent of total cycles counted by the CPC-:doc:`L2 <l2-cache>` interface where - the CPC-L2 interface was active doing any work. 
- unit: Percent - CPC-UTCL1 Stall: - plain: Percent of CPC busy cycles where the CPC was stalled by address translation - rst: Percent of CPC busy cycles where the CPC was stalled by address translation - unit: Percent - CPC-UTCL2 Utilization: - plain: |- - Percent of total cycles counted by the CPC's L2 address translation - interface where the CPC was busy doing address translation work. - rst: Percent of total cycles counted by the CPC's :doc:`L2 <l2-cache>` address translation - interface where the CPC was busy doing address translation work. - unit: Percent -- id: 600 - title: Workgroup Manager (SPI) - data source: - - metric_table: - id: 601 - title: Workgroup manager utilizations - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - metric: - gfx90a: - Accelerator Utilization: - avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - unit: Pct - Scheduler-Pipe Utilization: - avg: AVG(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu - * $se_per_gpu)) - min: MIN(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu - * $se_per_gpu)) - max: MAX(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu - * $se_per_gpu)) - unit: Pct - Workgroup Manager Utilization: - avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) - min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) - max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) - unit: Pct - Shader Engine Utilization: - avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - unit: Pct - SIMD Utilization: - avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(100 * SQ_BUSY_CU_CYCLES / 
($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - Dispatched Workgroups: - avg: AVG(SPI_CSN_NUM_THREADGROUPS) - min: MIN(SPI_CSN_NUM_THREADGROUPS) - max: MAX(SPI_CSN_NUM_THREADGROUPS) - unit: Workgroups - Dispatched Wavefronts: - avg: AVG(SPI_CSN_WAVE) - min: MIN(SPI_CSN_WAVE) - max: MAX(SPI_CSN_WAVE) - unit: Wavefronts - VGPR Writes: - avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) - else None)) - min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) - else None)) - max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) - else None)) - unit: Cycles/wave - SGPR Writes: - avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) - else None)) - min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) - else None)) - max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) - else None)) - unit: Cycles/wave - gfx941: - Accelerator Utilization: - avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - unit: Pct - Scheduler-Pipe Utilization: - avg: AVG(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu - * $se_per_gpu)) - min: MIN(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu - * $se_per_gpu)) - max: MAX(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu - * $se_per_gpu)) - unit: Pct - Workgroup Manager Utilization: - avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) - min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) - max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) - unit: Pct - Shader Engine Utilization: - avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - 
max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - unit: Pct - SIMD Utilization: - avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - Dispatched Workgroups: - avg: AVG(SPI_CSN_NUM_THREADGROUPS) - min: MIN(SPI_CSN_NUM_THREADGROUPS) - max: MAX(SPI_CSN_NUM_THREADGROUPS) - unit: Workgroups - Dispatched Wavefronts: - avg: AVG(SPI_CSN_WAVE) - min: MIN(SPI_CSN_WAVE) - max: MAX(SPI_CSN_WAVE) - unit: Wavefronts - VGPR Writes: - avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) - else None)) - min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) - else None)) - max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) - else None)) - unit: Cycles/wave - SGPR Writes: - avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) - else None)) - min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) - else None)) - max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) - else None)) - unit: Cycles/wave - gfx940: - Accelerator Utilization: - avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - unit: Pct - Scheduler-Pipe Utilization: - avg: AVG(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu - * $se_per_gpu)) - min: MIN(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu - * $se_per_gpu)) - max: MAX(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu - * $se_per_gpu)) - unit: Pct - Workgroup Manager Utilization: - avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) - min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) - max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / 
$GRBM_GUI_ACTIVE_PER_XCD) - unit: Pct - Shader Engine Utilization: - avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - unit: Pct - SIMD Utilization: - avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - Dispatched Workgroups: - avg: AVG(SPI_CSN_NUM_THREADGROUPS) - min: MIN(SPI_CSN_NUM_THREADGROUPS) - max: MAX(SPI_CSN_NUM_THREADGROUPS) - unit: Workgroups - Dispatched Wavefronts: - avg: AVG(SPI_CSN_WAVE) - min: MIN(SPI_CSN_WAVE) - max: MAX(SPI_CSN_WAVE) - unit: Wavefronts - VGPR Writes: - avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) - else None)) - min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) - else None)) - max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) - else None)) - unit: Cycles/wave - SGPR Writes: - avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) - else None)) - min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) - else None)) - max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) - else None)) - unit: Cycles/wave - gfx942: - Accelerator Utilization: - avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - unit: Pct - Scheduler-Pipe Utilization: - avg: AVG(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu - * $se_per_gpu)) - min: MIN(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu - * $se_per_gpu)) - max: MAX(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu - * $se_per_gpu)) - 
unit: Pct - Workgroup Manager Utilization: - avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) - min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) - max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) - unit: Pct - Shader Engine Utilization: - avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - unit: Pct - SIMD Utilization: - avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - Dispatched Workgroups: - avg: AVG(SPI_CSN_NUM_THREADGROUPS) - min: MIN(SPI_CSN_NUM_THREADGROUPS) - max: MAX(SPI_CSN_NUM_THREADGROUPS) - unit: Workgroups - Dispatched Wavefronts: - avg: AVG(SPI_CSN_WAVE) - min: MIN(SPI_CSN_WAVE) - max: MAX(SPI_CSN_WAVE) - unit: Wavefronts - VGPR Writes: - avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) - else None)) - min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) - else None)) - max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) - else None)) - unit: Cycles/wave - SGPR Writes: - avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) - else None)) - min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) - else None)) - max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) - else None)) - unit: Cycles/wave - gfx950: - Schedule-Pipe Wave Occupancy: - avg: AVG(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY - + SPI_CSQ_P3_OCCUPANCY) - min: MIN(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY - + SPI_CSQ_P3_OCCUPANCY) - max: MAX(SPI_CSQ_P0_OCCUPANCY + SPI_CSQ_P1_OCCUPANCY + SPI_CSQ_P2_OCCUPANCY - + 
SPI_CSQ_P3_OCCUPANCY) - unit: Wave - Accelerator Utilization: - avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - max: MAX(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - unit: Pct - Scheduler-Pipe Utilization: - avg: AVG(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) - / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - min: MIN(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) - / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - max: MAX(100 * (SPI_CS0_BUSY + SPI_CS1_BUSY + SPI_CS2_BUSY + SPI_CS3_BUSY) - / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu * $se_per_gpu)) - unit: Pct - Scheduler-Pipe Wave Utilization: - avg: AVG(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu - * $se_per_gpu)) - min: MIN(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu - * $se_per_gpu)) - max: MAX(100 * (SPI_CSC_WAVE_CNT_BUSY) / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu - * $se_per_gpu)) - unit: Pct - Workgroup Manager Utilization: - avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) - min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) - max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) - unit: Pct - Shader Engine Utilization: - avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - unit: Pct - SIMD Utilization: - avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - Dispatched Workgroups: - avg: AVG(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS - + 
SPI_CS3_NUM_THREADGROUPS) - min: MIN(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS - + SPI_CS3_NUM_THREADGROUPS) - max: MAX(SPI_CS0_NUM_THREADGROUPS + SPI_CS1_NUM_THREADGROUPS + SPI_CS2_NUM_THREADGROUPS - + SPI_CS3_NUM_THREADGROUPS) - unit: Workgroups - Dispatched Wavefronts: - avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - unit: Wavefronts - VGPR Writes: - avg: AVG((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE - + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE - + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - min: MIN((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE - + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE - + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - max: MAX((((SPI_VWC0_VDATA_VALID_WR + SPI_VWC1_VDATA_VALID_WR) / (SPI_CS0_WAVE - + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE - + SPI_CS2_WAVE + SPI_CS3_WAVE) != 0) else None)) - unit: Cycles/wave - SGPR Writes: - avg: AVG((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE - + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - != 0) else None)) - min: MIN((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE - + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - != 0) else None)) - max: MAX((((1 * SPI_SWC_CSC_WR) / (SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE - + SPI_CS3_WAVE)) if ((SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - != 0) else None)) - unit: Cycles/wave - gfx908: - Accelerator Utilization: - avg: AVG(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - min: MIN(100 * $GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - max: MAX(100 * 
$GRBM_GUI_ACTIVE_PER_XCD / $GRBM_COUNT_PER_XCD) - unit: Pct - Scheduler-Pipe Utilization: - avg: AVG(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu - * $se_per_gpu)) - min: MIN(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu - * $se_per_gpu)) - max: MAX(100 * SPI_CSN_BUSY / ($GRBM_GUI_ACTIVE_PER_XCD * $pipes_per_gpu - * $se_per_gpu)) - unit: Pct - Workgroup Manager Utilization: - avg: AVG(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) - min: MIN(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) - max: MAX(100 * $GRBM_SPI_BUSY_PER_XCD / $GRBM_GUI_ACTIVE_PER_XCD) - unit: Pct - Shader Engine Utilization: - avg: AVG(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - min: MIN(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - max: MAX(100 * SQ_BUSY_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $se_per_gpu)) - unit: Pct - SIMD Utilization: - avg: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - Dispatched Workgroups: - avg: AVG(SPI_CSN_NUM_THREADGROUPS) - min: MIN(SPI_CSN_NUM_THREADGROUPS) - max: MAX(SPI_CSN_NUM_THREADGROUPS) - unit: Workgroups - Dispatched Wavefronts: - avg: AVG(SPI_CSN_WAVE) - min: MIN(SPI_CSN_WAVE) - max: MAX(SPI_CSN_WAVE) - unit: Wavefronts - VGPR Writes: - avg: AVG((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) - else None)) - min: MIN((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) - else None)) - max: MAX((((4 * SPI_VWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) - else None)) - unit: Cycles/wave - SGPR Writes: - avg: AVG((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) - else None)) - min: MIN((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) - else None)) - max: MAX((((1 * SPI_SWC_CSC_WR) / SPI_CSN_WAVE) if (SPI_CSN_WAVE != 0) 
- else None)) - unit: Cycles/wave - - metric_table: - id: 602 - title: Workgroup Manager - Resource Allocation - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - metric: - gfx90a: - Not-scheduled Rate (Workgroup Manager): - avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - unit: Pct - Not-scheduled Rate (Scheduler-Pipe): - avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - unit: Pct - Scheduler-Pipe Stall Rate: - avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) - min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) - max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) - unit: Pct - Scratch Stall Rate: - avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - unit: Pct - Insufficient SIMD Waveslots: - avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN 
/ ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - unit: Pct - Insufficient SIMD VGPRs: - avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - unit: Pct - Insufficient SIMD SGPRs: - avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - unit: Pct - Insufficient CU LDS: - avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - Insufficient CU Barriers: - avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - Reached CU Workgroup Limit: - avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - unit: Pct - Reached CU Wavefront Limit: - avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD 
* $cu_per_gpu)) - unit: Pct - gfx941: - Not-scheduled Rate (Workgroup Manager): - avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - unit: Pct - Not-scheduled Rate (Scheduler-Pipe): - avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - unit: Pct - Scheduler-Pipe Stall Rate: - avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) - min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) - max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) - unit: Pct - Scratch Stall Rate: - avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - unit: Pct - Insufficient SIMD Waveslots: - avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - max: MAX(100 * 
SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - unit: Pct - Insufficient SIMD VGPRs: - avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - unit: Pct - Insufficient SIMD SGPRs: - avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - unit: Pct - Insufficient CU LDS: - avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - Insufficient CU Barriers: - avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - Reached CU Workgroup Limit: - avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - unit: Pct - Reached CU Wavefront Limit: - avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - gfx940: - Not-scheduled Rate (Workgroup Manager): - avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * 
$se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - unit: Pct - Not-scheduled Rate (Scheduler-Pipe): - avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - unit: Pct - Scheduler-Pipe Stall Rate: - avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) - min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) - max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) - unit: Pct - Scratch Stall Rate: - avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - unit: Pct - Insufficient SIMD Waveslots: - avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - unit: Pct - Insufficient SIMD VGPRs: - avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / 
($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - unit: Pct - Insufficient SIMD SGPRs: - avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - unit: Pct - Insufficient CU LDS: - avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - Insufficient CU Barriers: - avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - Reached CU Workgroup Limit: - avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - unit: Pct - Reached CU Wavefront Limit: - avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - gfx942: - Not-scheduled Rate (Workgroup Manager): - avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if 
($GRBM_SPI_BUSY_PER_XCD != 0) else None) - max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - unit: Pct - Not-scheduled Rate (Scheduler-Pipe): - avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - unit: Pct - Scheduler-Pipe Stall Rate: - avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) - min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) - max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) - unit: Pct - Scratch Stall Rate: - avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - unit: Pct - Insufficient SIMD Waveslots: - avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - unit: Pct - Insufficient SIMD VGPRs: - avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - max: MAX(100 * 
SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - unit: Pct - Insufficient SIMD SGPRs: - avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - unit: Pct - Insufficient CU LDS: - avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - Insufficient CU Barriers: - avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - Reached CU Workgroup Limit: - avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - unit: Pct - Reached CU Wavefront Limit: - avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - gfx950: - Not-scheduled Rate (Workgroup Manager): - avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) 
else None) - unit: Pct - Not-scheduled Rate (Scheduler-Pipe): - avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - unit: Pct - Scheduler-Pipe FIFO Full Rate: - avg: AVG((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + SPI_CS2_CRAWLER_STALL - + SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if - ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - min: MIN((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + SPI_CS2_CRAWLER_STALL - + SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if - ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - max: MAX((100 * (SPI_CS0_CRAWLER_STALL + SPI_CS1_CRAWLER_STALL + SPI_CS2_CRAWLER_STALL - + SPI_CS3_CRAWLER_STALL) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) if - ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - unit: Pct - Scheduler-Pipe Stall Rate: - avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) - min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) - max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) - unit: Pct - Scratch Stall Rate: - avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - unit: Pct - Insufficient SIMD Waveslots: - avg: 
AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - unit: Pct - Insufficient SIMD VGPRs: - avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - unit: Pct - Insufficient SIMD SGPRs: - avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - unit: Pct - Insufficient CU LDS: - avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - Insufficient CU Barriers: - avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - Reached CU Workgroup Limit: - avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - unit: Pct - Reached CU Wavefront Limit: - avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * 
SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - gfx908: - Not-scheduled Rate (Workgroup Manager): - avg: AVG((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - min: MIN((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - max: MAX((100 * SPI_RA_REQ_NO_ALLOC_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - unit: Pct - Not-scheduled Rate (Scheduler-Pipe): - avg: AVG((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - min: MIN((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - max: MAX((100 * SPI_RA_REQ_NO_ALLOC / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - unit: Pct - Scheduler-Pipe Stall Rate: - avg: AVG((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) - min: MIN((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) - max: MAX((((100 * SPI_RA_RES_STALL_CSN) / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None)) - unit: Pct - Scratch Stall Rate: - avg: AVG((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - min: MIN((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - max: MAX((100 * SPI_RA_TMP_STALL_CSN / ($GRBM_SPI_BUSY_PER_XCD * $se_per_gpu)) - if ($GRBM_SPI_BUSY_PER_XCD != 0) else None) - unit: Pct - Insufficient SIMD Waveslots: - avg: AVG(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - min: MIN(100 * SPI_RA_WAVE_SIMD_FULL_CSN / 
($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - max: MAX(100 * SPI_RA_WAVE_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - unit: Pct - Insufficient SIMD VGPRs: - avg: AVG(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - min: MIN(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - max: MAX(100 * SPI_RA_VGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - unit: Pct - Insufficient SIMD SGPRs: - avg: AVG(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - min: MIN(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - max: MAX(100 * SPI_RA_SGPR_SIMD_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - unit: Pct - Insufficient CU LDS: - avg: AVG(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_LDS_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - Insufficient CU Barriers: - avg: AVG(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_BAR_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - Reached CU Workgroup Limit: - avg: AVG(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - min: MIN(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - max: MAX(400 * SPI_RA_TGLIM_CU_FULL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * - $cu_per_gpu)) - unit: Pct - Reached CU Wavefront Limit: - avg: AVG(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - min: MIN(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - max: MAX(400 * SPI_RA_WVLIM_STALL_CSN / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)) - unit: Pct - metrics_description: - Accelerator Utilization: - plain: The 
percent of cycles in the kernel where the accelerator was actively - doing any work. - rst: The percent of cycles in the kernel where the accelerator was actively - doing any work. - unit: Percent - Scheduler-Pipe Utilization: - plain: The percent of total scheduler-pipe cycles in the kernel where the scheduler-pipes - were actively doing any work. - rst: |- - The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>` - in the kernel where the scheduler-pipes were actively doing any work. Note: this - value is expected to range between 0% and 25%. See :ref:`desc-spi`. - unit: Percent - Workgroup Manager Utilization: - plain: The percent of cycles in the kernel where the workgroup manager was actively - doing any work. - rst: The percent of cycles in the kernel where the workgroup manager was actively - doing any work. - unit: Percent - Shader Engine Utilization: - plain: The percent of total shader engine cycles in the kernel where any CU - in a shader-engine was actively doing any work, normalized over all shader-engines. - Low values (e.g., << 100%) indicate that the accelerator was not fully saturated - by the kernel, or a potential load-imbalance issue. - rst: The percent of :ref:`total shader engine cycles <total-se-cycles>` in the kernel - where any CU in a shader-engine was actively doing any work, normalized over - all shader-engines. Low values (e.g., << 100%) indicate that the accelerator - was not fully saturated by the kernel, or a potential load-imbalance issue. - unit: Percent - SIMD Utilization: - plain: The percent of total SIMD cycles in the kernel where any SIMD on a CU - was actively doing any work, summed over all CUs. Low values (less than 100%) - indicate that the accelerator was not fully saturated by the kernel, or a - potential load-imbalance issue. - rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel where - any :ref:`SIMD <desc-valu>` on a CU was actively doing any work, summed over - all CUs. 
Low values (less than 100%) indicate that the accelerator was not - fully saturated by the kernel, or a potential load-imbalance issue. - unit: Percent - Dispatched Workgroups: - plain: The total number of workgroups forming this kernel launch. - rst: The total number of workgroups forming this kernel launch. - unit: Workgroups - Dispatched Wavefronts: - plain: The total number of wavefronts, summed over all workgroups, forming this - kernel launch. - rst: The total number of wavefronts, summed over all workgroups, forming this - kernel launch. - unit: Wavefronts - VGPR Writes: - plain: The average number of cycles spent initializing VGPRs at wave creation. - rst: The average number of cycles spent initializing :ref:`VGPRs <desc-valu>` at - wave creation. - unit: Cycles/wave - SGPR Writes: - plain: The average number of cycles spent initializing SGPRs at wave creation. - rst: The average number of cycles spent initializing :ref:`SGPRs <desc-salu>` at - wave creation. - unit: Cycles/wave - Not-scheduled Rate (Workgroup Manager): - plain: The percent of total scheduler-pipe cycles in the kernel where a workgroup - could not be scheduled to a CU due to a bottleneck within the workgroup manager - rather than a lack of a CU or SIMD with sufficient resources. - rst: |- - The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>` - in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>` - due to a bottleneck within the workgroup manager rather than a lack of a - CU or :ref:`SIMD <desc-valu>` with sufficient resources. Note: this value - is expected to range between 0-25%. See note in :ref:`workgroup manager <desc-spi>` - description. - unit: Percent - Not-scheduled Rate (Scheduler-Pipe): - plain: |- - The percent of total scheduler-pipe cycles in the kernel where a workgroup - could not be scheduled to a CU due to a bottleneck within the scheduler-pipes - rather than a lack of a CU or SIMD with sufficient resources. 
- rst: |- - The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>` - in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>` - due to a bottleneck within the scheduler-pipes rather than a lack of a CU - or :ref:`SIMD <desc-valu>` with sufficient resources. Note: this value is - expected to range between 0-25%, see note in :ref:`workgroup manager <desc-spi>` - description. - unit: Percent - Scheduler-Pipe Stall Rate: - plain: The percent of total scheduler-pipe cycles in the kernel where a workgroup - could not be scheduled to a CU due to occupancy limitations (like a lack of - a CU or SIMD with sufficient resources). - rst: |- - The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>` - in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>` - due to occupancy limitations (like a lack of a CU or :ref:`SIMD <desc-valu>` - with sufficient resources). Note: this value is expected to range between - 0-25%, see note in :ref:`workgroup manager <desc-spi>` description. - unit: Percent - Scratch Stall Rate: - plain: The percent of total shader-engine cycles in the kernel where a workgroup - could not be scheduled to a CU due to lack of private (a.k.a., scratch) memory - slots. While this can reach up to 100%, note that the actual occupancy limitations - on a kernel using private memory are typically quite small (for example, less - than 1% of the total number of waves that can be scheduled to an accelerator). - rst: The percent of :ref:`total shader-engine cycles <total-se-cycles>` in the kernel - where a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due - to lack of :ref:`private (a.k.a., scratch) memory <memory-type>` slots. While - this can reach up to 100%, note that the actual occupancy limitations on - a kernel using private memory are typically quite small (for example, less than - 1% of the total number of waves that can be scheduled to an accelerator). 
- unit: Percent - Insufficient SIMD Waveslots: - plain: The percent of total SIMD cycles in the kernel where a workgroup could - not be scheduled to a SIMD due to lack of available waveslots. - rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel where - a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to lack - of available :ref:`waveslots <desc-valu>`. - unit: Percent - Insufficient SIMD VGPRs: - plain: The percent of total SIMD cycles in the kernel where a workgroup could - not be scheduled to a SIMD due to lack of available VGPRs. - rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel where - a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to lack - of available :ref:`VGPRs <desc-valu>`. - unit: Percent - Insufficient SIMD SGPRs: - plain: The percent of total SIMD cycles in the kernel where a workgroup could - not be scheduled to a SIMD due to lack of available SGPRs. - rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel where - a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>` due to lack - of available :ref:`SGPRs <desc-salu>`. - unit: Percent - Insufficient CU LDS: - plain: The percent of total CU cycles in the kernel where a workgroup could - not be scheduled to a CU due to lack of available LDS. - rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where - a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to lack - of available :doc:`LDS <local-data-share>`. - unit: Percent - Insufficient CU Barriers: - plain: The percent of total CU cycles in the kernel where a workgroup could - not be scheduled to a CU due to lack of available barriers. - rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where - a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to lack - of available :ref:`barriers <desc-barrier>`. 
- unit: Percent - Reached CU Workgroup Limit: - plain: The percent of total CU cycles in the kernel where a workgroup could - not be scheduled to a CU due to limits within the workgroup manager. This - is expected to be always be zero on CDNA2 or newer accelerators (and small - for previous accelerators). - rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where - a workgroup could not be scheduled to a :doc:`CU <compute-unit>` due to limits - within the workgroup manager. This is expected to be always be zero on CDNA2 - or newer accelerators (and small for previous accelerators). - unit: Percent - Reached CU Wavefront Limit: - plain: The percent of total CU cycles in the kernel where a wavefront could - not be scheduled to a CU due to limits within the workgroup manager. This - is expected to be always be zero on CDNA2 or newer accelerators (and small - for previous accelerators). - rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel where - a wavefront could not be scheduled to a :doc:`CU <compute-unit>` due to limits - within the workgroup manager. This is expected to be always be zero on CDNA2 - or newer accelerators (and small for previous accelerators). 
- unit: Percent -- id: 700 - title: Wavefront - data source: - - metric_table: - id: 701 - title: Wavefront Launch Stats - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - metric: - gfx90a: - Grid Size: - avg: AVG(Grid_Size) - min: MIN(Grid_Size) - max: MAX(Grid_Size) - unit: Work Items - Workgroup Size: - avg: AVG(Workgroup_Size) - min: MIN(Workgroup_Size) - max: MAX(Workgroup_Size) - unit: Work Items - Total Wavefronts: - avg: AVG(SPI_CSN_WAVE) - min: MIN(SPI_CSN_WAVE) - max: MAX(SPI_CSN_WAVE) - unit: Wavefronts - Saved Wavefronts: - avg: AVG(SQ_WAVES_SAVED) - min: MIN(SQ_WAVES_SAVED) - max: MAX(SQ_WAVES_SAVED) - unit: Wavefronts - Restored Wavefronts: - avg: AVG(SQ_WAVES_RESTORED) - min: MIN(SQ_WAVES_RESTORED) - max: MAX(SQ_WAVES_RESTORED) - unit: Wavefronts - VGPRs: - avg: AVG(Arch_VGPR) - min: MIN(Arch_VGPR) - max: MAX(Arch_VGPR) - unit: Registers - AGPRs: - avg: AVG(Accum_VGPR) - min: MIN(Accum_VGPR) - max: MAX(Accum_VGPR) - unit: Registers - SGPRs: - avg: AVG(SGPR) - min: MIN(SGPR) - max: MAX(SGPR) - unit: Registers - LDS Allocation: - avg: AVG(LDS_Per_Workgroup) - min: MIN(LDS_Per_Workgroup) - max: MAX(LDS_Per_Workgroup) - unit: Bytes - Scratch Allocation: - avg: AVG(Scratch_Per_Workitem) - min: MIN(Scratch_Per_Workitem) - max: MAX(Scratch_Per_Workitem) - unit: Bytes/Workitem - gfx941: - Grid Size: - avg: AVG(Grid_Size) - min: MIN(Grid_Size) - max: MAX(Grid_Size) - unit: Work Items - Workgroup Size: - avg: AVG(Workgroup_Size) - min: MIN(Workgroup_Size) - max: MAX(Workgroup_Size) - unit: Work Items - Total Wavefronts: - avg: AVG(SPI_CSN_WAVE) - min: MIN(SPI_CSN_WAVE) - max: MAX(SPI_CSN_WAVE) - unit: Wavefronts - Saved Wavefronts: - avg: AVG(SQ_WAVES_SAVED) - min: MIN(SQ_WAVES_SAVED) - max: MAX(SQ_WAVES_SAVED) - unit: Wavefronts - Restored Wavefronts: - avg: AVG(SQ_WAVES_RESTORED) - min: MIN(SQ_WAVES_RESTORED) - max: MAX(SQ_WAVES_RESTORED) - unit: Wavefronts - VGPRs: - avg: AVG(Arch_VGPR) - min: MIN(Arch_VGPR) - max: MAX(Arch_VGPR) - 
unit: Registers - AGPRs: - avg: AVG(Accum_VGPR) - min: MIN(Accum_VGPR) - max: MAX(Accum_VGPR) - unit: Registers - SGPRs: - avg: AVG(SGPR) - min: MIN(SGPR) - max: MAX(SGPR) - unit: Registers - LDS Allocation: - avg: AVG(LDS_Per_Workgroup) - min: MIN(LDS_Per_Workgroup) - max: MAX(LDS_Per_Workgroup) - unit: Bytes - Scratch Allocation: - avg: AVG(Scratch_Per_Workitem) - min: MIN(Scratch_Per_Workitem) - max: MAX(Scratch_Per_Workitem) - unit: Bytes/Workitem - gfx940: - Grid Size: - avg: AVG(Grid_Size) - min: MIN(Grid_Size) - max: MAX(Grid_Size) - unit: Work Items - Workgroup Size: - avg: AVG(Workgroup_Size) - min: MIN(Workgroup_Size) - max: MAX(Workgroup_Size) - unit: Work Items - Total Wavefronts: - avg: AVG(SPI_CSN_WAVE) - min: MIN(SPI_CSN_WAVE) - max: MAX(SPI_CSN_WAVE) - unit: Wavefronts - Saved Wavefronts: - avg: AVG(SQ_WAVES_SAVED) - min: MIN(SQ_WAVES_SAVED) - max: MAX(SQ_WAVES_SAVED) - unit: Wavefronts - Restored Wavefronts: - avg: AVG(SQ_WAVES_RESTORED) - min: MIN(SQ_WAVES_RESTORED) - max: MAX(SQ_WAVES_RESTORED) - unit: Wavefronts - VGPRs: - avg: AVG(Arch_VGPR) - min: MIN(Arch_VGPR) - max: MAX(Arch_VGPR) - unit: Registers - AGPRs: - avg: AVG(Accum_VGPR) - min: MIN(Accum_VGPR) - max: MAX(Accum_VGPR) - unit: Registers - SGPRs: - avg: AVG(SGPR) - min: MIN(SGPR) - max: MAX(SGPR) - unit: Registers - LDS Allocation: - avg: AVG(LDS_Per_Workgroup) - min: MIN(LDS_Per_Workgroup) - max: MAX(LDS_Per_Workgroup) - unit: Bytes - Scratch Allocation: - avg: AVG(Scratch_Per_Workitem) - min: MIN(Scratch_Per_Workitem) - max: MAX(Scratch_Per_Workitem) - unit: Bytes/Workitem - gfx942: - Grid Size: - avg: AVG(Grid_Size) - min: MIN(Grid_Size) - max: MAX(Grid_Size) - unit: Work Items - Workgroup Size: - avg: AVG(Workgroup_Size) - min: MIN(Workgroup_Size) - max: MAX(Workgroup_Size) - unit: Work Items - Total Wavefronts: - avg: AVG(SPI_CSN_WAVE) - min: MIN(SPI_CSN_WAVE) - max: MAX(SPI_CSN_WAVE) - unit: Wavefronts - Saved Wavefronts: - avg: AVG(SQ_WAVES_SAVED) - min: MIN(SQ_WAVES_SAVED) - 
max: MAX(SQ_WAVES_SAVED) - unit: Wavefronts - Restored Wavefronts: - avg: AVG(SQ_WAVES_RESTORED) - min: MIN(SQ_WAVES_RESTORED) - max: MAX(SQ_WAVES_RESTORED) - unit: Wavefronts - VGPRs: - avg: AVG(Arch_VGPR) - min: MIN(Arch_VGPR) - max: MAX(Arch_VGPR) - unit: Registers - AGPRs: - avg: AVG(Accum_VGPR) - min: MIN(Accum_VGPR) - max: MAX(Accum_VGPR) - unit: Registers - SGPRs: - avg: AVG(SGPR) - min: MIN(SGPR) - max: MAX(SGPR) - unit: Registers - LDS Allocation: - avg: AVG(LDS_Per_Workgroup) - min: MIN(LDS_Per_Workgroup) - max: MAX(LDS_Per_Workgroup) - unit: Bytes - Scratch Allocation: - avg: AVG(Scratch_Per_Workitem) - min: MIN(Scratch_Per_Workitem) - max: MAX(Scratch_Per_Workitem) - unit: Bytes/Workitem - gfx950: - Grid Size: - avg: AVG(Grid_Size) - min: MIN(Grid_Size) - max: MAX(Grid_Size) - unit: Work Items - Workgroup Size: - avg: AVG(Workgroup_Size) - min: MIN(Workgroup_Size) - max: MAX(Workgroup_Size) - unit: Work Items - Total Wavefronts: - avg: AVG(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - min: MIN(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - max: MAX(SPI_CS0_WAVE + SPI_CS1_WAVE + SPI_CS2_WAVE + SPI_CS3_WAVE) - unit: Wavefronts - Saved Wavefronts: - avg: AVG(SQ_WAVES_SAVED) - min: MIN(SQ_WAVES_SAVED) - max: MAX(SQ_WAVES_SAVED) - unit: Wavefronts - Restored Wavefronts: - avg: AVG(SQ_WAVES_RESTORED) - min: MIN(SQ_WAVES_RESTORED) - max: MAX(SQ_WAVES_RESTORED) - unit: Wavefronts - VGPRs: - avg: AVG(Arch_VGPR) - min: MIN(Arch_VGPR) - max: MAX(Arch_VGPR) - unit: Registers - AGPRs: - avg: AVG(Accum_VGPR) - min: MIN(Accum_VGPR) - max: MAX(Accum_VGPR) - unit: Registers - SGPRs: - avg: AVG(SGPR) - min: MIN(SGPR) - max: MAX(SGPR) - unit: Registers - LDS Allocation: - avg: AVG(LDS_Per_Workgroup) - min: MIN(LDS_Per_Workgroup) - max: MAX(LDS_Per_Workgroup) - unit: Bytes - Scratch Allocation: - avg: AVG(Scratch_Per_Workitem) - min: MIN(Scratch_Per_Workitem) - max: MAX(Scratch_Per_Workitem) - unit: Bytes/Workitem - gfx908: - Grid Size: - avg: 
AVG(Grid_Size) - min: MIN(Grid_Size) - max: MAX(Grid_Size) - unit: Work Items - Workgroup Size: - avg: AVG(Workgroup_Size) - min: MIN(Workgroup_Size) - max: MAX(Workgroup_Size) - unit: Work Items - Total Wavefronts: - avg: AVG(SPI_CSN_WAVE) - min: MIN(SPI_CSN_WAVE) - max: MAX(SPI_CSN_WAVE) - unit: Wavefronts - Saved Wavefronts: - avg: AVG(SQ_WAVES_SAVED) - min: MIN(SQ_WAVES_SAVED) - max: MAX(SQ_WAVES_SAVED) - unit: Wavefronts - Restored Wavefronts: - avg: AVG(SQ_WAVES_RESTORED) - min: MIN(SQ_WAVES_RESTORED) - max: MAX(SQ_WAVES_RESTORED) - unit: Wavefronts - VGPRs: - avg: AVG(Arch_VGPR) - min: MIN(Arch_VGPR) - max: MAX(Arch_VGPR) - unit: Registers - AGPRs: - avg: AVG(Accum_VGPR) - min: MIN(Accum_VGPR) - max: MAX(Accum_VGPR) - unit: Registers - SGPRs: - avg: AVG(SGPR) - min: MIN(SGPR) - max: MAX(SGPR) - unit: Registers - LDS Allocation: - avg: AVG(LDS_Per_Workgroup) - min: MIN(LDS_Per_Workgroup) - max: MAX(LDS_Per_Workgroup) - unit: Bytes - Scratch Allocation: - avg: AVG(Scratch_Per_Workitem) - min: MIN(Scratch_Per_Workitem) - max: MAX(Scratch_Per_Workitem) - unit: Bytes/Workitem - - metric_table: - id: 702 - title: Wavefront Runtime Stats - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - metric: - gfx90a: - Kernel Time: - avg: AVG((End_Timestamp - Start_Timestamp)) - min: MIN((End_Timestamp - Start_Timestamp)) - max: MAX((End_Timestamp - Start_Timestamp)) - unit: ns - Kernel Time (Cycles): - avg: AVG($GRBM_GUI_ACTIVE_PER_XCD) - min: MIN($GRBM_GUI_ACTIVE_PER_XCD) - max: MAX($GRBM_GUI_ACTIVE_PER_XCD) - unit: Cycle - Instructions per wavefront: - avg: AVG((SQ_INSTS / SQ_WAVES)) - min: MIN((SQ_INSTS / SQ_WAVES)) - max: MAX((SQ_INSTS / SQ_WAVES)) - unit: Instr/wavefront - Wave Cycles: - avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom)) - min: MIN(((4 * SQ_WAVE_CYCLES) / $denom)) - max: MAX(((4 * SQ_WAVE_CYCLES) / $denom)) - unit: (Cycles + $normUnit) - Dependency Wait Cycles: - avg: AVG(((4 * SQ_WAIT_ANY) / $denom)) - min: MIN(((4 * SQ_WAIT_ANY) / 
$denom)) - max: MAX(((4 * SQ_WAIT_ANY) / $denom)) - unit: (Cycles + $normUnit) - Issue Wait Cycles: - avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom)) - min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom)) - max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom)) - unit: (Cycles + $normUnit) - Active Cycles: - avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - unit: (Cycles + $normUnit) - Wavefront Occupancy: - avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - unit: Wavefronts - coll_level: SQ_LEVEL_WAVES - gfx941: - Kernel Time: - avg: AVG((End_Timestamp - Start_Timestamp)) - min: MIN((End_Timestamp - Start_Timestamp)) - max: MAX((End_Timestamp - Start_Timestamp)) - unit: ns - Kernel Time (Cycles): - avg: AVG($GRBM_GUI_ACTIVE_PER_XCD) - min: MIN($GRBM_GUI_ACTIVE_PER_XCD) - max: MAX($GRBM_GUI_ACTIVE_PER_XCD) - unit: Cycle - Instructions per wavefront: - avg: AVG((SQ_INSTS / SQ_WAVES)) - min: MIN((SQ_INSTS / SQ_WAVES)) - max: MAX((SQ_INSTS / SQ_WAVES)) - unit: Instr/wavefront - Wave Cycles: - avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom)) - min: MIN(((4 * SQ_WAVE_CYCLES) / $denom)) - max: MAX(((4 * SQ_WAVE_CYCLES) / $denom)) - unit: (Cycles + $normUnit) - Dependency Wait Cycles: - avg: AVG(((4 * SQ_WAIT_ANY) / $denom)) - min: MIN(((4 * SQ_WAIT_ANY) / $denom)) - max: MAX(((4 * SQ_WAIT_ANY) / $denom)) - unit: (Cycles + $normUnit) - Issue Wait Cycles: - avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom)) - min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom)) - max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom)) - unit: (Cycles + $normUnit) - Active Cycles: - avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - unit: (Cycles + $normUnit) - Wavefront Occupancy: - avg: AVG((SQ_ACCUM_PREV_HIRES / 
$GRBM_GUI_ACTIVE_PER_XCD)) - min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - unit: Wavefronts - coll_level: SQ_LEVEL_WAVES - gfx940: - Kernel Time: - avg: AVG((End_Timestamp - Start_Timestamp)) - min: MIN((End_Timestamp - Start_Timestamp)) - max: MAX((End_Timestamp - Start_Timestamp)) - unit: ns - Kernel Time (Cycles): - avg: AVG($GRBM_GUI_ACTIVE_PER_XCD) - min: MIN($GRBM_GUI_ACTIVE_PER_XCD) - max: MAX($GRBM_GUI_ACTIVE_PER_XCD) - unit: Cycle - Instructions per wavefront: - avg: AVG((SQ_INSTS / SQ_WAVES)) - min: MIN((SQ_INSTS / SQ_WAVES)) - max: MAX((SQ_INSTS / SQ_WAVES)) - unit: Instr/wavefront - Wave Cycles: - avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom)) - min: MIN(((4 * SQ_WAVE_CYCLES) / $denom)) - max: MAX(((4 * SQ_WAVE_CYCLES) / $denom)) - unit: (Cycles + $normUnit) - Dependency Wait Cycles: - avg: AVG(((4 * SQ_WAIT_ANY) / $denom)) - min: MIN(((4 * SQ_WAIT_ANY) / $denom)) - max: MAX(((4 * SQ_WAIT_ANY) / $denom)) - unit: (Cycles + $normUnit) - Issue Wait Cycles: - avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom)) - min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom)) - max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom)) - unit: (Cycles + $normUnit) - Active Cycles: - avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - unit: (Cycles + $normUnit) - Wavefront Occupancy: - avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - unit: Wavefronts - coll_level: SQ_LEVEL_WAVES - gfx942: - Kernel Time: - avg: AVG((End_Timestamp - Start_Timestamp)) - min: MIN((End_Timestamp - Start_Timestamp)) - max: MAX((End_Timestamp - Start_Timestamp)) - unit: ns - Kernel Time (Cycles): - avg: AVG($GRBM_GUI_ACTIVE_PER_XCD) - min: MIN($GRBM_GUI_ACTIVE_PER_XCD) - max: MAX($GRBM_GUI_ACTIVE_PER_XCD) - unit: Cycle - Instructions 
per wavefront: - avg: AVG((SQ_INSTS / SQ_WAVES)) - min: MIN((SQ_INSTS / SQ_WAVES)) - max: MAX((SQ_INSTS / SQ_WAVES)) - unit: Instr/wavefront - Wave Cycles: - avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom)) - min: MIN(((4 * SQ_WAVE_CYCLES) / $denom)) - max: MAX(((4 * SQ_WAVE_CYCLES) / $denom)) - unit: (Cycles + $normUnit) - Dependency Wait Cycles: - avg: AVG(((4 * SQ_WAIT_ANY) / $denom)) - min: MIN(((4 * SQ_WAIT_ANY) / $denom)) - max: MAX(((4 * SQ_WAIT_ANY) / $denom)) - unit: (Cycles + $normUnit) - Issue Wait Cycles: - avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom)) - min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom)) - max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom)) - unit: (Cycles + $normUnit) - Active Cycles: - avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - unit: (Cycles + $normUnit) - Wavefront Occupancy: - avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - unit: Wavefronts - coll_level: SQ_LEVEL_WAVES - gfx950: - Kernel Time: - avg: AVG((End_Timestamp - Start_Timestamp)) - min: MIN((End_Timestamp - Start_Timestamp)) - max: MAX((End_Timestamp - Start_Timestamp)) - unit: ns - Kernel Time (Cycles): - avg: AVG($GRBM_GUI_ACTIVE_PER_XCD) - min: MIN($GRBM_GUI_ACTIVE_PER_XCD) - max: MAX($GRBM_GUI_ACTIVE_PER_XCD) - unit: Cycle - Instructions per wavefront: - avg: AVG((SQ_INSTS / SQ_WAVES)) - min: MIN((SQ_INSTS / SQ_WAVES)) - max: MAX((SQ_INSTS / SQ_WAVES)) - unit: Instr/wavefront - Wave Cycles: - avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom)) - min: MIN(((4 * SQ_WAVE_CYCLES) / $denom)) - max: MAX(((4 * SQ_WAVE_CYCLES) / $denom)) - unit: (Cycles + $normUnit) - Dependency Wait Cycles: - avg: AVG(((4 * SQ_WAIT_ANY) / $denom)) - min: MIN(((4 * SQ_WAIT_ANY) / $denom)) - max: MAX(((4 * SQ_WAIT_ANY) / $denom)) - unit: (Cycles + $normUnit) - Issue Wait Cycles: - avg: AVG(((4 * 
SQ_WAIT_INST_ANY) / $denom)) - min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom)) - max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom)) - unit: (Cycles + $normUnit) - Active Cycles: - avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - unit: (Cycles + $normUnit) - Wavefront Occupancy: - avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - max: MAX((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - unit: Wavefronts - coll_level: SQ_LEVEL_WAVES - gfx908: - Kernel Time: - avg: AVG((End_Timestamp - Start_Timestamp)) - min: MIN((End_Timestamp - Start_Timestamp)) - max: MAX((End_Timestamp - Start_Timestamp)) - unit: ns - Kernel Time (Cycles): - avg: AVG($GRBM_GUI_ACTIVE_PER_XCD) - min: MIN($GRBM_GUI_ACTIVE_PER_XCD) - max: MAX($GRBM_GUI_ACTIVE_PER_XCD) - unit: Cycle - Instructions per wavefront: - avg: AVG((SQ_INSTS / SQ_WAVES)) - min: MIN((SQ_INSTS / SQ_WAVES)) - max: MAX((SQ_INSTS / SQ_WAVES)) - unit: Instr/wavefront - Wave Cycles: - avg: AVG(((4 * SQ_WAVE_CYCLES) / $denom)) - min: MIN(((4 * SQ_WAVE_CYCLES) / $denom)) - max: MAX(((4 * SQ_WAVE_CYCLES) / $denom)) - unit: (Cycles + $normUnit) - Dependency Wait Cycles: - avg: AVG(((4 * SQ_WAIT_ANY) / $denom)) - min: MIN(((4 * SQ_WAIT_ANY) / $denom)) - max: MAX(((4 * SQ_WAIT_ANY) / $denom)) - unit: (Cycles + $normUnit) - Issue Wait Cycles: - avg: AVG(((4 * SQ_WAIT_INST_ANY) / $denom)) - min: MIN(((4 * SQ_WAIT_INST_ANY) / $denom)) - max: MAX(((4 * SQ_WAIT_INST_ANY) / $denom)) - unit: (Cycles + $normUnit) - Active Cycles: - avg: AVG(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - min: MIN(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - max: MAX(((4 * SQ_ACTIVE_INST_ANY) / $denom)) - unit: (Cycles + $normUnit) - Wavefront Occupancy: - avg: AVG((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - min: MIN((SQ_ACCUM_PREV_HIRES / $GRBM_GUI_ACTIVE_PER_XCD)) - max: MAX((SQ_ACCUM_PREV_HIRES / 
$GRBM_GUI_ACTIVE_PER_XCD)) - unit: Wavefronts - coll_level: SQ_LEVEL_WAVES - metrics_description: - Grid Size: - plain: The total number of work-items (or, threads) launched as a part of the - kernel dispatch. In HIP, this is equivalent to the total grid size multiplied - by the total workgroup (or, block) size. - rst: The total number of work-items (or, threads) launched as a part of the - kernel dispatch. In HIP, this is equivalent to the total grid size multiplied - by the total workgroup (or, block) size. - unit: Work-Items - Workgroup Size: - plain: The total number of work-items (or, threads) in each workgroup (or, block) - launched as part of the kernel dispatch. In HIP, this is equivalent to the - total block size. - rst: The total number of work-items (or, threads) in each workgroup (or, block) - launched as part of the kernel dispatch. In HIP, this is equivalent to the - total block size. - unit: Work-Items - Total Wavefronts: - plain: |- - The total number of wavefronts launched as part of the kernel dispatch. - On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront - size is always 64 work-items. Thus, the total number of wavefronts should - be equivalent to the ceiling of grid size divided by 64. - rst: |- - The total number of wavefronts launched as part of the kernel dispatch. - On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront - size is always 64 work-items. Thus, the total number of wavefronts should - be equivalent to the ceiling of grid size divided by 64. - unit: Wavefronts - Saved Wavefronts: - plain: The total number of wavefronts saved at a context-save. - rst: The total number of wavefronts saved at a context-save. See `cwsr_enable - <https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr>`_. - unit: Wavefronts - Restored Wavefronts: - plain: The total number of wavefronts restored from a context-save. - rst: The total number of wavefronts restored from a context-save. 
See `cwsr_enable - <https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr>`_. - unit: Wavefronts - VGPRs: - plain: |- - The number of architected vector general-purpose registers allocated - for the kernel, see VALU. Note: this may not exactly match the number of VGPRs - requested by the compiler due to allocation granularity. - rst: |- - The number of architected vector general-purpose registers allocated for the - kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the - number of VGPRs requested by the compiler due to allocation granularity. - unit: VGPRs - AGPRs: - plain: |- - The number of accumulation vector general-purpose registers allocated - for the kernel, see AGPRs. Note: this may not exactly match the number of - AGPRs requested by the compiler due to allocation granularity. - rst: |- - The number of accumulation vector general-purpose registers allocated - for the kernel, see :ref:`AGPRs <desc-agprs>`. Note: this may not exactly match - the number of AGPRs requested by the compiler due to allocation granularity. - unit: AGPRs - SGPRs: - plain: |- - The number of scalar general-purpose registers allocated for the kernel, - see SALU. Note: this may not exactly match the number of SGPRs requested by - the compiler due to allocation granularity. - rst: |- - The number of scalar general-purpose registers allocated for the kernel, see - :ref:`SALU <desc-salu>`. Note: this may not exactly match the number of - SGPRs requested by the compiler due to allocation granularity. - unit: SGPRs - LDS Allocation: - plain: |- - The number of bytes of LDS memory (or, shared memory) allocated for - this kernel. Note: This may also be larger than what was requested at compile - time due to both allocation granularity and dynamic per-dispatch LDS allocations. - rst: |- - The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory) - allocated for this kernel. 
Note: This may also be larger than what was requested - at compile time due to both allocation granularity and dynamic per-dispatch - LDS allocations. - unit: Bytes per workgroup - Scratch Allocation: - plain: The number of bytes of scratch memory requested per work-item for this - kernel. Scratch memory is used for stack memory on the accelerator, as well - as for register spills and restores. - rst: The number of bytes of :ref:`scratch memory <memory-spaces>` requested per - work-item for this kernel. Scratch memory is used for stack memory on the - accelerator, as well as for register spills and restores. - unit: Bytes per work-item - Kernel Time: - plain: The total duration of the executed kernel. - rst: The total duration of the executed kernel. - unit: Nanoseconds - Kernel Time (Cycles): - plain: The total duration of the executed kernel in cycles. - rst: The total duration of the executed kernel in cycles. - unit: Cycles - Instructions per wavefront: - plain: The average number of instructions (of all types) executed per wavefront. - This is averaged over all wavefronts in a kernel dispatch. - rst: The average number of instructions (of all types) executed per wavefront. - This is averaged over all wavefronts in a kernel dispatch. - unit: Instructions per wavefront - Wave Cycles: - plain: The number of cycles a wavefront in the kernel dispatch spent resident - on a compute unit per normalization unit. This is averaged over all wavefronts - in a kernel dispatch. - rst: |- - The number of cycles a wavefront in the kernel dispatch spent resident - on a compute unit per :ref:`normalization unit <normalization-units>`. This is - averaged over all wavefronts in a kernel dispatch. Note: this should not - be directly compared to the kernel cycles above. - unit: Cycles per normalization unit - Dependency Wait Cycles: - plain: The number of cycles a wavefront in the kernel dispatch spent resident - on a compute unit per normalization unit. 
This is averaged over all wavefronts - in a kernel dispatch. - rst: The number of cycles a wavefront in the kernel dispatch stalled waiting on - memory of any kind (e.g., instruction fetch, vector or scalar memory, etc.) - per :ref:`normalization unit <normalization-units>`. This counter is incremented - at every cycle by *all* wavefronts on a CU stalled at a memory operation. As - such, it is most useful to get a sense of how waves were spending their time, - rather than identification of a precise limiter because another wave could - be actively executing while a wave is stalled. The sum of this metric, Issue - Wait Cycles and Active Cycles should be equal to the total Wave Cycles metric. - unit: Cycles per normalization unit - Issue Wait Cycles: - plain: The number of cycles a wavefront in the kernel dispatch was unable to - issue an instruction for any reason (e.g., execution pipe back-pressure, arbitration - loss, etc.) per normalization unit. This counter is incremented at every cycle - by all wavefronts on a CU unable to issue an instruction. As such, it is most - useful to get a sense of how waves were spending their time, rather than identification - of a precise limiter because another wave could be actively executing while - a wave is issue stalled. The sum of this metric, Dependency Wait Cycles and - Active Cycles should be equal to the total Wave Cycles metric. - rst: The number of cycles a wavefront in the kernel dispatch was unable to issue - an instruction for any reason (e.g., execution pipe back-pressure, arbitration - loss, etc.) per :ref:`normalization unit <normalization-units>`. This counter - is incremented at every cycle by *all* wavefronts on a CU unable to issue - an instruction. As such, it is most useful to get a sense of how waves were spending - their time, rather than identification of a precise limiter because another - wave could be actively executing while a wave is issue stalled. 
The sum - of this metric, Dependency Wait Cycles and Active Cycles should be equal - to the total Wave Cycles metric. - unit: Cycles per normalization unit - Active Cycles: - plain: The average number of cycles a wavefront in the kernel dispatch was actively - executing instructions per normalization unit. This measurement is made on - a per-wavefront basis, and may include cycles that another wavefront spent - actively executing (on another execution unit, for example) or was stalled. - As such, it is most useful to get a sense of how waves were spending their - time, rather than identification of a precise limiter. The sum of this metric, - Issue Wait Cycles and Active Wait Cycles should be equal to the total Wave - Cycles metric. - rst: The average number of cycles a wavefront in the kernel dispatch was actively - executing instructions per :ref:`normalization unit <normalization-units>`. - This measurement is made on a per-wavefront basis, and may include cycles - that another wavefront spent actively executing (on another execution unit, - for example) or was stalled. As such, it is most useful to get a sense of - how waves were spending their time, rather than identification of a precise - limiter. The sum of this metric, Issue Wait Cycles and Active Wait Cycles - should be equal to the total Wave Cycles metric. - unit: Cycles per normalization unit - Wavefront Occupancy: - plain: |- - The time-averaged number of wavefronts resident on the accelerator over - the lifetime of the kernel. Note: this metric may be inaccurate for short-running - kernels (less than 1ms). - rst: |- - The time-averaged number of wavefronts resident on the accelerator over the - lifetime of the kernel. Note: this metric may be inaccurate for short-running - kernels (less than 1ms). 
- unit: Wavefronts -- id: 1000 - title: Compute Units - Instruction Mix - data source: - - metric_table: - id: 1001 - title: Overall Instruction Mix - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - metric: - gfx90a: - VALU: - avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) - min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) - max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) - unit: (instr + $normUnit) - VMEM: - avg: AVG(((SQ_INSTS_VMEM - SQ_INSTS_FLAT_LDS_ONLY) / $denom)) - min: MIN(((SQ_INSTS_VMEM - SQ_INSTS_FLAT_LDS_ONLY) / $denom)) - max: MAX(((SQ_INSTS_VMEM - SQ_INSTS_FLAT_LDS_ONLY) / $denom)) - unit: (instr + $normUnit) - LDS: - avg: AVG((SQ_INSTS_LDS / $denom)) - min: MIN((SQ_INSTS_LDS / $denom)) - max: MAX((SQ_INSTS_LDS / $denom)) - unit: (instr + $normUnit) - MFMA: - avg: AVG((SQ_INSTS_MFMA / $denom)) - min: MIN((SQ_INSTS_MFMA / $denom)) - max: MAX((SQ_INSTS_MFMA / $denom)) - unit: (instr + $normUnit) - SALU: - avg: AVG((SQ_INSTS_SALU / $denom)) - min: MIN((SQ_INSTS_SALU / $denom)) - max: MAX((SQ_INSTS_SALU / $denom)) - unit: (instr + $normUnit) - SMEM: - avg: AVG((SQ_INSTS_SMEM / $denom)) - min: MIN((SQ_INSTS_SMEM / $denom)) - max: MAX((SQ_INSTS_SMEM / $denom)) - unit: (instr + $normUnit) - Branch: - avg: AVG((SQ_INSTS_BRANCH / $denom)) - min: MIN((SQ_INSTS_BRANCH / $denom)) - max: MAX((SQ_INSTS_BRANCH / $denom)) - unit: (instr + $normUnit) - gfx941: - VALU: - avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) - min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) - max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) - unit: (instr + $normUnit) - VMEM: - avg: AVG(((SQ_INSTS_VMEM) / $denom)) - min: MIN(((SQ_INSTS_VMEM) / $denom)) - max: MAX(((SQ_INSTS_VMEM) / $denom)) - unit: (instr + $normUnit) - LDS: - avg: AVG((SQ_INSTS_LDS / $denom)) - min: MIN((SQ_INSTS_LDS / $denom)) - max: MAX((SQ_INSTS_LDS / $denom)) - unit: (instr + $normUnit) - MFMA: - avg: AVG((SQ_INSTS_MFMA / $denom)) - min: MIN((SQ_INSTS_MFMA / $denom)) 
- max: MAX((SQ_INSTS_MFMA / $denom)) - unit: (instr + $normUnit) - SALU: - avg: AVG((SQ_INSTS_SALU / $denom)) - min: MIN((SQ_INSTS_SALU / $denom)) - max: MAX((SQ_INSTS_SALU / $denom)) - unit: (instr + $normUnit) - SMEM: - avg: AVG((SQ_INSTS_SMEM / $denom)) - min: MIN((SQ_INSTS_SMEM / $denom)) - max: MAX((SQ_INSTS_SMEM / $denom)) - unit: (instr + $normUnit) - Branch: - avg: AVG((SQ_INSTS_BRANCH / $denom)) - min: MIN((SQ_INSTS_BRANCH / $denom)) - max: MAX((SQ_INSTS_BRANCH / $denom)) - unit: (instr + $normUnit) - gfx940: - VALU: - avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) - min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) - max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) - unit: (instr + $normUnit) - VMEM: - avg: AVG(((SQ_INSTS_VMEM) / $denom)) - min: MIN(((SQ_INSTS_VMEM) / $denom)) - max: MAX(((SQ_INSTS_VMEM) / $denom)) - unit: (instr + $normUnit) - LDS: - avg: AVG((SQ_INSTS_LDS / $denom)) - min: MIN((SQ_INSTS_LDS / $denom)) - max: MAX((SQ_INSTS_LDS / $denom)) - unit: (instr + $normUnit) - MFMA: - avg: AVG((SQ_INSTS_MFMA / $denom)) - min: MIN((SQ_INSTS_MFMA / $denom)) - max: MAX((SQ_INSTS_MFMA / $denom)) - unit: (instr + $normUnit) - SALU: - avg: AVG((SQ_INSTS_SALU / $denom)) - min: MIN((SQ_INSTS_SALU / $denom)) - max: MAX((SQ_INSTS_SALU / $denom)) - unit: (instr + $normUnit) - SMEM: - avg: AVG((SQ_INSTS_SMEM / $denom)) - min: MIN((SQ_INSTS_SMEM / $denom)) - max: MAX((SQ_INSTS_SMEM / $denom)) - unit: (instr + $normUnit) - Branch: - avg: AVG((SQ_INSTS_BRANCH / $denom)) - min: MIN((SQ_INSTS_BRANCH / $denom)) - max: MAX((SQ_INSTS_BRANCH / $denom)) - unit: (instr + $normUnit) - gfx942: - VALU: - avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) - min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) - max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) - unit: (instr + $normUnit) - VMEM: - avg: AVG(((SQ_INSTS_VMEM) / $denom)) - min: MIN(((SQ_INSTS_VMEM) / $denom)) - max: MAX(((SQ_INSTS_VMEM) / $denom)) - unit: (instr + $normUnit) - LDS: - avg: 
AVG((SQ_INSTS_LDS / $denom)) - min: MIN((SQ_INSTS_LDS / $denom)) - max: MAX((SQ_INSTS_LDS / $denom)) - unit: (instr + $normUnit) - MFMA: - avg: AVG((SQ_INSTS_MFMA / $denom)) - min: MIN((SQ_INSTS_MFMA / $denom)) - max: MAX((SQ_INSTS_MFMA / $denom)) - unit: (instr + $normUnit) - SALU: - avg: AVG((SQ_INSTS_SALU / $denom)) - min: MIN((SQ_INSTS_SALU / $denom)) - max: MAX((SQ_INSTS_SALU / $denom)) - unit: (instr + $normUnit) - SMEM: - avg: AVG((SQ_INSTS_SMEM / $denom)) - min: MIN((SQ_INSTS_SMEM / $denom)) - max: MAX((SQ_INSTS_SMEM / $denom)) - unit: (instr + $normUnit) - Branch: - avg: AVG((SQ_INSTS_BRANCH / $denom)) - min: MIN((SQ_INSTS_BRANCH / $denom)) - max: MAX((SQ_INSTS_BRANCH / $denom)) - unit: (instr + $normUnit) - gfx950: - VALU: - avg: AVG(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) - min: MIN(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) - max: MAX(((SQ_INSTS_VALU - SQ_INSTS_MFMA) / $denom)) - unit: (instr + $normUnit) - VMEM: - avg: AVG(((SQ_INSTS_VMEM) / $denom)) - min: MIN(((SQ_INSTS_VMEM) / $denom)) - max: MAX(((SQ_INSTS_VMEM) / $denom)) - unit: (instr + $normUnit) - LDS: - avg: AVG((SQ_INSTS_LDS / $denom)) - min: MIN((SQ_INSTS_LDS / $denom)) - max: MAX((SQ_INSTS_LDS / $denom)) - unit: (instr + $normUnit) - MFMA: - avg: AVG((SQ_INSTS_MFMA / $denom)) - min: MIN((SQ_INSTS_MFMA / $denom)) - max: MAX((SQ_INSTS_MFMA / $denom)) - unit: (instr + $normUnit) - SALU: - avg: AVG((SQ_INSTS_SALU / $denom)) - min: MIN((SQ_INSTS_SALU / $denom)) - max: MAX((SQ_INSTS_SALU / $denom)) - unit: (instr + $normUnit) - SMEM: - avg: AVG((SQ_INSTS_SMEM / $denom)) - min: MIN((SQ_INSTS_SMEM / $denom)) - max: MAX((SQ_INSTS_SMEM / $denom)) - unit: (instr + $normUnit) - Branch: - avg: AVG((SQ_INSTS_BRANCH / $denom)) - min: MIN((SQ_INSTS_BRANCH / $denom)) - max: MAX((SQ_INSTS_BRANCH / $denom)) - unit: (instr + $normUnit) - gfx908: - LDS: - avg: AVG((SQ_INSTS_LDS / $denom)) - min: MIN((SQ_INSTS_LDS / $denom)) - max: MAX((SQ_INSTS_LDS / $denom)) - unit: (instr + $normUnit) - SALU: - avg: 
AVG((SQ_INSTS_SALU / $denom)) - min: MIN((SQ_INSTS_SALU / $denom)) - max: MAX((SQ_INSTS_SALU / $denom)) - unit: (instr + $normUnit) - SMEM: - avg: AVG((SQ_INSTS_SMEM / $denom)) - min: MIN((SQ_INSTS_SMEM / $denom)) - max: MAX((SQ_INSTS_SMEM / $denom)) - unit: (instr + $normUnit) - Branch: - avg: AVG((SQ_INSTS_BRANCH / $denom)) - min: MIN((SQ_INSTS_BRANCH / $denom)) - max: MAX((SQ_INSTS_BRANCH / $denom)) - unit: (instr + $normUnit) - - metric_table: - id: 1002 - title: VALU Arithmetic Instruction Mix - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - metric: - gfx90a: - INT32: - avg: AVG((SQ_INSTS_VALU_INT32 / $denom)) - min: MIN((SQ_INSTS_VALU_INT32 / $denom)) - max: MAX((SQ_INSTS_VALU_INT32 / $denom)) - unit: (instr + $normUnit) - INT64: - avg: AVG((SQ_INSTS_VALU_INT64 / $denom)) - min: MIN((SQ_INSTS_VALU_INT64 / $denom)) - max: MAX((SQ_INSTS_VALU_INT64 / $denom)) - unit: (instr + $normUnit) - F16-ADD: - avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom)) - unit: (instr + $normUnit) - F16-MUL: - avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom)) - unit: (instr + $normUnit) - F16-FMA: - avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom)) - unit: (instr + $normUnit) - F16-Trans: - avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom)) - unit: (instr + $normUnit) - F32-ADD: - avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom)) - unit: (instr + $normUnit) - F32-MUL: - avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom)) - unit: (instr + $normUnit) - F32-FMA: - avg: 
AVG((SQ_INSTS_VALU_FMA_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom)) - unit: (instr + $normUnit) - F32-Trans: - avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom)) - unit: (instr + $normUnit) - F64-ADD: - avg: AVG((SQ_INSTS_VALU_ADD_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom)) - unit: (instr + $normUnit) - F64-MUL: - avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom)) - unit: (instr + $normUnit) - F64-FMA: - avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom)) - unit: (instr + $normUnit) - F64-Trans: - avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom)) - unit: (instr + $normUnit) - Conversion: - avg: AVG((SQ_INSTS_VALU_CVT / $denom)) - min: MIN((SQ_INSTS_VALU_CVT / $denom)) - max: MAX((SQ_INSTS_VALU_CVT / $denom)) - unit: (instr + $normUnit) - gfx941: - INT32: - avg: AVG((SQ_INSTS_VALU_INT32 / $denom)) - min: MIN((SQ_INSTS_VALU_INT32 / $denom)) - max: MAX((SQ_INSTS_VALU_INT32 / $denom)) - unit: (instr + $normUnit) - INT64: - avg: AVG((SQ_INSTS_VALU_INT64 / $denom)) - min: MIN((SQ_INSTS_VALU_INT64 / $denom)) - max: MAX((SQ_INSTS_VALU_INT64 / $denom)) - unit: (instr + $normUnit) - F16-ADD: - avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom)) - unit: (instr + $normUnit) - F16-MUL: - avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom)) - unit: (instr + $normUnit) - F16-FMA: - avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom)) - 
max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom)) - unit: (instr + $normUnit) - F16-Trans: - avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom)) - unit: (instr + $normUnit) - F32-ADD: - avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom)) - unit: (instr + $normUnit) - F32-MUL: - avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom)) - unit: (instr + $normUnit) - F32-FMA: - avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom)) - unit: (instr + $normUnit) - F32-Trans: - avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom)) - unit: (instr + $normUnit) - F64-ADD: - avg: AVG((SQ_INSTS_VALU_ADD_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom)) - unit: (instr + $normUnit) - F64-MUL: - avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom)) - unit: (instr + $normUnit) - F64-FMA: - avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom)) - unit: (instr + $normUnit) - F64-Trans: - avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom)) - unit: (instr + $normUnit) - Conversion: - avg: AVG((SQ_INSTS_VALU_CVT / $denom)) - min: MIN((SQ_INSTS_VALU_CVT / $denom)) - max: MAX((SQ_INSTS_VALU_CVT / $denom)) - unit: (instr + $normUnit) - gfx940: - INT32: - avg: AVG((SQ_INSTS_VALU_INT32 / $denom)) - min: MIN((SQ_INSTS_VALU_INT32 / $denom)) - max: MAX((SQ_INSTS_VALU_INT32 / $denom)) - unit: (instr + 
$normUnit) - INT64: - avg: AVG((SQ_INSTS_VALU_INT64 / $denom)) - min: MIN((SQ_INSTS_VALU_INT64 / $denom)) - max: MAX((SQ_INSTS_VALU_INT64 / $denom)) - unit: (instr + $normUnit) - F16-ADD: - avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom)) - unit: (instr + $normUnit) - F16-MUL: - avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom)) - unit: (instr + $normUnit) - F16-FMA: - avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom)) - unit: (instr + $normUnit) - F16-Trans: - avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom)) - unit: (instr + $normUnit) - F32-ADD: - avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom)) - unit: (instr + $normUnit) - F32-MUL: - avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom)) - unit: (instr + $normUnit) - F32-FMA: - avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom)) - unit: (instr + $normUnit) - F32-Trans: - avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom)) - unit: (instr + $normUnit) - F64-ADD: - avg: AVG((SQ_INSTS_VALU_ADD_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom)) - unit: (instr + $normUnit) - F64-MUL: - avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom)) - unit: (instr + $normUnit) - F64-FMA: - avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom)) - min: 
MIN((SQ_INSTS_VALU_FMA_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom)) - unit: (instr + $normUnit) - F64-Trans: - avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom)) - unit: (instr + $normUnit) - Conversion: - avg: AVG((SQ_INSTS_VALU_CVT / $denom)) - min: MIN((SQ_INSTS_VALU_CVT / $denom)) - max: MAX((SQ_INSTS_VALU_CVT / $denom)) - unit: (instr + $normUnit) - gfx942: - INT32: - avg: AVG((SQ_INSTS_VALU_INT32 / $denom)) - min: MIN((SQ_INSTS_VALU_INT32 / $denom)) - max: MAX((SQ_INSTS_VALU_INT32 / $denom)) - unit: (instr + $normUnit) - INT64: - avg: AVG((SQ_INSTS_VALU_INT64 / $denom)) - min: MIN((SQ_INSTS_VALU_INT64 / $denom)) - max: MAX((SQ_INSTS_VALU_INT64 / $denom)) - unit: (instr + $normUnit) - F16-ADD: - avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom)) - unit: (instr + $normUnit) - F16-MUL: - avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom)) - unit: (instr + $normUnit) - F16-FMA: - avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom)) - unit: (instr + $normUnit) - F16-Trans: - avg: AVG((SQ_INSTS_VALU_TRANS_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom)) - unit: (instr + $normUnit) - F32-ADD: - avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom)) - unit: (instr + $normUnit) - F32-MUL: - avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom)) - unit: (instr + $normUnit) - F32-FMA: - avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom)) - 
unit: (instr + $normUnit) - F32-Trans: - avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom)) - unit: (instr + $normUnit) - F64-ADD: - avg: AVG((SQ_INSTS_VALU_ADD_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom)) - unit: (instr + $normUnit) - F64-MUL: - avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom)) - unit: (instr + $normUnit) - F64-FMA: - avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom)) - unit: (instr + $normUnit) - F64-Trans: - avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom)) - unit: (instr + $normUnit) - Conversion: - avg: AVG((SQ_INSTS_VALU_CVT / $denom)) - min: MIN((SQ_INSTS_VALU_CVT / $denom)) - max: MAX((SQ_INSTS_VALU_CVT / $denom)) - unit: (instr + $normUnit) - gfx950: - INT32: - avg: AVG((SQ_INSTS_VALU_INT32 / $denom)) - min: MIN((SQ_INSTS_VALU_INT32 / $denom)) - max: MAX((SQ_INSTS_VALU_INT32 / $denom)) - unit: (instr + $normUnit) - INT64: - avg: AVG((SQ_INSTS_VALU_INT64 / $denom)) - min: MIN((SQ_INSTS_VALU_INT64 / $denom)) - max: MAX((SQ_INSTS_VALU_INT64 / $denom)) - unit: (instr + $normUnit) - F16-ADD: - avg: AVG((SQ_INSTS_VALU_ADD_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_ADD_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_ADD_F16 / $denom)) - unit: (instr + $normUnit) - F16-MUL: - avg: AVG((SQ_INSTS_VALU_MUL_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_MUL_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_MUL_F16 / $denom)) - unit: (instr + $normUnit) - F16-FMA: - avg: AVG((SQ_INSTS_VALU_FMA_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_FMA_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_FMA_F16 / $denom)) - unit: (instr + $normUnit) - F16-Trans: - avg: AVG((SQ_INSTS_VALU_TRANS_F16 / 
$denom)) - min: MIN((SQ_INSTS_VALU_TRANS_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_TRANS_F16 / $denom)) - unit: (instr + $normUnit) - F32-ADD: - avg: AVG((SQ_INSTS_VALU_ADD_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_ADD_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_ADD_F32 / $denom)) - unit: (instr + $normUnit) - F32-MUL: - avg: AVG((SQ_INSTS_VALU_MUL_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_MUL_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_MUL_F32 / $denom)) - unit: (instr + $normUnit) - F32-FMA: - avg: AVG((SQ_INSTS_VALU_FMA_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_FMA_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_FMA_F32 / $denom)) - unit: (instr + $normUnit) - F32-Trans: - avg: AVG((SQ_INSTS_VALU_TRANS_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_TRANS_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_TRANS_F32 / $denom)) - unit: (instr + $normUnit) - F64-ADD: - avg: AVG((SQ_INSTS_VALU_ADD_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_ADD_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_ADD_F64 / $denom)) - unit: (instr + $normUnit) - F64-MUL: - avg: AVG((SQ_INSTS_VALU_MUL_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_MUL_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_MUL_F64 / $denom)) - unit: (instr + $normUnit) - F64-FMA: - avg: AVG((SQ_INSTS_VALU_FMA_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_FMA_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_FMA_F64 / $denom)) - unit: (instr + $normUnit) - F64-Trans: - avg: AVG((SQ_INSTS_VALU_TRANS_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_TRANS_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_TRANS_F64 / $denom)) - unit: (instr + $normUnit) - Conversion: - avg: AVG((SQ_INSTS_VALU_CVT / $denom)) - min: MIN((SQ_INSTS_VALU_CVT / $denom)) - max: MAX((SQ_INSTS_VALU_CVT / $denom)) - unit: (instr + $normUnit) - gfx908: {} - - metric_table: - id: 1003 - title: VMEM Instruction Mix - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - metric: - gfx90a: - Global/Generic Instr: - avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) - max: 
MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Global/Generic Read: - avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Global/Generic Write: - avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Global/Generic Atomic: - avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Spill/Stack Instr: - avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Spill/Stack Read: - avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Spill/Stack Write: - avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Spill/Stack Atomic: - avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - gfx941: - Global/Generic Instr: - avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Global/Generic Read: - avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Global/Generic Write: - avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - 
min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Global/Generic Atomic: - avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Spill/Stack Instr: - avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Spill/Stack Read: - avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Spill/Stack Write: - avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Spill/Stack Atomic: - avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - gfx940: - Global/Generic Instr: - avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Global/Generic Read: - avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Global/Generic Write: - avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Global/Generic Atomic: - avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - 
Spill/Stack Instr: - avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Spill/Stack Read: - avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Spill/Stack Write: - avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Spill/Stack Atomic: - avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - gfx942: - Global/Generic Instr: - avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Global/Generic Read: - avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Global/Generic Write: - avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Global/Generic Atomic: - avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Spill/Stack Instr: - avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Spill/Stack Read: - avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - max: 
MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Spill/Stack Write: - avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Spill/Stack Atomic: - avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - gfx950: - Global/Generic Instr: - avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Global/Generic Read: - avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Global/Generic Write: - avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Global/Generic Atomic: - avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Spill/Stack Instr: - avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Spill/Stack Coalesceable Instr: - avg: AVG((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_COALESCEABLE_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Spill/Stack Read: - avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Spill/Stack Write: - avg: 
AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Spill/Stack Atomic: - avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - gfx908: - Global/Generic Instr: - avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Global/Generic Read: - avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Global/Generic Write: - avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Global/Generic Atomic: - avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Spill/Stack Instr: - avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Spill/Stack Read: - avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Spill/Stack Write: - avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - Spill/Stack Atomic: - avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - max: 
MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (instr + $normUnit) - - metric_table: - id: 1004 - title: MFMA Arithmetic Instruction Mix - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - metric: - gfx90a: - MFMA-I8: - avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom)) - unit: (instr + $normUnit) - MFMA-F16: - avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom)) - unit: (instr + $normUnit) - MFMA-BF16: - avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom)) - unit: (instr + $normUnit) - MFMA-F32: - avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom)) - unit: (instr + $normUnit) - MFMA-F64: - avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom)) - unit: (instr + $normUnit) - gfx941: - MFMA-I8: - avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom)) - unit: (instr + $normUnit) - MFMA-F8: - avg: AVG((SQ_INSTS_VALU_MFMA_F8 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F8 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom)) - unit: (instr + $normUnit) - MFMA-F16: - avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom)) - unit: (instr + $normUnit) - MFMA-BF16: - avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom)) - unit: (instr + $normUnit) - MFMA-F32: - avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom)) - 
unit: (instr + $normUnit) - MFMA-F64: - avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom)) - unit: (instr + $normUnit) - gfx940: - MFMA-I8: - avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom)) - unit: (instr + $normUnit) - MFMA-F8: - avg: AVG((SQ_INSTS_VALU_MFMA_F8 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F8 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom)) - unit: (instr + $normUnit) - MFMA-F16: - avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom)) - unit: (instr + $normUnit) - MFMA-BF16: - avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom)) - unit: (instr + $normUnit) - MFMA-F32: - avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom)) - unit: (instr + $normUnit) - MFMA-F64: - avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom)) - unit: (instr + $normUnit) - gfx942: - MFMA-I8: - avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom)) - unit: (instr + $normUnit) - MFMA-F8: - avg: AVG((SQ_INSTS_VALU_MFMA_F8 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F8 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom)) - unit: (instr + $normUnit) - MFMA-F16: - avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom)) - unit: (instr + $normUnit) - MFMA-BF16: - avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom)) - unit: (instr + $normUnit) - 
MFMA-F32: - avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom)) - unit: (instr + $normUnit) - MFMA-F64: - avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom)) - unit: (instr + $normUnit) - gfx950: - MFMA-I8: - avg: AVG((SQ_INSTS_VALU_MFMA_I8 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_I8 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_I8 / $denom)) - unit: (instr + $normUnit) - MFMA-F8: - avg: AVG((SQ_INSTS_VALU_MFMA_F8 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F8 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F8 / $denom)) - unit: (instr + $normUnit) - MFMA-F16: - avg: AVG((SQ_INSTS_VALU_MFMA_F16 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F16 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F16 / $denom)) - unit: (instr + $normUnit) - MFMA-BF16: - avg: AVG((SQ_INSTS_VALU_MFMA_BF16 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_BF16 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_BF16 / $denom)) - unit: (instr + $normUnit) - MFMA-F32: - avg: AVG((SQ_INSTS_VALU_MFMA_F32 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F32 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F32 / $denom)) - unit: (instr + $normUnit) - MFMA-F64: - avg: AVG((SQ_INSTS_VALU_MFMA_F64 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F64 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F64 / $denom)) - unit: (instr + $normUnit) - MFMA-F6F4: - avg: AVG((SQ_INSTS_VALU_MFMA_F6F4 / $denom)) - min: MIN((SQ_INSTS_VALU_MFMA_F6F4 / $denom)) - max: MAX((SQ_INSTS_VALU_MFMA_F6F4 / $denom)) - unit: (instr + $normUnit) - gfx908: {} - metrics_description: - VALU: - plain: The total number of vector arithmetic logic unit (VALU) operations issued. 
- These are the workhorses of the compute unit, and are used to execute a wide - range of instruction types including floating point operations, non-uniform - address calculations, transcendental operations, integer operations, shifts, - conditional evaluation, etc. - rst: The total number of vector arithmetic logic unit (VALU) operations issued. - These are the workhorses of the :doc:`compute unit <compute-unit>`, and are - used to execute a wide range of instruction types including floating point - operations, non-uniform address calculations, transcendental operations, - integer operations, shifts, conditional evaluation, etc. - unit: Instructions - VMEM: - plain: The total number of vector memory operations issued. These include most - loads, stores and atomic operations and all accesses to generic, global, private - and texture memory. - rst: The total number of vector memory operations issued. These include most loads, - stores and atomic operations and all accesses to :ref:`generic, global, private - and texture <memory-spaces>` memory. - unit: Instructions - LDS: - plain: The total number of LDS (also known as shared memory) operations issued. - These include loads, stores, atomics, and HIP's __shfl operations. - rst: The total number of LDS (also known as shared memory) operations issued. These - include loads, stores, atomics, and HIP's ``__shfl`` operations. - unit: Instructions - MFMA: - plain: The total number of matrix fused multiply-add instructions issued. - rst: The total number of matrix fused multiply-add instructions issued. - unit: Instructions - SALU: - plain: The total number of scalar arithmetic logic unit (SALU) operations issued. - Typically these are used for address calculations, literal constants, and - other operations that are provably uniform across a wavefront. Although scalar - memory (SMEM) operations are issued by the SALU, they are counted separately - in this section. 
- rst: The total number of scalar arithmetic logic unit (SALU) operations issued. - Typically these are used for address calculations, literal constants, and - other operations that are provably uniform across a wavefront. Although scalar - memory (SMEM) operations are issued by the SALU, they are counted separately - in this section. - unit: Instructions - SMEM: - plain: The total number of scalar memory (SMEM) operations issued. These are - typically used for loading kernel arguments, base-pointers and loads from - HIP's __constant__ memory. - rst: The total number of scalar memory (SMEM) operations issued. These are typically - used for loading kernel arguments, base-pointers and loads from HIP's ``__constant__`` - memory. - unit: Instructions - Branch: - plain: The total number of branch operations issued. These typically consist - of jump or branch operations and are used to implement control flow. - rst: The total number of branch operations issued. These typically consist of jump - or branch operations and are used to implement control flow. - unit: Instructions - INT32: - plain: The total number of instructions operating on 32-bit integer operands - issued to the VALU per normalization unit. - rst: The total number of instructions operating on 32-bit integer operands issued - to the VALU per :ref:`normalization unit <normalization-units>`. - unit: Instructions per normalization unit - INT64: - plain: The total number of instructions operating on 64-bit integer operands - issued to the VALU per normalization unit. - rst: The total number of instructions operating on 64-bit integer operands issued - to the VALU per :ref:`normalization unit <normalization-units>`. - unit: Instructions per normalization unit - F16-ADD: - plain: The total number of addition instructions operating on 16-bit floating-point - operands issued to the VALU per normalization unit. 
- rst: The total number of addition instructions operating on 16-bit floating-point - operands issued to the VALU per :ref:`normalization unit <normalization-units>`. - unit: Instructions per normalization unit - F16-MUL: - plain: The total number of multiplication instructions operating on 16-bit floating-point - operands issued to the VALU per normalization unit. - rst: The total number of multiplication instructions operating on 16-bit floating-point - operands issued to the VALU per :ref:`normalization unit <normalization-units>`. - unit: Instructions per normalization unit - F16-FMA: - plain: The total number of fused multiply-add instructions operating on 16-bit - floating-point operands issued to the VALU per normalization unit. - rst: The total number of fused multiply-add instructions operating on 16-bit floating-point - operands issued to the VALU per :ref:`normalization unit <normalization-units>`. - unit: Instructions per normalization unit - F16-Trans: - plain: The total number of transcendental instructions (e.g., sqrt) operating - on 16-bit floating-point operands issued to the VALU per normalization unit. - rst: The total number of transcendental instructions (e.g., `sqrt`) operating on - 16-bit floating-point operands issued to the VALU per :ref:`normalization - unit <normalization-units>`. - unit: Instructions per normalization unit - F32-ADD: - plain: The total number of addition instructions operating on 32-bit floating-point - operands issued to the VALU per normalization unit. - rst: The total number of addition instructions operating on 32-bit floating-point - operands issued to the VALU per :ref:`normalization unit <normalization-units>`. - unit: Instructions per normalization unit - F32-MUL: - plain: The total number of multiplication instructions operating on 32-bit floating-point - operands issued to the VALU per normalization unit. 
- rst: The total number of multiplication instructions operating on 32-bit floating-point - operands issued to the VALU per :ref:`normalization unit <normalization-units>`. - unit: Instructions per normalization unit - F32-FMA: - plain: The total number of fused multiply-add instructions operating on 32-bit - floating-point operands issued to the VALU per normalization unit. - rst: The total number of fused multiply-add instructions operating on 32-bit floating-point - operands issued to the VALU per :ref:`normalization unit <normalization-units>`. - unit: Instructions per normalization unit - F32-Trans: - plain: The total number of transcendental instructions (such as sqrt) operating - on 32-bit floating-point operands issued to the VALU per normalization unit. - rst: The total number of transcendental instructions (such as ``sqrt``) operating - on 32-bit floating-point operands issued to the VALU per :ref:`normalization - unit <normalization-units>`. - unit: Instructions per normalization unit - F64-ADD: - plain: The total number of addition instructions operating on 64-bit floating-point - operands issued to the VALU per normalization unit. - rst: The total number of addition instructions operating on 64-bit floating-point - operands issued to the VALU per :ref:`normalization unit <normalization-units>`. - unit: Instructions per normalization unit - F64-MUL: - plain: The total number of multiplication instructions operating on 64-bit floating-point - operands issued to the VALU per normalization unit. - rst: The total number of multiplication instructions operating on 64-bit floating-point - operands issued to the VALU per :ref:`normalization unit <normalization-units>`. - unit: Instructions per normalization unit - F64-FMA: - plain: The total number of fused multiply-add instructions operating on 64-bit - floating-point operands issued to the VALU per normalization unit. 
- rst: The total number of fused multiply-add instructions operating on 64-bit floating-point - operands issued to the VALU per :ref:`normalization unit <normalization-units>`. - unit: Instructions per normalization unit - F64-Trans: - plain: The total number of transcendental instructions (such as sqrt) operating - on 64-bit floating-point operands issued to the VALU per normalization unit. - rst: The total number of transcendental instructions (such as `sqrt`) operating - on 64-bit floating-point operands issued to the VALU per :ref:`normalization - unit <normalization-units>`. - unit: Instructions per normalization unit - Conversion: - plain: |- - The total number of type conversion instructions (such as converting - data to or from F32\u2194F64) issued to the VALU per normalization unit. - rst: |- - The total number of type conversion instructions (such as converting data - to or from F32\u2194F64) issued to the VALU per :ref:`normalization unit - <normalization-units>`. - unit: Instructions per normalization unit - Global/Generic Instr: - plain: The total number of global & generic memory instructions executed on - all compute units on the accelerator, per normalization unit. - rst: The total number of global & generic memory instructions executed on all :doc:`compute - units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`. - unit: Instructions per normalization unit - Global/Generic Read: - plain: The total number of global & generic memory read instructions executed - on all compute units on the accelerator, per normalization unit. - rst: The total number of global & generic memory read instructions executed - on all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization - unit <normalization-units>`. 
- unit: Instructions per normalization unit - Global/Generic Write: - plain: The total number of global & generic memory write instructions executed - on all compute units on the accelerator, per normalization unit. - rst: The total number of global & generic memory write instructions executed on - all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization - unit <normalization-units>`. - unit: Instructions per normalization unit - Global/Generic Atomic: - plain: The total number of global & generic memory atomic (with and without - return) instructions executed on all compute units on the accelerator, per - normalization unit. - rst: The total number of global & generic memory atomic (with and without return) - instructions executed on all :doc:`compute units <compute-unit>` on the accelerator, - per :ref:`normalization unit <normalization-units>`. - unit: Instructions per normalization unit - Spill/Stack Instr: - plain: The total number of spill/stack memory instructions executed on all compute - units on the accelerator, per normalization unit. - rst: The total number of spill/stack memory instructions executed on all :doc:`compute - units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`. - unit: Instructions per normalization unit - Spill/Stack Read: - plain: The total number of spill/stack memory read instructions executed on - all compute units on the accelerator, per normalization unit. - rst: The total number of spill/stack memory read instructions executed on all :doc:`compute - units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`. - unit: Instructions per normalization unit - Spill/Stack Write: - plain: The total number of spill/stack memory write instructions executed on - all compute units on the accelerator, per normalization unit. 
- rst: The total number of spill/stack memory write instructions executed on all :doc:`compute - units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`. - unit: Instructions per normalization unit - Spill/Stack Atomic: - plain: The total number of spill/stack memory atomic (with and without return) - instructions executed on all compute units on the accelerator, per normalization - unit. Typically unused as these memory operations are typically used to implement - thread-local storage. - rst: The total number of spill/stack memory atomic (with and without return) instructions - executed on all :doc:`compute units <compute-unit>` on the accelerator, per - :ref:`normalization unit <normalization-units>`. Typically unused as these - memory operations are typically used to implement thread-local storage. - unit: Instructions per normalization unit - MFMA-I8: - plain: The total number of 8-bit integer MFMA instructions issued per normalization - unit. - rst: The total number of 8-bit integer :ref:`MFMA <desc-mfma>` instructions issued - per :ref:`normalization unit <normalization-units>`. - unit: Instructions per normalization unit - MFMA-F8: - plain: The total number of 8-bit floating point MFMA instructions issued per - normalization unit. This is supported in AMD Instinct MI300 series and later - only. - rst: The total number of 8-bit floating point :ref:`MFMA <desc-mfma>` instructions issued - per :ref:`normalization unit <normalization-units>`. This is supported in - AMD Instinct MI300 series and later only. - unit: Instructions per normalization unit - MFMA-F16: - plain: The total number of 16-bit floating point MFMA instructions issued per - normalization unit. - rst: The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` instructions - issued per :ref:`normalization unit <normalization-units>`. 
- unit: Instructions per normalization unit - MFMA-BF16: - plain: The total number of 16-bit brain floating point MFMA instructions issued - per normalization unit. - rst: The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>` instructions - issued per :ref:`normalization unit <normalization-units>`. - unit: Instructions per normalization unit - MFMA-F32: - plain: The total number of 32-bit floating-point MFMA instructions issued per - normalization unit. - rst: The total number of 32-bit floating-point :ref:`MFMA <desc-mfma>` instructions - issued per :ref:`normalization unit <normalization-units>`. - unit: Instructions per normalization unit - MFMA-F64: - plain: The total number of 64-bit floating-point MFMA instructions issued per - normalization unit. - rst: The total number of 64-bit floating-point :ref:`MFMA <desc-mfma>` instructions - issued per :ref:`normalization unit <normalization-units>`. - unit: Instructions per normalization unit -- id: 1100 - title: Compute Units - Compute Pipeline - data source: - - metric_table: - id: 1101 - title: Compute Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - peak: Peak - pop: Pct of Peak - metric: - gfx90a: - VALU FLOPs: - value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) - + (64 * 
(((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) - / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) - VALU IOPs: - value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - - Start_Timestamp))) - unit: GIOP - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / - (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) - * 64) * 2) / 1000)) - MFMA FLOPs (BF16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000)) - MFMA FLOPs (F16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000)) - MFMA FLOPs (F32): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - MFMA FLOPs (F64): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - MFMA IOPs (INT8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GIOP - peak: ((($max_sclk * $cu_per_gpu) * 1024) / 1000) 
- pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 1024) / 1000)) - gfx941: - VALU FLOPs: - value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) - / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) - VALU IOPs: - value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - - Start_Timestamp))) - unit: GIOP - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / - (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) - * 64) * 2) / 1000)) - MFMA FLOPs (F8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - MFMA FLOPs (BF16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) - pop: ((100 * 
AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) - MFMA FLOPs (F16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) - MFMA FLOPs (F32): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - MFMA FLOPs (F64): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - MFMA IOPs (INT8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GIOP - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - gfx940: - VALU FLOPs: - value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + 
SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) - / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) - VALU IOPs: - value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - - Start_Timestamp))) - unit: GIOP - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / - (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) - * 64) * 2) / 1000)) - MFMA FLOPs (F8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - MFMA FLOPs (BF16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) - MFMA FLOPs (F16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) - MFMA FLOPs (F32): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 
256) / 1000)) - MFMA FLOPs (F64): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - MFMA IOPs (INT8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GIOP - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - gfx942: - VALU FLOPs: - value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) - / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) - VALU IOPs: - value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - - Start_Timestamp))) - unit: GIOP - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / - (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) - * 64) * 2) / 1000)) - MFMA FLOPs (F8): - 
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - MFMA FLOPs (BF16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) - MFMA FLOPs (F16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) - MFMA FLOPs (F32): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - MFMA FLOPs (F64): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - MFMA IOPs (INT8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GIOP - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - gfx950: - VALU FLOPs: - value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + 
SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32)))) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))) - / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)) - VALU IOPs: - value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp - - Start_Timestamp))) - unit: GIOP - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) - pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / - (End_Timestamp - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) - * 64) * 2) / 1000)) - MFMA FLOPs (F8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) - MFMA FLOPs (BF16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - MFMA FLOPs (F16): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: 
((($max_sclk * $cu_per_gpu) * 4096) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) - MFMA FLOPs (F32): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000)) - MFMA FLOPs (F64): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000)) - MFMA FLOPs (F6F4): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 16834) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 16834) / 1000)) - MFMA IOPs (INT8): - value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) - unit: GIOP - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) - pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) - gfx908: {} - - metric_table: - id: 1102 - title: Pipeline Statistics - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - metric: - gfx90a: - IPC: - avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - unit: Instr/cycle - IPC (Issued): - avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / SQ_ACTIVE_INST_ANY)) - min: 
MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / SQ_ACTIVE_INST_ANY)) - max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / SQ_ACTIVE_INST_ANY)) - unit: Instr/cycle - SALU Utilization: - avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - VALU Utilization: - avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - VMEM Utilization: - avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) - / $cu_per_gpu)) - min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) - / $cu_per_gpu)) - max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) - / $cu_per_gpu)) - unit: pct - Branch Utilization: - avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - VALU Active Threads: - avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - unit: Threads - MFMA Utilization: - avg: AVG(((100 * 
SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) - min: MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) - max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) - unit: pct - MFMA Instruction Cycles: - avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA - != 0) else None)) - min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA - != 0) else None)) - max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA - != 0) else None)) - unit: cycles/instr - VMEM Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_VMEM - SMEM Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) - else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) - else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) - else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_SMEM - gfx941: - IPC: - avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - unit: Instr/cycle - IPC (Issued): - avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / SQ_ACTIVE_INST_ANY)) - min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / SQ_ACTIVE_INST_ANY)) - max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / 
SQ_ACTIVE_INST_ANY)) - unit: Instr/cycle - SALU Utilization: - avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - VALU Utilization: - avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - VMEM Utilization: - avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) - / $cu_per_gpu)) - min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) - / $cu_per_gpu)) - max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) - / $cu_per_gpu)) - unit: pct - Branch Utilization: - avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - VALU Active Threads: - avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - unit: Threads - MFMA Utilization: - avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) - min: MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) - max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) - unit: pct - MFMA Instruction Cycles: - avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / 
SQ_INSTS_MFMA) if (SQ_INSTS_MFMA - != 0) else None)) - min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA - != 0) else None)) - max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA - != 0) else None)) - unit: cycles/instr - VMEM Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_VMEM - SMEM Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) - else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) - else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) - else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_SMEM - gfx940: - IPC: - avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - unit: Instr/cycle - IPC (Issued): - avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / SQ_ACTIVE_INST_ANY)) - min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / SQ_ACTIVE_INST_ANY)) - max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / SQ_ACTIVE_INST_ANY)) - unit: Instr/cycle - SALU Utilization: - avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - VALU Utilization: - avg: 
AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - VMEM Utilization: - avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) - / $cu_per_gpu)) - min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) - / $cu_per_gpu)) - max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) - / $cu_per_gpu)) - unit: pct - Branch Utilization: - avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - VALU Active Threads: - avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - unit: Threads - MFMA Utilization: - avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) - min: MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) - max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) - unit: pct - MFMA Instruction Cycles: - avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA - != 0) else None)) - min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA - != 0) else None)) - max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA - != 0) else None)) - unit: cycles/instr - VMEM Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else 
None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_VMEM - SMEM Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) - else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) - else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) - else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_SMEM - gfx942: - IPC: - avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - unit: Instr/cycle - IPC (Issued): - avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / SQ_ACTIVE_INST_ANY)) - min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / SQ_ACTIVE_INST_ANY)) - max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / SQ_ACTIVE_INST_ANY)) - unit: Instr/cycle - SALU Utilization: - avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - VALU Utilization: - avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - VMEM Utilization: - avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / 
$GRBM_GUI_ACTIVE_PER_XCD) - / $cu_per_gpu)) - min: MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) - / $cu_per_gpu)) - max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) - / $cu_per_gpu)) - unit: pct - Branch Utilization: - avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - VALU Active Threads: - avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - unit: Threads - MFMA Utilization: - avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) - min: MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) - max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) - unit: pct - MFMA Instruction Cycles: - avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA - != 0) else None)) - min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA - != 0) else None)) - max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA - != 0) else None)) - unit: cycles/instr - VMEM Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_VMEM - SMEM Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) - else None)) - min: 
MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) - else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) - else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_SMEM - gfx950: - IPC: - avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - unit: Instr/cycle - IPC (Issued): - avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / SQ_ACTIVE_INST_ANY)) - min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / SQ_ACTIVE_INST_ANY)) - max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / SQ_ACTIVE_INST_ANY)) - unit: Instr/cycle - SALU Utilization: - avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - VALU Utilization: - avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - VALU Co-Issue Efficiency: - avg: AVG((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2)) - min: MIN((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2)) - max: MAX((100 * SQ_ACTIVE_INST_VALU2) / (SQ_ACTIVE_INST_VALU - SQ_ACTIVE_INST_VALU2)) - unit: pct - VMEM Utilization: - avg: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) - / $cu_per_gpu)) - min: 
MIN((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) - / $cu_per_gpu)) - max: MAX((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD) - / $cu_per_gpu)) - unit: pct - Branch Utilization: - avg: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - VALU Active Threads: - avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - unit: Threads - MFMA Utilization: - avg: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) - min: MIN(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) - max: MAX(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / ((4 * $cu_per_gpu) * $GRBM_GUI_ACTIVE_PER_XCD))) - unit: pct - MFMA Instruction Cycles: - avg: AVG(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA - != 0) else None)) - min: MIN(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA - != 0) else None)) - max: MAX(((SQ_VALU_MFMA_BUSY_CYCLES / SQ_INSTS_MFMA) if (SQ_INSTS_MFMA - != 0) else None)) - unit: cycles/instr - VMEM Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_VMEM) if (SQ_INSTS_VMEM != 0) - else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_VMEM - SMEM Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) - else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if 
(SQ_INSTS_SMEM != 0) - else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_SMEM) if (SQ_INSTS_SMEM != 0) - else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_SMEM - gfx908: - IPC: - avg: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - min: MIN((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - max: MAX((SQ_INSTS / SQ_BUSY_CU_CYCLES)) - unit: Instr/cycle - IPC (Issued): - avg: AVG(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / SQ_ACTIVE_INST_ANY)) - min: MIN(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / SQ_ACTIVE_INST_ANY)) - max: MAX(((((((((SQ_INSTS_VALU + SQ_INSTS_VMEM) + SQ_INSTS_SALU) + SQ_INSTS_SMEM)) - + SQ_INSTS_BRANCH) + SQ_INSTS_SENDMSG) + SQ_INSTS_VSKIPPED + SQ_INSTS_LDS) - / SQ_ACTIVE_INST_ANY)) - unit: Instr/cycle - SALU Utilization: - avg: AVG((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * SQ_ACTIVE_INST_SCA) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - VALU Utilization: - avg: AVG((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - min: MIN((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - max: MAX((((100 * SQ_ACTIVE_INST_VALU) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu)) - unit: pct - VALU Active Threads: - avg: AVG(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - min: MIN(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - max: MAX(((SQ_THREAD_CYCLES_VALU / SQ_ACTIVE_INST_VALU) if (SQ_ACTIVE_INST_VALU - != 0) else None)) - unit: Threads - - metric_table: - id: 1103 - title: Arithmetic Operations - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - metric: - 
gfx90a: - FLOPs (Total): - avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * - SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) - + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) - / $denom)) - min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * - SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) - + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) - / $denom)) - max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * - SQ_INSTS_VALU_MFMA_MOPS_F16) + (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) - + (64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_FMA_F32 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) - + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) - / $denom)) - unit: (OPs + $normUnit) - IOPs (Total): - avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 - * 512)) / $denom) - min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 - * 512)) / $denom) - max: MAX(((64 * (SQ_INSTS_VALU_INT32 + 
SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 - * 512)) / $denom) - unit: (OPs + $normUnit) - F16 OPs: - avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) - + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + - (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) - min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) - + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + - (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) - max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) - + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + - (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) - unit: (OPs + $normUnit) - BF16 OPs: - avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) - min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) - max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) - unit: (OPs + $normUnit) - F32 OPs: - avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) - / $denom)) - min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) - / $denom)) - max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) - / $denom)) - unit: (OPs + $normUnit) - F64 OPs: - avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) - / $denom)) - min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) - / $denom)) - max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + 
SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) - / $denom)) - unit: (OPs + $normUnit) - INT8 OPs: - avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - unit: (OPs + $normUnit) - gfx941: - FLOPs (Total): - avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * - SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + - (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 - * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 - + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 - * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) - min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * - SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + - (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 - * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 - + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 - * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) - max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * - SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + - (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 - * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * 
(((SQ_INSTS_VALU_ADD_F64 - + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 - * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) - unit: (OPs + $normUnit) - IOPs (Total): - avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 - * 512)) / $denom) - min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 - * 512)) / $denom) - max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 - * 512)) / $denom) - unit: (OPs + $normUnit) - F8 OPs: - avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) - min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) - max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) - unit: (OPs + $normUnit) - F16 OPs: - avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) - + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + - (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) - min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) - + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + - (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) - max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) - + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + - (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) - unit: (OPs + $normUnit) - BF16 OPs: - avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) - min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) - max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) - unit: (OPs + $normUnit) - F32 OPs: - avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) - / $denom)) - min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) - / 
$denom)) - max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) - / $denom)) - unit: (OPs + $normUnit) - F64 OPs: - avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) - / $denom)) - min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) - / $denom)) - max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) - / $denom)) - unit: (OPs + $normUnit) - INT8 OPs: - avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - unit: (OPs + $normUnit) - gfx940: - FLOPs (Total): - avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * - SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + - (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 - * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 - + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 - * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) - min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * - SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + - (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 - * 
2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 - + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 - * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) - max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * - SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + - (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 - * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 - + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 - * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) - unit: (OPs + $normUnit) - IOPs (Total): - avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 - * 512)) / $denom) - min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 - * 512)) / $denom) - max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 - * 512)) / $denom) - unit: (OPs + $normUnit) - F8 OPs: - avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) - min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) - max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) - unit: (OPs + $normUnit) - F16 OPs: - avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) - + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + - (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) - min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) - + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + - (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) - max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) - + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + - (512 * 
SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) - unit: (OPs + $normUnit) - BF16 OPs: - avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) - min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) - max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) - unit: (OPs + $normUnit) - F32 OPs: - avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) - / $denom)) - min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) - / $denom)) - max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) - / $denom)) - unit: (OPs + $normUnit) - F64 OPs: - avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) - / $denom)) - min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) - / $denom)) - max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) - / $denom)) - unit: (OPs + $normUnit) - INT8 OPs: - avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - unit: (OPs + $normUnit) - gfx942: - FLOPs (Total): - avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * - SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + - (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + 
SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 - * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 - + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 - * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) - min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * - SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + - (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 - * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 - + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 - * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) - max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * - SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + - (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 - * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 - + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 - * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) / $denom)) - unit: (OPs + $normUnit) - IOPs (Total): - avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 - * 512)) / $denom) - min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 - * 512)) / $denom) - max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 - * 512)) / $denom) - unit: (OPs + $normUnit) - F8 OPs: - avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) - min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / 
$denom)) - max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) - unit: (OPs + $normUnit) - F16 OPs: - avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) - + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + - (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) - min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) - + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + - (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) - max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) - + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + - (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) - unit: (OPs + $normUnit) - BF16 OPs: - avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) - min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) - max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) - unit: (OPs + $normUnit) - F32 OPs: - avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) - / $denom)) - min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) - / $denom)) - max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) - / $denom)) - unit: (OPs + $normUnit) - F64 OPs: - avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) - / $denom)) - min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) - / $denom)) - max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + 
(SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) - / $denom)) - unit: (OPs + $normUnit) - INT8 OPs: - avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - unit: (OPs + $normUnit) - gfx950: - FLOPs (Total): - avg: AVG((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * - SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + - (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 - * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 - + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 - * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) - / $denom)) - min: MIN((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * - SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + - (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 - * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 - + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 - * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) - / $denom)) - max: MAX((((((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) - + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_FMA_F16 * 2))) + ((512 * - SQ_INSTS_VALU_MFMA_MOPS_F8) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F16) + - (512 * SQ_INSTS_VALU_MFMA_MOPS_BF16))) + (64 * (((SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (SQ_INSTS_VALU_FMA_F32 - * 2)))) + (512 * 
SQ_INSTS_VALU_MFMA_MOPS_F32)) + (64 * (((SQ_INSTS_VALU_ADD_F64 - + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) + (SQ_INSTS_VALU_FMA_F64 - * 2)))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4)) - / $denom)) - unit: (OPs + $normUnit) - IOPs (Total): - avg: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 - * 512)) / $denom) - min: MIN(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 - * 512)) / $denom) - max: MAX(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) + (SQ_INSTS_VALU_MFMA_MOPS_I8 - * 512)) / $denom) - unit: (OPs + $normUnit) - F8 OPs: - avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) - min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) - max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_F8) / $denom)) - unit: (OPs + $normUnit) - F16 OPs: - avg: AVG(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) - + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + - (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) - min: MIN(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) - + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + - (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) - max: MAX(((((((64 * SQ_INSTS_VALU_ADD_F16) + (64 * SQ_INSTS_VALU_MUL_F16)) - + (64 * SQ_INSTS_VALU_TRANS_F16)) + (128 * SQ_INSTS_VALU_FMA_F16)) + - (512 * SQ_INSTS_VALU_MFMA_MOPS_F16)) / $denom)) - unit: (OPs + $normUnit) - BF16 OPs: - avg: AVG(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) - min: MIN(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) - max: MAX(((512 * SQ_INSTS_VALU_MFMA_MOPS_BF16) / $denom)) - unit: (OPs + $normUnit) - F32 OPs: - avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) - / $denom)) - min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) - 
+ (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) - / $denom)) - max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_FMA_F32 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F32)) - / $denom)) - unit: (OPs + $normUnit) - F64 OPs: - avg: AVG((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) - / $denom)) - min: MIN((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) - / $denom)) - max: MAX((((64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64) - + (SQ_INSTS_VALU_FMA_F64 * 2))) + (512 * SQ_INSTS_VALU_MFMA_MOPS_F64)) - / $denom)) - unit: (OPs + $normUnit) - F6F4 OPs: - avg: AVG((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom) - min: MIN((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom) - max: MAX((512 * SQ_INSTS_VALU_MFMA_MOPS_F6F4) / $denom) - unit: (OPs + $normUnit) - INT8 OPs: - avg: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - min: MIN(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom)) - unit: (OPs + $normUnit) - gfx908: {} - metrics_description: - VALU FLOPs: - plain: |- - The total floating-point operations executed per second on the VALU. - This is also presented as a percent of the peak theoretical FLOPs achievable - on the specific accelerator. Note: this does not include any floating-point - operations from MFMA instructions. - rst: |- - The total floating-point operations executed per second on the :ref:`VALU - <desc-valu>`. This is also presented as a percent of the peak theoretical - FLOPs achievable on the specific accelerator. Note: this does not include - any floating-point operations from :ref:`MFMA <desc-mfma>` instructions. 
- unit: GFLOPs - VALU IOPs: - plain: |- - The total integer operations executed per second on the VALU. This is - also presented as a percent of the peak theoretical IOPs achievable on the - specific accelerator. Note: this does not include any integer operations from - MFMA instructions. - rst: |- - The total integer operations executed per second on the :ref:`VALU <desc-valu>`. - This is also presented as a percent of the peak theoretical IOPs achievable - on the specific accelerator. Note: this does not include any integer operations - from :ref:`MFMA <desc-mfma>` instructions. - unit: GIOPs - MFMA FLOPs (BF16): - plain: |- - The total number of 16-bit brain floating point MFMA operations executed - per second. Note: this does not include any 16-bit brain floating point operations - from VALU instructions. This is also presented as a percent of the peak theoretical - BF16 MFMA operations achievable on the specific accelerator. - rst: |- - The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>` operations - executed per second. Note: this does not include any 16-bit brain floating - point operations from :ref:`VALU <desc-valu>` instructions. This is also - presented as a percent of the peak theoretical BF16 MFMA operations achievable - on the specific accelerator. - unit: GFLOPs - MFMA FLOPs (F16): - plain: |- - The total number of 16-bit floating point MFMA operations executed per - second. Note: this does not include any 16-bit floating point operations from - VALU instructions. This is also presented as a percent of the peak theoretical - F16 MFMA operations achievable on the specific accelerator. - rst: |- - The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations - executed per second. Note: this does not include any 16-bit floating point - operations from :ref:`VALU <desc-valu>` instructions. This is also presented - as a percent of the peak theoretical F16 MFMA operations achievable on the - specific accelerator. 
- unit: GFLOPs - MFMA FLOPs (F32): - plain: |- - The total number of 32-bit floating point MFMA operations executed per - second. Note: this does not include any 32-bit floating point operations from - VALU instructions. This is also presented as a percent of the peak theoretical - F32 MFMA operations achievable on the specific accelerator. - rst: |- - The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations - executed per second. Note: this does not include any 32-bit floating point - operations from :ref:`VALU <desc-valu>` instructions. This is also presented - as a percent of the peak theoretical F32 MFMA operations achievable on the - specific accelerator. - unit: GFLOPs - MFMA FLOPs (F64): - plain: |- - The total number of 64-bit floating point MFMA operations executed per - second. Note: this does not include any 64-bit floating point operations from - VALU instructions. This is also presented as a percent of the peak theoretical - F64 MFMA operations achievable on the specific accelerator. - rst: |- - The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations - executed per second. Note: this does not include any 64-bit floating point - operations from :ref:`VALU <desc-valu>` instructions. This is also presented - as a percent of the peak theoretical F64 MFMA operations achievable on the - specific accelerator. - unit: GFLOPs - MFMA IOPs (INT8): - plain: |- - The total number of 8-bit integer MFMA operations executed per second. - Note: this does not include any 8-bit integer operations from VALU instructions. 
- This is also presented as a percent of the peak theoretical INT8 MFMA operations - achievable on the specific accelerator. - rst: |- - The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed - per second. Note: this does not include any 8-bit integer operations from - :ref:`VALU <desc-valu>` instructions. This is also presented as a percent - of the peak theoretical INT8 MFMA operations achievable on the specific accelerator. - unit: GFLOPs - IPC: - plain: The ratio of the total number of instructions executed on the CU over - the total active CU cycles. - rst: The ratio of the total number of instructions executed on the :doc:`CU - <compute-unit>` over the :ref:`total active CU cycles <total-active-cu-cycles>`. - unit: Instructions per cycle - IPC (Issued): - plain: The ratio of the total number of (non-internal) instructions issued over - the number of cycles where the scheduler was actively working on issuing instructions. - rst: The ratio of the total number of (non-:ref:`internal <ipc-internal-instructions>`) - instructions issued over the number of cycles where the :ref:`scheduler <desc-scheduler>` - was actively working on issuing instructions. Refer to the :ref:`Issued - IPC <issued-ipc>` example for further detail. - unit: Instructions per cycle - SALU Utilization: - plain: Indicates what percent of the kernel's duration the SALU was busy executing - instructions. Computed as the ratio of the total number of cycles spent by - the scheduler issuing SALU / SMEM instructions over the total CU cycles. - rst: Indicates what percent of the kernel's duration the :ref:`SALU <desc-salu>` - was busy executing instructions. Computed as the ratio of the total number - of cycles spent by the :ref:`scheduler <desc-scheduler>` issuing SALU / :ref:`SMEM - <desc-smem>` instructions over the :ref:`total CU cycles <total-cu-cycles>`. 
- unit: Percent - VALU Utilization: - plain: Indicates what percent of the kernel's duration the VALU was busy executing - instructions. Does not include VMEM operations. Computed as the ratio of the - total number of cycles spent by the scheduler issuing VALU instructions over - the total CU cycles. - rst: Indicates what percent of the kernel's duration the :ref:`VALU <desc-valu>` - was busy executing instructions. Does not include :ref:`VMEM <desc-vmem>` - operations. Computed as the ratio of the total number of cycles spent by - the :ref:`scheduler <desc-scheduler>` issuing VALU instructions over the - :ref:`total CU cycles <total-cu-cycles>`. - unit: Percent - VMEM Utilization: - plain: Indicates what percent of the kernel's duration the VMEM unit was busy - executing instructions, including both global/generic and spill/scratch operations - (see the VMEM instruction count metrics for more detail). Does not include - VALU operations. Computed as the ratio of the total number of cycles spent - by the scheduler issuing VMEM instructions over the total CU cycles. - rst: Indicates what percent of the kernel's duration the :ref:`VMEM <desc-vmem>` - unit was busy executing instructions, including both global/generic and spill/scratch - operations (see the :ref:`VMEM instruction count metrics <ta-instruction-counts>` - for more detail). Does not include :ref:`VALU <desc-valu>` operations. Computed as - the ratio of the total number of cycles spent by the :ref:`scheduler <desc-scheduler>` - issuing VMEM instructions over the :ref:`total CU cycles <total-cu-cycles>`. - unit: Percent - Branch Utilization: - plain: Indicates what percent of the kernel's duration the branch unit was busy - executing instructions. Computed as the ratio of the total number of cycles - spent by the scheduler issuing branch instructions over the total CU cycles. - rst: Indicates what percent of the kernel's duration the :ref:`branch <desc-branch>` - unit was busy executing instructions. 
Computed as the ratio of the total - number of cycles spent by the :ref:`scheduler <desc-scheduler>` issuing branch - instructions over the :ref:`total CU cycles <total-cu-cycles>`. - unit: Percent - VALU Active Threads: - plain: Indicates the average level of divergence within a wavefront over the - lifetime of the kernel. The number of work-items that were active in a wavefront - during execution of each VALU instruction, time-averaged over all VALU instructions - run on all wavefronts in the kernel. - rst: Indicates the average level of :ref:`divergence <desc-divergence>` within a - wavefront over the lifetime of the kernel. The number of work-items that - were active in a wavefront during execution of each :ref:`VALU <desc-valu>` - instruction, time-averaged over all VALU instructions run on all wavefronts - in the kernel. - unit: Work-items - MFMA Utilization: - plain: Indicates what percent of the kernel's duration the MFMA unit was busy - executing instructions. Computed as the ratio of the total number of cycles - the MFMA unit was busy over the total CU cycles. - rst: Indicates what percent of the kernel's duration the :ref:`MFMA <desc-mfma>` - unit was busy executing instructions. Computed as the ratio of the total - number of cycles the :ref:`MFMA <desc-mfma>` unit was busy over the :ref:`total - CU cycles <total-cu-cycles>`. - unit: Percent - MFMA Instruction Cycles: - plain: The average duration of MFMA instructions in this kernel in cycles. Computed - as the ratio of the total number of cycles the MFMA unit was busy over the - total number of MFMA instructions. - rst: The average duration of :ref:`MFMA <desc-mfma>` instructions in this kernel - in cycles. Computed as the ratio of the total number of cycles the MFMA unit - was busy over the total number of MFMA instructions. Compare to, for example, - the `AMD Matrix Instruction Calculator <https://github.com/RadeonOpenCompute/amd_matrix_instruction_calculator>`_. 
- unit: Cycles per instruction - VMEM Latency: - plain: The average number of round-trip cycles (that is, from issue to data - return / acknowledgment) required for a VMEM instruction to complete. - rst: The average number of round-trip cycles (that is, from issue to data return - / acknowledgment) required for a VMEM instruction to complete. - unit: Cycles - SMEM Latency: - plain: The average number of round-trip cycles (that is, from issue to data - return / acknowledgment) required for a SMEM instruction to complete. - rst: The average number of round-trip cycles (that is, from issue to data return - / acknowledgment) required for a SMEM instruction to complete. - unit: Cycles - FLOPs (Total): - plain: The total number of floating-point operations executed on either the - VALU or MFMA units, per normalization unit. - rst: The total number of floating-point operations executed on either the :ref:`VALU - <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit - <normalization-units>`. - unit: FLOP per normalization unit - IOPs (Total): - plain: The total number of integer operations executed on either the VALU or - MFMA units, per normalization unit. - rst: The total number of integer operations executed on either the :ref:`VALU - <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit - <normalization-units>`. - unit: IOP per normalization unit - F16 OPs: - plain: The total number of 16-bit floating-point operations executed on either - the VALU or MFMA units, per normalization unit. - rst: The total number of 16-bit floating-point operations executed on either - the :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization - unit <normalization-units>`. - unit: FLOP per normalization unit - BF16 OPs: - plain: The total number of 16-bit brain floating-point operations executed on - either the VALU or MFMA units, per normalization unit. 
- rst: |- - The total number of 16-bit brain floating-point operations executed on - either the :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization - unit <normalization-units>`. Note: on current CDNA accelerators, the VALU - has no native BF16 instructions. - unit: FLOP per normalization unit - F32 OPs: - plain: The total number of 32-bit floating-point operations executed on either - the VALU or MFMA units, per normalization unit. - rst: The total number of 32-bit floating-point operations executed on either the - :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization - unit <normalization-units>`. - unit: FLOP per normalization unit - F64 OPs: - plain: The total number of 64-bit floating-point operations executed on either - the VALU or MFMA units, per normalization unit. - rst: The total number of 64-bit floating-point operations executed on either the - :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization - unit <normalization-units>`. - unit: FLOP per normalization unit - INT8 OPs: - plain: The total number of 8-bit integer operations executed on either the VALU - or MFMA units, per normalization unit. - rst: |- - The total number of 8-bit integer operations executed on either the :ref:`VALU - <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit - <normalization-units>`. Note: on current CDNA accelerators, the VALU has - no native INT8 instructions. 
- unit: IOP per normalization unit -- id: 1200 - title: Local Data Share (LDS) - data source: - - metric_table: - id: 1201 - title: LDS Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - metric: - gfx90a: - Utilization: - value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: Pct of Peak - Access Rate: - value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: Pct of Peak - Theoretical Bandwidth Utilization: - value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * - 0.00128))) - unit: Pct of Peak - Bank Conflict Rate: - value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - unit: Pct of Peak - gfx941: - Utilization: - value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: Pct of Peak - Access Rate: - value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: Pct of Peak - Theoretical Bandwidth Utilization: - value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * - 0.00128))) - unit: Pct of Peak - Bank Conflict Rate: - value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - unit: Pct of Peak - gfx940: - Utilization: - value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: Pct of Peak - Access Rate: - value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: Pct of Peak - Theoretical Bandwidth Utilization: - value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - 
Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * - 0.00128))) - unit: Pct of Peak - Bank Conflict Rate: - value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - unit: Pct of Peak - gfx942: - Utilization: - value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: Pct of Peak - Access Rate: - value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: Pct of Peak - Theoretical Bandwidth Utilization: - value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * - 0.00128))) - unit: Pct of Peak - Bank Conflict Rate: - value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - unit: Pct of Peak - gfx950: - Utilization: - value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: Pct of Peak - Access Rate: - value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: Pct of Peak - Theoretical Bandwidth Utilization: - value: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * - 0.00128))) - unit: Pct of Peak - Bank Conflict Rate: - value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - unit: Pct of Peak - gfx908: - Utilization: - value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: Pct of Peak - Access Rate: - value: AVG(((200 * SQ_ACTIVE_INST_LDS) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: Pct of Peak - Theoretical Bandwidth Utilization: - value: AVG((((((SQ_LDS_IDX_ACTIVE - 
SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * - 0.00128))) - unit: Pct of Peak - Bank Conflict Rate: - value: AVG((((SQ_LDS_BANK_CONFLICT * 3.125) / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - unit: Pct of Peak - comparable: false - cli_style: simple_bar - tui_style: simple_bar - - metric_table: - id: 1202 - title: LDS Statistics - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - metric: - gfx90a: - LDS Instructions: - avg: AVG((SQ_INSTS_LDS / $denom)) - min: MIN((SQ_INSTS_LDS / $denom)) - max: MAX((SQ_INSTS_LDS / $denom)) - unit: (Instr + $normUnit) - Theoretical Bandwidth: - avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp))) - min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp))) - max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp))) - unit: Gbps - LDS Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) - else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) - else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) - else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_LDS - Bank Conflicts/Access: - avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - unit: Conflicts/Access - Index Accesses: - avg: 
AVG((SQ_LDS_IDX_ACTIVE / $denom)) - min: MIN((SQ_LDS_IDX_ACTIVE / $denom)) - max: MAX((SQ_LDS_IDX_ACTIVE / $denom)) - unit: (Cycles + $normUnit) - Atomic Return Cycles: - avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom)) - min: MIN((SQ_LDS_ATOMIC_RETURN / $denom)) - max: MAX((SQ_LDS_ATOMIC_RETURN / $denom)) - unit: (Cycles + $normUnit) - Bank Conflict: - avg: AVG((SQ_LDS_BANK_CONFLICT / $denom)) - min: MIN((SQ_LDS_BANK_CONFLICT / $denom)) - max: MAX((SQ_LDS_BANK_CONFLICT / $denom)) - unit: (Cycles + $normUnit) - Addr Conflict: - avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom)) - min: MIN((SQ_LDS_ADDR_CONFLICT / $denom)) - max: MAX((SQ_LDS_ADDR_CONFLICT / $denom)) - unit: (Cycles + $normUnit) - Unaligned Stall: - avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom)) - min: MIN((SQ_LDS_UNALIGNED_STALL / $denom)) - max: MAX((SQ_LDS_UNALIGNED_STALL / $denom)) - unit: (Cycles + $normUnit) - Mem Violations: - avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom)) - min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom)) - max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom)) - unit: (Accesses + $normUnit) - gfx941: - LDS Instructions: - avg: AVG((SQ_INSTS_LDS / $denom)) - min: MIN((SQ_INSTS_LDS / $denom)) - max: MAX((SQ_INSTS_LDS / $denom)) - unit: (Instr + $normUnit) - Theoretical Bandwidth: - avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp))) - min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp))) - max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp))) - unit: Gbps - LDS Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) - else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) - else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) - else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_LDS - Bank Conflicts/Access: - 
avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - unit: Conflicts/Access - Index Accesses: - avg: AVG((SQ_LDS_IDX_ACTIVE / $denom)) - min: MIN((SQ_LDS_IDX_ACTIVE / $denom)) - max: MAX((SQ_LDS_IDX_ACTIVE / $denom)) - unit: (Cycles + $normUnit) - Atomic Return Cycles: - avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom)) - min: MIN((SQ_LDS_ATOMIC_RETURN / $denom)) - max: MAX((SQ_LDS_ATOMIC_RETURN / $denom)) - unit: (Cycles + $normUnit) - Bank Conflict: - avg: AVG((SQ_LDS_BANK_CONFLICT / $denom)) - min: MIN((SQ_LDS_BANK_CONFLICT / $denom)) - max: MAX((SQ_LDS_BANK_CONFLICT / $denom)) - unit: (Cycles + $normUnit) - Addr Conflict: - avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom)) - min: MIN((SQ_LDS_ADDR_CONFLICT / $denom)) - max: MAX((SQ_LDS_ADDR_CONFLICT / $denom)) - unit: (Cycles + $normUnit) - Unaligned Stall: - avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom)) - min: MIN((SQ_LDS_UNALIGNED_STALL / $denom)) - max: MAX((SQ_LDS_UNALIGNED_STALL / $denom)) - unit: (Cycles + $normUnit) - Mem Violations: - avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom)) - min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom)) - max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom)) - unit: (Accesses + $normUnit) - gfx940: - LDS Instructions: - avg: AVG((SQ_INSTS_LDS / $denom)) - min: MIN((SQ_INSTS_LDS / $denom)) - max: MAX((SQ_INSTS_LDS / $denom)) - unit: (Instr + $normUnit) - Theoretical Bandwidth: - avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp))) - min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp))) - 
max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp))) - unit: Gbps - LDS Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) - else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) - else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) - else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_LDS - Bank Conflicts/Access: - avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - unit: Conflicts/Access - Index Accesses: - avg: AVG((SQ_LDS_IDX_ACTIVE / $denom)) - min: MIN((SQ_LDS_IDX_ACTIVE / $denom)) - max: MAX((SQ_LDS_IDX_ACTIVE / $denom)) - unit: (Cycles + $normUnit) - Atomic Return Cycles: - avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom)) - min: MIN((SQ_LDS_ATOMIC_RETURN / $denom)) - max: MAX((SQ_LDS_ATOMIC_RETURN / $denom)) - unit: (Cycles + $normUnit) - Bank Conflict: - avg: AVG((SQ_LDS_BANK_CONFLICT / $denom)) - min: MIN((SQ_LDS_BANK_CONFLICT / $denom)) - max: MAX((SQ_LDS_BANK_CONFLICT / $denom)) - unit: (Cycles + $normUnit) - Addr Conflict: - avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom)) - min: MIN((SQ_LDS_ADDR_CONFLICT / $denom)) - max: MAX((SQ_LDS_ADDR_CONFLICT / $denom)) - unit: (Cycles + $normUnit) - Unaligned Stall: - avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom)) - min: MIN((SQ_LDS_UNALIGNED_STALL / $denom)) - max: MAX((SQ_LDS_UNALIGNED_STALL / $denom)) - unit: (Cycles + $normUnit) - Mem Violations: - avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom)) - min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom)) - max: MAX((SQ_LDS_MEM_VIOLATIONS / 
$denom)) - unit: (Accesses + $normUnit) - gfx942: - LDS Instructions: - avg: AVG((SQ_INSTS_LDS / $denom)) - min: MIN((SQ_INSTS_LDS / $denom)) - max: MAX((SQ_INSTS_LDS / $denom)) - unit: (Instr + $normUnit) - Theoretical Bandwidth: - avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp))) - min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp))) - max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp))) - unit: Gbps - LDS Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) - else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) - else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) - else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_LDS - Bank Conflicts/Access: - avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - unit: Conflicts/Access - Index Accesses: - avg: AVG((SQ_LDS_IDX_ACTIVE / $denom)) - min: MIN((SQ_LDS_IDX_ACTIVE / $denom)) - max: MAX((SQ_LDS_IDX_ACTIVE / $denom)) - unit: (Cycles + $normUnit) - Atomic Return Cycles: - avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom)) - min: MIN((SQ_LDS_ATOMIC_RETURN / $denom)) - max: MAX((SQ_LDS_ATOMIC_RETURN / $denom)) - unit: (Cycles + $normUnit) - Bank Conflict: - avg: AVG((SQ_LDS_BANK_CONFLICT / $denom)) - min: MIN((SQ_LDS_BANK_CONFLICT / $denom)) - max: MAX((SQ_LDS_BANK_CONFLICT / $denom)) - unit: (Cycles + $normUnit) - Addr Conflict: - 
avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom)) - min: MIN((SQ_LDS_ADDR_CONFLICT / $denom)) - max: MAX((SQ_LDS_ADDR_CONFLICT / $denom)) - unit: (Cycles + $normUnit) - Unaligned Stall: - avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom)) - min: MIN((SQ_LDS_UNALIGNED_STALL / $denom)) - max: MAX((SQ_LDS_UNALIGNED_STALL / $denom)) - unit: (Cycles + $normUnit) - Mem Violations: - avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom)) - min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom)) - max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom)) - unit: (Accesses + $normUnit) - gfx950: - LDS Instructions: - avg: AVG((SQ_INSTS_LDS / $denom)) - min: MIN((SQ_INSTS_LDS / $denom)) - max: MAX((SQ_INSTS_LDS / $denom)) - unit: (Instr + $normUnit) - LDS LOAD: - avg: AVG((SQ_INSTS_LDS_LOAD / $denom)) - min: MIN((SQ_INSTS_LDS_LOAD / $denom)) - max: MAX((SQ_INSTS_LDS_LOAD / $denom)) - unit: (instr + $normUnit) - LDS STORE: - avg: AVG((SQ_INSTS_LDS_STORE / $denom)) - min: MIN((SQ_INSTS_LDS_STORE / $denom)) - max: MAX((SQ_INSTS_LDS_STORE / $denom)) - unit: (instr + $normUnit) - LDS ATOMIC: - avg: AVG((SQ_INSTS_LDS_ATOMIC / $denom)) - min: MIN((SQ_INSTS_LDS_ATOMIC / $denom)) - max: MAX((SQ_INSTS_LDS_ATOMIC / $denom)) - unit: (instr + $normUnit) - LDS LOAD Bandwidth: - avg: AVG(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - min: MIN(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - max: MAX(64 * SQ_INSTS_LDS_LOAD_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - units: Gbps - LDS STORE Bandwidth: - avg: AVG(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - min: MIN(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - max: MAX(64 * SQ_INSTS_LDS_STORE_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - units: Gbps - LDS ATOMIC Bandwidth: - avg: AVG(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - min: MIN(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / (End_Timestamp - Start_Timestamp)) - max: MAX(64 * SQ_INSTS_LDS_ATOMIC_BANDWIDTH / 
(End_Timestamp - Start_Timestamp)) - units: Gbps - Theoretical Bandwidth: - avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp))) - min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp))) - max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp))) - unit: Gbps - LDS Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) - else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) - else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) - else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_LDS - Bank Conflicts/Access: - avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - unit: Conflicts/Access - Index Accesses: - avg: AVG((SQ_LDS_IDX_ACTIVE / $denom)) - min: MIN((SQ_LDS_IDX_ACTIVE / $denom)) - max: MAX((SQ_LDS_IDX_ACTIVE / $denom)) - unit: (Cycles + $normUnit) - Atomic Return Cycles: - avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom)) - min: MIN((SQ_LDS_ATOMIC_RETURN / $denom)) - max: MAX((SQ_LDS_ATOMIC_RETURN / $denom)) - unit: (Cycles + $normUnit) - Bank Conflict: - avg: AVG((SQ_LDS_BANK_CONFLICT / $denom)) - min: MIN((SQ_LDS_BANK_CONFLICT / $denom)) - max: MAX((SQ_LDS_BANK_CONFLICT / $denom)) - unit: (Cycles + $normUnit) - Addr Conflict: - avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom)) - min: MIN((SQ_LDS_ADDR_CONFLICT / $denom)) - max: MAX((SQ_LDS_ADDR_CONFLICT / $denom)) - unit: (Cycles + 
$normUnit) - Unaligned Stall: - avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom)) - min: MIN((SQ_LDS_UNALIGNED_STALL / $denom)) - max: MAX((SQ_LDS_UNALIGNED_STALL / $denom)) - unit: (Cycles + $normUnit) - Mem Violations: - avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom)) - min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom)) - max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom)) - unit: (Accesses + $normUnit) - LDS Command FIFO Full Rate: - avg: AVG((SQ_LDS_CMD_FIFO_FULL / $denom)) - min: MIN((SQ_LDS_CMD_FIFO_FULL / $denom)) - max: MAX((SQ_LDS_CMD_FIFO_FULL / $denom)) - unit: (Cycles + $normUnit) - LDS Data FIFO Full Rate: - avg: AVG((SQ_LDS_DATA_FIFO_FULL / $denom)) - min: MIN((SQ_LDS_DATA_FIFO_FULL / $denom)) - max: MAX((SQ_LDS_DATA_FIFO_FULL / $denom)) - unit: (Cycles + $normUnit) - gfx908: - LDS Instructions: - avg: AVG((SQ_INSTS_LDS / $denom)) - min: MIN((SQ_INSTS_LDS / $denom)) - max: MAX((SQ_INSTS_LDS / $denom)) - unit: (Instr + $normUnit) - Theoretical Bandwidth: - avg: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp))) - min: MIN(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp))) - max: MAX(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu)) - / (End_Timestamp - Start_Timestamp))) - unit: Gbps - LDS Latency: - avg: AVG(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) - else None)) - min: MIN(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) - else None)) - max: MAX(((SQ_ACCUM_PREV_HIRES / SQ_INSTS_LDS) if (SQ_INSTS_LDS != 0) - else None)) - unit: Cycles - coll_level: SQ_INST_LEVEL_LDS - Bank Conflicts/Access: - avg: AVG(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - min: MIN(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) 
- max: MAX(((SQ_LDS_BANK_CONFLICT / (SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT)) - if ((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) != 0) else None)) - unit: Conflicts/Access - Index Accesses: - avg: AVG((SQ_LDS_IDX_ACTIVE / $denom)) - min: MIN((SQ_LDS_IDX_ACTIVE / $denom)) - max: MAX((SQ_LDS_IDX_ACTIVE / $denom)) - unit: (Cycles + $normUnit) - Atomic Return Cycles: - avg: AVG((SQ_LDS_ATOMIC_RETURN / $denom)) - min: MIN((SQ_LDS_ATOMIC_RETURN / $denom)) - max: MAX((SQ_LDS_ATOMIC_RETURN / $denom)) - unit: (Cycles + $normUnit) - Bank Conflict: - avg: AVG((SQ_LDS_BANK_CONFLICT / $denom)) - min: MIN((SQ_LDS_BANK_CONFLICT / $denom)) - max: MAX((SQ_LDS_BANK_CONFLICT / $denom)) - unit: (Cycles + $normUnit) - Addr Conflict: - avg: AVG((SQ_LDS_ADDR_CONFLICT / $denom)) - min: MIN((SQ_LDS_ADDR_CONFLICT / $denom)) - max: MAX((SQ_LDS_ADDR_CONFLICT / $denom)) - unit: (Cycles + $normUnit) - Unaligned Stall: - avg: AVG((SQ_LDS_UNALIGNED_STALL / $denom)) - min: MIN((SQ_LDS_UNALIGNED_STALL / $denom)) - max: MAX((SQ_LDS_UNALIGNED_STALL / $denom)) - unit: (Cycles + $normUnit) - Mem Violations: - avg: AVG((SQ_LDS_MEM_VIOLATIONS / $denom)) - min: MIN((SQ_LDS_MEM_VIOLATIONS / $denom)) - max: MAX((SQ_LDS_MEM_VIOLATIONS / $denom)) - unit: (Accesses + $normUnit) - metrics_description: - Utilization: - plain: Indicates what percent of the kernel's duration the LDS was actively - executing instructions (including, but not limited to, load, store, atomic - and HIP's __shfl operations). Calculated as the ratio of the total number - of cycles LDS was active over the total CU cycles. - rst: Indicates what percent of the kernel's duration the :ref:`LDS <desc-lds>` was - actively executing instructions (including, but not limited to, load, store, - atomic and HIP's ``__shfl`` operations). Calculated as the ratio of the - total number of cycles LDS was active over the :ref:`total CU cycles <total-cu-cycles>`. 
- unit: Percent - Access Rate: - plain: Indicates the percentage of SIMDs in the VALU actively issuing LDS instructions, - averaged over the lifetime of the kernel. Calculated as the ratio of the total - number of cycles spent by the scheduler issuing LDS instructions over the - total CU cycles. - rst: Indicates the percentage of SIMDs in the :ref:`VALU <desc-valu>` [#lds-workload]_ - actively issuing LDS instructions, averaged over the lifetime of the kernel. - Calculated as the ratio of the total number of cycles spent by the :ref:`scheduler - <desc-scheduler>` issuing :ref:`LDS <desc-lds>` instructions over the :ref:`total - CU cycles <total-cu-cycles>`. - unit: Percent - Theoretical Bandwidth Utilization: - plain: Indicates the maximum amount of bytes that could have been loaded from, - stored to, or atomically updated in the LDS divided as percentage of theoretical peak. - Does not take into account the execution mask of the wavefront when the instruction - was executed. - rst: Indicates the maximum amount of bytes that could have been loaded from, stored - to, or atomically updated in the LDS divided as percentage of theoretical peak. - Does *not* take into account the execution mask of the wavefront when the - instruction was executed. See the :ref:`LDS bandwidth example <lds-bandwidth>` - for more detail. - unit: Percent - Theoretical Bandwidth: - plain: Indicates the maximum amount of bytes that could have been loaded from, - stored to, or atomically updated in the LDS divided by total duration. Does not - take into account the execution mask of the wavefront when the instruction - was executed. - rst: Indicates the maximum amount of bytes that could have been loaded from, stored - to, or atomically updated in the LDS divided by total duration. - Does *not* take into account the execution mask of the wavefront when the - instruction was executed. See the :ref:`LDS bandwidth example <lds-bandwidth>` - for more detail. 
- unit: Gbps - Bank Conflict Rate: - plain: Indicates the percentage of active LDS cycles that were spent servicing - bank conflicts. Calculated as the ratio of LDS cycles spent servicing bank - conflicts over the number of LDS cycles that would have been required to move - the same amount of data in an uncontended access. - rst: Indicates the percentage of active LDS cycles that were spent servicing bank - conflicts. Calculated as the ratio of LDS cycles spent servicing bank conflicts - over the number of LDS cycles that would have been required to move the same - amount of data in an uncontended access. [#lds-bank-conflict]_ - unit: Percent - LDS Instructions: - plain: The total number of LDS instructions (including, but not limited to, - read/write/atomics and HIP's __shfl instructions) executed per normalization - unit. - rst: The total number of LDS instructions (including, but not limited to, read/write/atomics - and HIP's ``__shfl`` instructions) executed per :ref:`normalization unit - <normalization-units>`. - unit: Instructions per normalization unit - LDS Latency: - plain: The average number of round-trip cycles (i.e., from issue to data-return - acknowledgment) required for an LDS instruction to complete. - rst: The average number of round-trip cycles (i.e., from issue to data-return - acknowledgment) required for an LDS instruction to complete. - unit: Cycles - Bank Conflicts/Access: - plain: The ratio of the number of cycles spent in the LDS scheduler due to bank - conflicts (as determined by the conflict resolution hardware) to the base - number of cycles that would be spent in the LDS scheduler in a completely - uncontended case. This is the unnormalized form of the Bank Conflict Rate. - rst: The ratio of the number of cycles spent in the :ref:`LDS scheduler <desc-lds>` - due to bank conflicts (as determined by the conflict resolution hardware) - to the base number of cycles that would be spent in the LDS scheduler in - a completely uncontended case. 
This is the unnormalized form of the Bank - Conflict Rate. - unit: Conflicts per Access - Index Accesses: - plain: The total number of cycles spent in the LDS scheduler over all operations - per normalization unit. - rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` over - all operations per :ref:`normalization unit <normalization-units>`. - unit: Cycles per normalization unit - Atomic Return Cycles: - plain: The total number of cycles spent on LDS atomics with return per normalization - unit. - rst: The total number of cycles spent on LDS atomics with return per :ref:`normalization - unit <normalization-units>`. - unit: Cycles per normalization unit - Bank Conflict: - plain: The total number of cycles spent in the LDS scheduler due to bank conflicts - (as determined by the conflict resolution hardware) per normalization unit. - rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due - to bank conflicts (as determined by the conflict resolution hardware) per - :ref:`normalization unit <normalization-units>`. - unit: Cycles per normalization unit - Addr Conflict: - plain: The total number of cycles spent in the LDS scheduler due to address - conflicts (as determined by the conflict resolution hardware) per normalization - unit. - rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due - to address conflicts (as determined by the conflict resolution hardware) - per :ref:`normalization unit <normalization-units>`. - unit: Cycles per normalization unit - Unaligned Stall: - plain: The total number of cycles spent in the LDS scheduler due to stalls from - non-dword aligned addresses per normalization unit. - rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>` due - to stalls from non-dword aligned addresses per :ref:`normalization unit <normalization-units>`. 
- unit: Cycles per normalization unit - Mem Violations: - plain: |- - The total number of out-of-bounds accesses made to the LDS, per normalization - unit. This is unused and expected to be zero in most configurations for - modern CDNA\u2122 accelerators. - rst: |- - The total number of out-of-bounds accesses made to the LDS, per :ref:`normalization - unit <normalization-units>`. This is unused and expected to be zero in - most configurations for modern CDNA\u2122 accelerators. - unit: Accesses per normalization unit -- id: 1300 - title: Instruction Cache - data source: - - metric_table: - id: 1301 - title: L1I Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - metric: - gfx90a: - Bandwidth Utilization: - value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * - (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - Cache Hit Rate: - value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - unit: Pct of Peak - L1I-L2 Bandwidth Utilization: - value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu) - * (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - gfx941: - Bandwidth Utilization: - value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * - (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - Cache Hit Rate: - value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - unit: Pct of Peak - L1I-L2 Bandwidth Utilization: - value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu) - * (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - gfx940: - Bandwidth Utilization: - value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * - (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - Cache Hit Rate: - value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - unit: Pct of Peak - L1I-L2 
Bandwidth Utilization: - value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu) - * (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - gfx942: - Bandwidth Utilization: - value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * - (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - Cache Hit Rate: - value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - unit: Pct of Peak - L1I-L2 Bandwidth Utilization: - value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu) - * (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - gfx950: - Bandwidth Utilization: - value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * - (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - Cache Hit Rate: - value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - unit: Pct of Peak - L1I-L2 Bandwidth Utilization: - value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu) - * (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - gfx908: - Bandwidth Utilization: - value: AVG(((SQC_ICACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * - (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - Cache Hit Rate: - value: AVG(((SQC_ICACHE_HITS * 100) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - unit: Pct of Peak - L1I-L2 Bandwidth Utilization: - value: AVG(((SQC_TC_INST_REQ * 100000) / (2 * ($max_sclk * $sqc_per_gpu) - * (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - comparable: false - cli_style: simple_bar - tui_style: simple_bar - - metric_table: - id: 1302 - title: L1I cache accesses - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - metric: - gfx90a: - Req: - avg: AVG((SQC_ICACHE_REQ / $denom)) - min: MIN((SQC_ICACHE_REQ / $denom)) - max: MAX((SQC_ICACHE_REQ / $denom)) - unit: (Req + $normUnit) - Hits: - avg: 
AVG((SQC_ICACHE_HITS / $denom)) - min: MIN((SQC_ICACHE_HITS / $denom)) - max: MAX((SQC_ICACHE_HITS / $denom)) - unit: (Hits + $normUnit) - Misses - Non Duplicated: - avg: AVG((SQC_ICACHE_MISSES / $denom)) - min: MIN((SQC_ICACHE_MISSES / $denom)) - max: MAX((SQC_ICACHE_MISSES / $denom)) - unit: (Misses + $normUnit) - Misses - Duplicated: - avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - unit: (Misses + $normUnit) - Cache Hit Rate: - avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - unit: pct - Instruction Fetch Latency: - avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - unit: Cycles - coll_level: SQ_IFETCH_LEVEL - gfx941: - Req: - avg: AVG((SQC_ICACHE_REQ / $denom)) - min: MIN((SQC_ICACHE_REQ / $denom)) - max: MAX((SQC_ICACHE_REQ / $denom)) - unit: (Req + $normUnit) - Hits: - avg: AVG((SQC_ICACHE_HITS / $denom)) - min: MIN((SQC_ICACHE_HITS / $denom)) - max: MAX((SQC_ICACHE_HITS / $denom)) - unit: (Hits + $normUnit) - Misses - Non Duplicated: - avg: AVG((SQC_ICACHE_MISSES / $denom)) - min: MIN((SQC_ICACHE_MISSES / $denom)) - max: MAX((SQC_ICACHE_MISSES / $denom)) - unit: (Misses + $normUnit) - Misses - Duplicated: - avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - unit: (Misses + $normUnit) - Cache Hit Rate: - avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) 
- + SQC_ICACHE_MISSES_DUPLICATE))) - max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - unit: pct - Instruction Fetch Latency: - avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - unit: Cycles - coll_level: SQ_IFETCH_LEVEL - gfx940: - Req: - avg: AVG((SQC_ICACHE_REQ / $denom)) - min: MIN((SQC_ICACHE_REQ / $denom)) - max: MAX((SQC_ICACHE_REQ / $denom)) - unit: (Req + $normUnit) - Hits: - avg: AVG((SQC_ICACHE_HITS / $denom)) - min: MIN((SQC_ICACHE_HITS / $denom)) - max: MAX((SQC_ICACHE_HITS / $denom)) - unit: (Hits + $normUnit) - Misses - Non Duplicated: - avg: AVG((SQC_ICACHE_MISSES / $denom)) - min: MIN((SQC_ICACHE_MISSES / $denom)) - max: MAX((SQC_ICACHE_MISSES / $denom)) - unit: (Misses + $normUnit) - Misses - Duplicated: - avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - unit: (Misses + $normUnit) - Cache Hit Rate: - avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - unit: pct - Instruction Fetch Latency: - avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - unit: Cycles - coll_level: SQ_IFETCH_LEVEL - gfx942: - Req: - avg: AVG((SQC_ICACHE_REQ / $denom)) - min: MIN((SQC_ICACHE_REQ / $denom)) - max: MAX((SQC_ICACHE_REQ / $denom)) - unit: (Req + $normUnit) - Hits: - avg: AVG((SQC_ICACHE_HITS / $denom)) - min: MIN((SQC_ICACHE_HITS / $denom)) - max: MAX((SQC_ICACHE_HITS / $denom)) - unit: (Hits + $normUnit) - Misses - Non Duplicated: - avg: 
AVG((SQC_ICACHE_MISSES / $denom)) - min: MIN((SQC_ICACHE_MISSES / $denom)) - max: MAX((SQC_ICACHE_MISSES / $denom)) - unit: (Misses + $normUnit) - Misses - Duplicated: - avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - unit: (Misses + $normUnit) - Cache Hit Rate: - avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - unit: pct - Instruction Fetch Latency: - avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - unit: Cycles - coll_level: SQ_IFETCH_LEVEL - gfx950: - Req: - avg: AVG((SQC_ICACHE_REQ / $denom)) - min: MIN((SQC_ICACHE_REQ / $denom)) - max: MAX((SQC_ICACHE_REQ / $denom)) - unit: (Req + $normUnit) - Hits: - avg: AVG((SQC_ICACHE_HITS / $denom)) - min: MIN((SQC_ICACHE_HITS / $denom)) - max: MAX((SQC_ICACHE_HITS / $denom)) - unit: (Hits + $normUnit) - Misses - Non Duplicated: - avg: AVG((SQC_ICACHE_MISSES / $denom)) - min: MIN((SQC_ICACHE_MISSES / $denom)) - max: MAX((SQC_ICACHE_MISSES / $denom)) - unit: (Misses + $normUnit) - Misses - Duplicated: - avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - unit: (Misses + $normUnit) - Cache Hit Rate: - avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - unit: pct - 
Instruction Fetch Latency: - avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - unit: Cycles - coll_level: SQ_IFETCH_LEVEL - gfx908: - Req: - avg: AVG((SQC_ICACHE_REQ / $denom)) - min: MIN((SQC_ICACHE_REQ / $denom)) - max: MAX((SQC_ICACHE_REQ / $denom)) - unit: (Req + $normUnit) - Hits: - avg: AVG((SQC_ICACHE_HITS / $denom)) - min: MIN((SQC_ICACHE_HITS / $denom)) - max: MAX((SQC_ICACHE_HITS / $denom)) - unit: (Hits + $normUnit) - Misses - Non Duplicated: - avg: AVG((SQC_ICACHE_MISSES / $denom)) - min: MIN((SQC_ICACHE_MISSES / $denom)) - max: MAX((SQC_ICACHE_MISSES / $denom)) - unit: (Misses + $normUnit) - Misses - Duplicated: - avg: AVG((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - min: MIN((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - max: MAX((SQC_ICACHE_MISSES_DUPLICATE / $denom)) - unit: (Misses + $normUnit) - Cache Hit Rate: - avg: AVG(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - min: MIN(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - max: MAX(((100 * SQC_ICACHE_HITS) / ((SQC_ICACHE_HITS + SQC_ICACHE_MISSES) - + SQC_ICACHE_MISSES_DUPLICATE))) - unit: pct - Instruction Fetch Latency: - avg: AVG((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - min: MIN((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - max: MAX((SQ_ACCUM_PREV_HIRES / SQ_IFETCH)) - unit: Cycles - coll_level: SQ_IFETCH_LEVEL - - metric_table: - id: 1303 - title: L1I <-> L2 interface - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - metric: - gfx90a: - L1I-L2 Bandwidth: - avg: AVG(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp))) - min: MIN(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp))) - max: MAX(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp))) - unit: Gbps - gfx941: - L1I-L2 Bandwidth: - avg: AVG(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp))) - min: 
MIN(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp))) - max: MAX(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp))) - unit: Gbps - gfx940: - L1I-L2 Bandwidth: - avg: AVG(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp))) - min: MIN(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp))) - max: MAX(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp))) - unit: Gbps - gfx942: - L1I-L2 Bandwidth: - avg: AVG(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp))) - min: MIN(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp))) - max: MAX(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp))) - unit: Gbps - gfx950: - L1I-L2 Bandwidth: - avg: AVG(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp))) - min: MIN(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp))) - max: MAX(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp))) - unit: Gbps - gfx908: - L1I-L2 Bandwidth: - avg: AVG(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp))) - min: MIN(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp))) - max: MAX(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp))) - unit: Gbps - metrics_description: - Bandwidth Utilization: - plain: The number of bytes looked up in the L1I cache, as a percent of the peak - theoretical bandwidth. Calculated as the ratio of L1I requests over the total - L1I cycles. - rst: The number of bytes looked up in the L1I cache, as a percent of the peak theoretical - bandwidth. Calculated as the ratio of L1I requests over the :ref:`total L1I - cycles <total-l1i-cycles>`. - unit: Percent - Cache Hit Rate: - plain: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded - line the cache. Calculated as the ratio of the number of L1I requests that - hit over the number of all L1I requests. - rst: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded line - the cache. 
Calculated as the ratio of the number of L1I requests that hit - over the number of all L1I requests. - unit: Percent - L1I-L2 Bandwidth Utilization: - plain: |- - The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth - achieved. Calculated as the ratio of the total number of requests from the - L1I to the L2 cache over the total L1I-L2 interface cycles. - rst: |- - The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth - achieved. Calculated as the ratio of the total number of requests from - the L1I to the L2 cache over the :ref:`total L1I-L2 interface cycles <total-l1i-cycles>`. - unit: Percent - L1I-L2 Bandwidth: - plain: Total number of bytes transferred across L1I - L2 interface divided by total duration. - rst: Total number of bytes transferred across L1I - L2 interface divided by total duration. - unit: Gbps - Req: - plain: The total number of requests made to the L1I per normalization-unit - rst: The total number of requests made to the L1I per normalization-unit - unit: Requests per normalization unit - Hits: - plain: The total number of L1I requests that hit on a previously loaded cache - line, per normalization-unit. - rst: The total number of L1I requests that hit on a previously loaded cache line, - per :ref:`normalization-unit <normalization-units>`. - unit: Requests per normalization unit - Misses - Non Duplicated: - plain: The total number of L1I requests that missed on a cache line that were - not already pending due to another request, per normalization-unit. - rst: The total number of L1I requests that missed on a cache line that *were - not* already pending due to another request, per :ref:`normalization-unit - <normalization-units>`. See note in :ref:`desc-l1i-sol` for more detail. - unit: Requests per normalization unit - Misses - Duplicated: - plain: The total number of L1I requests that missed on a cache line that were - already pending due to another request, per normalization-unit. 
- rst: The total number of L1I requests that missed on a cache line that *were* already - pending due to another request, per :ref:`normalization-unit <normalization-units>`. - See note in :ref:`desc-l1i-sol` for more detail. - unit: Requests per normalization unit - Instruction Fetch Latency: - plain: The average number of cycles spent to fetch instructions to a CU. - rst: The average number of cycles spent to fetch instructions to a :doc:`CU - <compute-unit>`. - unit: Cycles -- id: 1400 - title: Scalar L1 Data Cache - data source: - - metric_table: - id: 1401 - title: Scalar L1D Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - metric: - gfx90a: - Bandwidth Utilization: - value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * - (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - Cache Hit Rate: - value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES - + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - unit: Pct of Peak - sL1D-L2 BW Utilization: - value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) - * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp))) - unit: Pct of Peak - gfx941: - Bandwidth Utilization: - value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * - (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - Cache Hit Rate: - value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES - + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - unit: Pct of Peak - sL1D-L2 BW Utilization: - value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) - * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp))) - unit: Pct of Peak - gfx940: - Bandwidth Utilization: - value: AVG(((SQC_DCACHE_REQ * 100000) / 
(($max_sclk * $sqc_per_gpu) * - (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - Cache Hit Rate: - value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES - + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - unit: Pct of Peak - sL1D-L2 BW Utilization: - value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) - * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp))) - unit: Pct of Peak - gfx942: - Bandwidth Utilization: - value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * - (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - Cache Hit Rate: - value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES - + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - unit: Pct of Peak - sL1D-L2 BW Utilization: - value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) - * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp))) - unit: Pct of Peak - gfx950: - Bandwidth Utilization: - value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * - (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - Cache Hit Rate: - value: AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES - + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - unit: Pct of Peak - sL1D-L2 BW Utilization: - value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) - * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp))) - unit: Pct of Peak - gfx908: - Bandwidth Utilization: - value: AVG(((SQC_DCACHE_REQ * 100000) / (($max_sclk * $sqc_per_gpu) * - (End_Timestamp - Start_Timestamp)))) - unit: Pct of Peak - Cache Hit Rate: - value: 
AVG((((SQC_DCACHE_HITS * 100) / (SQC_DCACHE_HITS + SQC_DCACHE_MISSES - + SQC_DCACHE_MISSES_DUPLICATE)) if ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - unit: Pct of Peak - sL1D-L2 BW Utilization: - value: AVG(((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) - * 100000) / (2 * ($max_sclk * $sqc_per_gpu) * (End_Timestamp - Start_Timestamp))) - unit: Pct of Peak - comparable: false - cli_style: simple_bar - tui_style: simple_bar - - metric_table: - id: 1402 - title: Scalar L1D cache accesses - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - metric: - gfx90a: - Req: - avg: AVG((SQC_DCACHE_REQ / $denom)) - min: MIN((SQC_DCACHE_REQ / $denom)) - max: MAX((SQC_DCACHE_REQ / $denom)) - unit: (Req + $normUnit) - Hits: - avg: AVG((SQC_DCACHE_HITS / $denom)) - min: MIN((SQC_DCACHE_HITS / $denom)) - max: MAX((SQC_DCACHE_HITS / $denom)) - unit: (Req + $normUnit) - Misses - Non Duplicated: - avg: AVG((SQC_DCACHE_MISSES / $denom)) - min: MIN((SQC_DCACHE_MISSES / $denom)) - max: MAX((SQC_DCACHE_MISSES / $denom)) - unit: (Req + $normUnit) - Misses- Duplicated: - avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - unit: (Req + $normUnit) - Cache Hit Rate: - avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - unit: pct - Read Req (Total): 
- avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) - min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) - max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) - unit: (Req + $normUnit) - Atomic Req: - avg: AVG((SQC_DCACHE_ATOMIC / $denom)) - min: MIN((SQC_DCACHE_ATOMIC / $denom)) - max: MAX((SQC_DCACHE_ATOMIC / $denom)) - unit: (Req + $normUnit) - Read Req (1 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_1 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_1 / $denom)) - unit: (Req + $normUnit) - Read Req (2 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_2 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_2 / $denom)) - unit: (Req + $normUnit) - Read Req (4 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_4 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_4 / $denom)) - unit: (Req + $normUnit) - Read Req (8 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_8 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_8 / $denom)) - unit: (Req + $normUnit) - Read Req (16 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_16 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_16 / $denom)) - unit: (Req + $normUnit) - gfx941: - Req: - avg: AVG((SQC_DCACHE_REQ / $denom)) - min: MIN((SQC_DCACHE_REQ / $denom)) - max: MAX((SQC_DCACHE_REQ / $denom)) - unit: (Req + $normUnit) - Hits: - avg: AVG((SQC_DCACHE_HITS / $denom)) - min: MIN((SQC_DCACHE_HITS / $denom)) - max: MAX((SQC_DCACHE_HITS / $denom)) - unit: (Req + $normUnit) - Misses - Non Duplicated: - avg: AVG((SQC_DCACHE_MISSES / $denom)) - min: MIN((SQC_DCACHE_MISSES / $denom)) - max: 
MAX((SQC_DCACHE_MISSES / $denom)) - unit: (Req + $normUnit) - Misses- Duplicated: - avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - unit: (Req + $normUnit) - Cache Hit Rate: - avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - unit: pct - Read Req (Total): - avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) - min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) - max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) - unit: (Req + $normUnit) - Atomic Req: - avg: AVG((SQC_DCACHE_ATOMIC / $denom)) - min: MIN((SQC_DCACHE_ATOMIC / $denom)) - max: MAX((SQC_DCACHE_ATOMIC / $denom)) - unit: (Req + $normUnit) - Read Req (1 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_1 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_1 / $denom)) - unit: (Req + $normUnit) - Read Req (2 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_2 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_2 / $denom)) - unit: (Req + $normUnit) - Read Req (4 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_4 / 
$denom)) - min: MIN((SQC_DCACHE_REQ_READ_4 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_4 / $denom)) - unit: (Req + $normUnit) - Read Req (8 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_8 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_8 / $denom)) - unit: (Req + $normUnit) - Read Req (16 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_16 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_16 / $denom)) - unit: (Req + $normUnit) - gfx940: - Req: - avg: AVG((SQC_DCACHE_REQ / $denom)) - min: MIN((SQC_DCACHE_REQ / $denom)) - max: MAX((SQC_DCACHE_REQ / $denom)) - unit: (Req + $normUnit) - Hits: - avg: AVG((SQC_DCACHE_HITS / $denom)) - min: MIN((SQC_DCACHE_HITS / $denom)) - max: MAX((SQC_DCACHE_HITS / $denom)) - unit: (Req + $normUnit) - Misses - Non Duplicated: - avg: AVG((SQC_DCACHE_MISSES / $denom)) - min: MIN((SQC_DCACHE_MISSES / $denom)) - max: MAX((SQC_DCACHE_MISSES / $denom)) - unit: (Req + $normUnit) - Misses- Duplicated: - avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - unit: (Req + $normUnit) - Cache Hit Rate: - avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - unit: pct - Read Req (Total): - avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) 
- min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) - max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) - unit: (Req + $normUnit) - Atomic Req: - avg: AVG((SQC_DCACHE_ATOMIC / $denom)) - min: MIN((SQC_DCACHE_ATOMIC / $denom)) - max: MAX((SQC_DCACHE_ATOMIC / $denom)) - unit: (Req + $normUnit) - Read Req (1 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_1 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_1 / $denom)) - unit: (Req + $normUnit) - Read Req (2 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_2 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_2 / $denom)) - unit: (Req + $normUnit) - Read Req (4 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_4 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_4 / $denom)) - unit: (Req + $normUnit) - Read Req (8 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_8 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_8 / $denom)) - unit: (Req + $normUnit) - Read Req (16 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_16 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_16 / $denom)) - unit: (Req + $normUnit) - gfx942: - Req: - avg: AVG((SQC_DCACHE_REQ / $denom)) - min: MIN((SQC_DCACHE_REQ / $denom)) - max: MAX((SQC_DCACHE_REQ / $denom)) - unit: (Req + $normUnit) - Hits: - avg: AVG((SQC_DCACHE_HITS / $denom)) - min: MIN((SQC_DCACHE_HITS / $denom)) - max: MAX((SQC_DCACHE_HITS / $denom)) - unit: (Req + $normUnit) - Misses - Non Duplicated: - avg: AVG((SQC_DCACHE_MISSES / $denom)) - min: MIN((SQC_DCACHE_MISSES / $denom)) - max: MAX((SQC_DCACHE_MISSES / $denom)) - unit: (Req + $normUnit) - Misses- Duplicated: - avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - min: 
MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - unit: (Req + $normUnit) - Cache Hit Rate: - avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - unit: pct - Read Req (Total): - avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) - min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) - max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) - unit: (Req + $normUnit) - Atomic Req: - avg: AVG((SQC_DCACHE_ATOMIC / $denom)) - min: MIN((SQC_DCACHE_ATOMIC / $denom)) - max: MAX((SQC_DCACHE_ATOMIC / $denom)) - unit: (Req + $normUnit) - Read Req (1 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_1 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_1 / $denom)) - unit: (Req + $normUnit) - Read Req (2 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_2 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_2 / $denom)) - unit: (Req + $normUnit) - Read Req (4 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_4 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_4 / $denom)) - unit: (Req + $normUnit) - Read Req (8 
DWord): - avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_8 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_8 / $denom)) - unit: (Req + $normUnit) - Read Req (16 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_16 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_16 / $denom)) - unit: (Req + $normUnit) - gfx950: - Req: - avg: AVG((SQC_DCACHE_REQ / $denom)) - min: MIN((SQC_DCACHE_REQ / $denom)) - max: MAX((SQC_DCACHE_REQ / $denom)) - unit: (Req + $normUnit) - Hits: - avg: AVG((SQC_DCACHE_HITS / $denom)) - min: MIN((SQC_DCACHE_HITS / $denom)) - max: MAX((SQC_DCACHE_HITS / $denom)) - unit: (Req + $normUnit) - Misses - Non Duplicated: - avg: AVG((SQC_DCACHE_MISSES / $denom)) - min: MIN((SQC_DCACHE_MISSES / $denom)) - max: MAX((SQC_DCACHE_MISSES / $denom)) - unit: (Req + $normUnit) - Misses- Duplicated: - avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - unit: (Req + $normUnit) - Cache Hit Rate: - avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - unit: pct - Read Req (Total): - avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) - min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + 
SQC_DCACHE_REQ_READ_16) / $denom)) - max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) - unit: (Req + $normUnit) - Atomic Req: - avg: AVG((SQC_DCACHE_ATOMIC / $denom)) - min: MIN((SQC_DCACHE_ATOMIC / $denom)) - max: MAX((SQC_DCACHE_ATOMIC / $denom)) - unit: (Req + $normUnit) - Read Req (1 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_1 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_1 / $denom)) - unit: (Req + $normUnit) - Read Req (2 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_2 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_2 / $denom)) - unit: (Req + $normUnit) - Read Req (4 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_4 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_4 / $denom)) - unit: (Req + $normUnit) - Read Req (8 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_8 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_8 / $denom)) - unit: (Req + $normUnit) - Read Req (16 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_16 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_16 / $denom)) - unit: (Req + $normUnit) - gfx908: - Req: - avg: AVG((SQC_DCACHE_REQ / $denom)) - min: MIN((SQC_DCACHE_REQ / $denom)) - max: MAX((SQC_DCACHE_REQ / $denom)) - unit: (Req + $normUnit) - Hits: - avg: AVG((SQC_DCACHE_HITS / $denom)) - min: MIN((SQC_DCACHE_HITS / $denom)) - max: MAX((SQC_DCACHE_HITS / $denom)) - unit: (Req + $normUnit) - Misses - Non Duplicated: - avg: AVG((SQC_DCACHE_MISSES / $denom)) - min: MIN((SQC_DCACHE_MISSES / $denom)) - max: MAX((SQC_DCACHE_MISSES / $denom)) - unit: (Req + $normUnit) - Misses- Duplicated: - avg: AVG((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - min: MIN((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - max: MAX((SQC_DCACHE_MISSES_DUPLICATE / $denom)) - unit: (Req + $normUnit) - Cache Hit Rate: - 
avg: AVG((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - min: MIN((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - max: MAX((((100 * SQC_DCACHE_HITS) / ((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE)) if (((SQC_DCACHE_HITS + SQC_DCACHE_MISSES) - + SQC_DCACHE_MISSES_DUPLICATE) != 0) else None)) - unit: pct - Read Req (Total): - avg: AVG((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) - min: MIN((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) - max: MAX((((((SQC_DCACHE_REQ_READ_1 + SQC_DCACHE_REQ_READ_2) + SQC_DCACHE_REQ_READ_4) - + SQC_DCACHE_REQ_READ_8) + SQC_DCACHE_REQ_READ_16) / $denom)) - unit: (Req + $normUnit) - Atomic Req: - avg: AVG((SQC_DCACHE_ATOMIC / $denom)) - min: MIN((SQC_DCACHE_ATOMIC / $denom)) - max: MAX((SQC_DCACHE_ATOMIC / $denom)) - unit: (Req + $normUnit) - Read Req (1 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_1 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_1 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_1 / $denom)) - unit: (Req + $normUnit) - Read Req (2 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_2 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_2 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_2 / $denom)) - unit: (Req + $normUnit) - Read Req (4 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_4 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_4 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_4 / $denom)) - unit: (Req + $normUnit) - Read Req (8 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_8 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_8 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_8 / $denom)) 
- unit: (Req + $normUnit) - Read Req (16 DWord): - avg: AVG((SQC_DCACHE_REQ_READ_16 / $denom)) - min: MIN((SQC_DCACHE_REQ_READ_16 / $denom)) - max: MAX((SQC_DCACHE_REQ_READ_16 / $denom)) - unit: (Req + $normUnit) - - metric_table: - id: 1403 - title: Scalar L1D Cache - L2 Interface - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - metric: - gfx90a: - sL1D-L2 BW: - avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) - * 64)) / (End_Timestamp - Start_Timestamp))) - min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) - * 64)) / (End_Timestamp - Start_Timestamp))) - max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) - * 64)) / (End_Timestamp - Start_Timestamp))) - unit: Gbps - Read Req: - avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) - min: MIN((SQC_TC_DATA_READ_REQ / $denom)) - max: MAX((SQC_TC_DATA_READ_REQ / $denom)) - unit: (Req + $normUnit) - Write Req: - avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom)) - min: MIN((SQC_TC_DATA_WRITE_REQ / $denom)) - max: MAX((SQC_TC_DATA_WRITE_REQ / $denom)) - unit: (Req + $normUnit) - Atomic Req: - avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)) - min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom)) - max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom)) - unit: (Req + $normUnit) - Stall Cycles: - avg: AVG((SQC_TC_STALL / $denom)) - min: MIN((SQC_TC_STALL / $denom)) - max: MAX((SQC_TC_STALL / $denom)) - unit: (Cycles + $normUnit) - gfx941: - sL1D-L2 BW: - avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) - * 64)) / (End_Timestamp - Start_Timestamp))) - min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) - * 64)) / (End_Timestamp - Start_Timestamp))) - max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) - * 64)) / (End_Timestamp - Start_Timestamp))) - unit: Gbps - Read Req: - avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) - min: 
MIN((SQC_TC_DATA_READ_REQ / $denom)) - max: MAX((SQC_TC_DATA_READ_REQ / $denom)) - unit: (Req + $normUnit) - Write Req: - avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom)) - min: MIN((SQC_TC_DATA_WRITE_REQ / $denom)) - max: MAX((SQC_TC_DATA_WRITE_REQ / $denom)) - unit: (Req + $normUnit) - Atomic Req: - avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)) - min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom)) - max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom)) - unit: (Req + $normUnit) - Stall Cycles: - avg: AVG((SQC_TC_STALL / $denom)) - min: MIN((SQC_TC_STALL / $denom)) - max: MAX((SQC_TC_STALL / $denom)) - unit: (Cycles + $normUnit) - gfx940: - sL1D-L2 BW: - avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) - * 64)) / (End_Timestamp - Start_Timestamp))) - min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) - * 64)) / (End_Timestamp - Start_Timestamp))) - max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) - * 64)) / (End_Timestamp - Start_Timestamp))) - unit: Gbps - Read Req: - avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) - min: MIN((SQC_TC_DATA_READ_REQ / $denom)) - max: MAX((SQC_TC_DATA_READ_REQ / $denom)) - unit: (Req + $normUnit) - Write Req: - avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom)) - min: MIN((SQC_TC_DATA_WRITE_REQ / $denom)) - max: MAX((SQC_TC_DATA_WRITE_REQ / $denom)) - unit: (Req + $normUnit) - Atomic Req: - avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)) - min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom)) - max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom)) - unit: (Req + $normUnit) - Stall Cycles: - avg: AVG((SQC_TC_STALL / $denom)) - min: MIN((SQC_TC_STALL / $denom)) - max: MAX((SQC_TC_STALL / $denom)) - unit: (Cycles + $normUnit) - gfx942: - sL1D-L2 BW: - avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) - * 64)) / (End_Timestamp - Start_Timestamp))) - min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) - * 64)) / (End_Timestamp - 
Start_Timestamp))) - max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) - * 64)) / (End_Timestamp - Start_Timestamp))) - unit: Gbps - Read Req: - avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) - min: MIN((SQC_TC_DATA_READ_REQ / $denom)) - max: MAX((SQC_TC_DATA_READ_REQ / $denom)) - unit: (Req + $normUnit) - Write Req: - avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom)) - min: MIN((SQC_TC_DATA_WRITE_REQ / $denom)) - max: MAX((SQC_TC_DATA_WRITE_REQ / $denom)) - unit: (Req + $normUnit) - Atomic Req: - avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)) - min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom)) - max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom)) - unit: (Req + $normUnit) - Stall Cycles: - avg: AVG((SQC_TC_STALL / $denom)) - min: MIN((SQC_TC_STALL / $denom)) - max: MAX((SQC_TC_STALL / $denom)) - unit: (Cycles + $normUnit) - gfx950: - sL1D-L2 BW: - avg: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) - * 64)) / (End_Timestamp - Start_Timestamp))) - min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) - * 64)) / (End_Timestamp - Start_Timestamp))) - max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) - * 64)) / (End_Timestamp - Start_Timestamp))) - unit: Gbps - Read Req: - avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) - min: MIN((SQC_TC_DATA_READ_REQ / $denom)) - max: MAX((SQC_TC_DATA_READ_REQ / $denom)) - unit: (Req + $normUnit) - Write Req: - avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom)) - min: MIN((SQC_TC_DATA_WRITE_REQ / $denom)) - max: MAX((SQC_TC_DATA_WRITE_REQ / $denom)) - unit: (Req + $normUnit) - Atomic Req: - avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)) - min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom)) - max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom)) - unit: (Req + $normUnit) - Stall Cycles: - avg: AVG((SQC_TC_STALL / $denom)) - min: MIN((SQC_TC_STALL / $denom)) - max: MAX((SQC_TC_STALL / $denom)) - unit: (Cycles + $normUnit) - gfx908: - sL1D-L2 BW: - avg: 
AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) - * 64)) / (End_Timestamp - Start_Timestamp))) - min: MIN(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) - * 64)) / (End_Timestamp - Start_Timestamp))) - max: MAX(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ) - * 64)) / (End_Timestamp - Start_Timestamp))) - unit: Gbps - Read Req: - avg: AVG((SQC_TC_DATA_READ_REQ / $denom)) - min: MIN((SQC_TC_DATA_READ_REQ / $denom)) - max: MAX((SQC_TC_DATA_READ_REQ / $denom)) - unit: (Req + $normUnit) - Write Req: - avg: AVG((SQC_TC_DATA_WRITE_REQ / $denom)) - min: MIN((SQC_TC_DATA_WRITE_REQ / $denom)) - max: MAX((SQC_TC_DATA_WRITE_REQ / $denom)) - unit: (Req + $normUnit) - Atomic Req: - avg: AVG((SQC_TC_DATA_ATOMIC_REQ / $denom)) - min: MIN((SQC_TC_DATA_ATOMIC_REQ / $denom)) - max: MAX((SQC_TC_DATA_ATOMIC_REQ / $denom)) - unit: (Req + $normUnit) - Stall Cycles: - avg: AVG((SQC_TC_STALL / $denom)) - min: MIN((SQC_TC_STALL / $denom)) - max: MAX((SQC_TC_STALL / $denom)) - unit: (Cycles + $normUnit) - metrics_description: - Bandwidth Utilization: - plain: The number of bytes looked up in the sL1D cache, as a percent of the - peak theoretical bandwidth. Calculated as the ratio of sL1D requests over - the total sL1D cycles. - rst: The number of bytes looked up in the sL1D cache, as a percent of the peak theoretical - bandwidth. Calculated as the ratio of sL1D requests over the :ref:`total - sL1D cycles <total-sl1d-cycles>`. - unit: Percent - Cache Hit Rate: - plain: Indicates the percent of sL1D requests that hit on a previously loaded - line the cache. The ratio of the number of sL1D requests that hit over the - number of all sL1D requests. - rst: Indicates the percent of sL1D requests that hit on a previously loaded line - the cache. The ratio of the number of sL1D requests that hit [#sl1d-cache]_ - over the number of all sL1D requests. 
- unit: Percent - sL1D-L2 BW Utilization: - plain: The percentage of the peak theoretical sL1D - L2 interface bandwidth acheived. - Calculated as total number of bytes read from, written to, or atomically updated - across the sL1D - L2 interface. - rst: The percentage of the peak theoretical sL1D - L2 interface bandwidth acheived. - Calculated as total number of bytes read from, written to, or atomically updated - across the sL1D - L2 interface. - unit: Percent - sL1D-L2 BW: - plain: |- - The total number of bytes read from, written to, or atomically updated - across the sL1D\u2194L2 interface, divided by total duration. Note that sL1D - writes and atomics are typically unused on current CDNA accelerators, so - in the majority of cases this can be interpreted as an sL1D\u2192L2 read - bandwidth. - rst: |- - The total number of bytes read from, written to, or atomically updated - across the sL1D\u2194:doc:`L2 <l2-cache>` interface, divided by total duration. - Note that sL1D writes and atomics are typically - unused on current CDNA accelerators, so in the majority of cases this can - be interpreted as an sL1D\u2192L2 read bandwidth. - unit: Gbps - Req: - plain: The total number of requests, of any size or type, made to the sL1D per - normalization unit. - rst: The total number of requests, of any size or type, made to the sL1D per :ref:`normalization - unit <normalization-units>`. - unit: Requests per normalization unit - Hits: - plain: The total number of sL1D requests that hit on a previously loaded cache - line, per normalization unit. - rst: The total number of sL1D requests that hit on a previously loaded cache line, - per :ref:`normalization unit <normalization-units>`. - unit: Requests per normalization unit - Misses - Non Duplicated: - plain: |- - The total number of sL1D requests that missed on a cache line that was - not already pending due to another request, per normalization unit. 
- rst: The total number of sL1D requests that missed on a cache line that *was not* - already pending due to another request, per :ref:`normalization unit <normalization-units>`. - See :ref:`desc-sl1d-sol` for more detail. - unit: Requests per normalization unit - Misses- Duplicated: - plain: The total number of sL1D requests that missed on a cache line that was - already pending due to another request, per normalization unit. - rst: The total number of sL1D requests that missed on a cache line that *was* already - pending due to another request, per :ref:`normalization unit <normalization-units>`. - See :ref:`desc-sl1d-sol` for more detail. - unit: Requests per normalization unit - Read Req (Total): - plain: The total number of sL1D read requests of any size, per normalization - unit. - rst: The total number of sL1D read requests of any size, per :ref:`normalization - unit <normalization-units>`. - unit: Requests per normalization unit - Atomic Req: - plain: The total number of atomic requests from sL1D to the L2, per normalization - unit. Typically unused on current CDNA accelerators. - rst: The total number of atomic requests from sL1D to the :doc:`L2 <l2-cache>`, - per :ref:`normalization unit <normalization-units>`. Typically unused on current - CDNA accelerators. - unit: Requests per normalization unit - Read Req (1 DWord): - plain: The total number of sL1D read requests made for a single dword of data - (4B), per normalization unit. - rst: The total number of sL1D read requests made for a single dword of data (4B), - per :ref:`normalization unit <normalization-units>`. - unit: Requests per normalization unit - Read Req (2 DWord): - plain: The total number of sL1D read requests made for a two dwords of data - (8B), per normalization unit. - rst: The total number of sL1D read requests made for a two dwords of data (8B), - per :ref:`normalization unit <normalization-units>`. 
- unit: Requests per normalization unit - Read Req (4 DWord): - plain: The total number of sL1D read requests made for a four dwords of data - (16B), per normalization unit. - rst: The total number of sL1D read requests made for a four dwords of data (16B), - per :ref:`normalization unit <normalization-units>`. - unit: Requests per normalization unit - Read Req (8 DWord): - plain: The total number of sL1D read requests made for a eight dwords of data - (32B), per normalization unit. - rst: The total number of sL1D read requests made for a eight dwords of data (32B), - per :ref:`normalization unit <normalization-units>`. - unit: Requests per normalization unit - Read Req (16 DWord): - plain: The total number of sL1D read requests made for a sixteen dwords of data - (64B), per normalization unit. - rst: The total number of sL1D read requests made for a sixteen dwords of data (64B), - per :ref:`normalization unit <normalization-units>`. - unit: Requests per normalization unit - Read Req: - plain: The total number of read requests from sL1D to the L2 per normalization - unit. - rst: The total number of read requests from sL1D to the :doc:`L2 <l2-cache>`, per - :ref:`normalization unit <normalization-units>`. - unit: Requests per normalization unit - Write Req: - plain: The total number of write requests from sL1D to the L2, per normalization - unit. Typically unused on current CDNA accelerators. - rst: The total number of write requests from sL1D to the :doc:`L2 <l2-cache>`, per - :ref:`normalization unit <normalization-units>`. Typically unused on current - CDNA accelerators. - unit: Requests per normalization unit - Stall Cycles: - plain: |- - The total number of cycles the sL1D\u2194L2 interface was stalled, per - normalization unit. - rst: |- - The total number of cycles the sL1D\u2194 :doc:`L2 <l2-cache>` interface - was stalled, per :ref:`normalization unit <normalization-units>`. 
- unit: Cycles per normalization unit -- id: 1500 - title: Address Processing Unit and Data Return Path (TA/TD) - data source: - - metric_table: - id: 1501 - title: Busy and stall metrics - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - metric: - gfx90a: - Address Processing Unit Busy: - avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - Address Stall: - avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - unit: pct - Data Stall: - avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - unit: pct - "Data-Processor \u2192 Address Stall": - avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - unit: pct - "Sequencer \u2192 TA Address Stall": - avg: AVG((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) - min: MIN((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) - max: MAX((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) - unit: (Cycles + $normUnit) - "Sequencer \u2192 TA Command Stall": - avg: AVG((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) - min: MIN((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) - max: MAX((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) - unit: (Cycles + $normUnit) 
- "Sequencer \u2192 TA Data Stall": - avg: AVG((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) - min: MIN((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) - max: MAX((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) - unit: (Cycles + $normUnit) - gfx941: - Address Processing Unit Busy: - avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - Address Stall: - avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - unit: pct - Data Stall: - avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - unit: pct - "Data-Processor \u2192 Address Stall": - avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - unit: pct - "Sequencer \u2192 TA Address Stall": - avg: AVG((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) - min: MIN((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) - max: MAX((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) - unit: (Cycles + $normUnit) - "Sequencer \u2192 TA Command Stall": - avg: AVG((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) - min: MIN((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) - max: MAX((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) - unit: (Cycles + $normUnit) - "Sequencer \u2192 TA Data Stall": - avg: 
AVG((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) - min: MIN((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) - max: MAX((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) - unit: (Cycles + $normUnit) - gfx940: - Address Processing Unit Busy: - avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - Address Stall: - avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - unit: pct - Data Stall: - avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - unit: pct - "Data-Processor \u2192 Address Stall": - avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - unit: pct - "Sequencer \u2192 TA Address Stall": - avg: AVG((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) - min: MIN((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) - max: MAX((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) - unit: (Cycles + $normUnit) - "Sequencer \u2192 TA Command Stall": - avg: AVG((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) - min: MIN((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) - max: MAX((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) - unit: (Cycles + $normUnit) - "Sequencer \u2192 TA Data Stall": - avg: AVG((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) - 
min: MIN((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) - max: MAX((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) - unit: (Cycles + $normUnit) - gfx942: - Address Processing Unit Busy: - avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - Address Stall: - avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - unit: pct - Data Stall: - avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - unit: pct - "Data-Processor \u2192 Address Stall": - avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - unit: pct - "Sequencer \u2192 TA Address Stall": - avg: AVG((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) - min: MIN((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) - max: MAX((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) - unit: (Cycles + $normUnit) - "Sequencer \u2192 TA Command Stall": - avg: AVG((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) - min: MIN((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) - max: MAX((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) - unit: (Cycles + $normUnit) - "Sequencer \u2192 TA Data Stall": - avg: AVG((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) - min: MIN((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) 
- max: MAX((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) - unit: (Cycles + $normUnit) - gfx950: - Address Processing Unit Busy: - avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - Address Stall: - avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - unit: pct - Data Stall: - avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - unit: pct - "Data-Processor \u2192 Address Stall": - avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - unit: pct - "Sequencer \u2192 TA Address Stall": - avg: AVG((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) - min: MIN((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) - max: MAX((SQ_VMEM_TA_ADDR_FIFO_FULL / $denom)) - unit: (Cycles + $normUnit) - "Sequencer \u2192 TA Command Stall": - avg: AVG((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) - min: MIN((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) - max: MAX((SQ_VMEM_TA_CMD_FIFO_FULL / $denom)) - unit: (Cycles + $normUnit) - "Sequencer \u2192 TA Data Stall": - avg: AVG((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) - min: MIN((SQ_VMEM_WR_TA_DATA_FIFO_FULL / $denom)) - max: MAX((SQ_VMEM_WR_TA_DATA_FIFO_FULL / 
$denom)) - unit: (Cycles + $normUnit) - gfx908: - Address Processing Unit Busy: - avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - Address Stall: - avg: AVG(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - min: MIN(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - max: MAX(((100 * TA_ADDR_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - unit: pct - Data Stall: - avg: AVG(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - min: MIN(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - max: MAX(((100 * TA_DATA_STALLED_BY_TC_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - unit: pct - "Data-Processor \u2192 Address Stall": - avg: AVG(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - min: MIN(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - max: MAX(((100 * TA_ADDR_STALLED_BY_TD_CYCLES_sum) / ($GRBM_GUI_ACTIVE_PER_XCD - * $cu_per_gpu))) - unit: pct - - metric_table: - id: 1502 - title: Instruction counts - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - metric: - gfx90a: - Total Instructions: - avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) - min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom)) - max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Global/Generic Instructions: - avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Global/Generic Read Instructions: - avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - min: 
MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Global/Generic Write Instructions: - avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Global/Generic Atomic Instructions: - avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Spill/Stack Instructions: - avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Spill/Stack Read Instructions: - avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Spill/Stack Write Instructions: - avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Spill/Stack Atomic Instructions: - avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - gfx941: - Total Instructions: - avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) - min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom)) - max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Global/Generic Instructions: - avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Global/Generic Read Instructions: - avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - min: 
MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Global/Generic Write Instructions: - avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Global/Generic Atomic Instructions: - avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Spill/Stack Instructions: - avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Spill/Stack Read Instructions: - avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Spill/Stack Write Instructions: - avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Spill/Stack Atomic Instructions: - avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - gfx940: - Total Instructions: - avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) - min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom)) - max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Global/Generic Instructions: - avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Global/Generic Read Instructions: - avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - min: 
MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Global/Generic Write Instructions: - avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Global/Generic Atomic Instructions: - avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Spill/Stack Instructions: - avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Spill/Stack Read Instructions: - avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Spill/Stack Write Instructions: - avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Spill/Stack Atomic Instructions: - avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - gfx942: - Total Instructions: - avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) - min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom)) - max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Global/Generic Instructions: - avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Global/Generic Read Instructions: - avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - min: 
MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Global/Generic Write Instructions: - avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Global/Generic Atomic Instructions: - avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Spill/Stack Instructions: - avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Spill/Stack Read Instructions: - avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Spill/Stack Write Instructions: - avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Spill/Stack Atomic Instructions: - avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - gfx950: - Total Instructions: - avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) - min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom)) - max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Global/Generic Instructions: - avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Global/Generic Read Instructions: - avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - min: 
MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Global/Generic Read Instructions for LDS: - avg: AVG((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_READ_LDS_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Global/Generic Write Instructions: - avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Global/Generic Atomic Instructions: - avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Spill/Stack Instructions: - avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Spill/Stack Read Instructions: - avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Spill/Stack Read Instructions for LDS: - avg: AVG((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_READ_LDS_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Spill/Stack Write Instructions: - avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Spill/Stack Atomic Instructions: - avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - gfx908: - Total 
Instructions: - avg: AVG((TA_TOTAL_WAVEFRONTS_sum / $denom)) - min: MIN((TA_TOTAL_WAVEFRONTS_sum / $denom)) - max: MAX((TA_TOTAL_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Global/Generic Instructions: - avg: AVG((TA_FLAT_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Global/Generic Read Instructions: - avg: AVG((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_READ_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Global/Generic Write Instructions: - avg: AVG((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Global/Generic Atomic Instructions: - avg: AVG((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_FLAT_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Spill/Stack Instructions: - avg: AVG((TA_BUFFER_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Spill/Stack Read Instructions: - avg: AVG((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_READ_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Spill/Stack Write Instructions: - avg: AVG((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_WRITE_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - Spill/Stack Atomic Instructions: - avg: AVG((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - min: MIN((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - max: MAX((TA_BUFFER_ATOMIC_WAVEFRONTS_sum / $denom)) - unit: (Instructions + $normUnit) - - metric_table: - 
id: 1503 - title: Spill and stack metrics - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - metric: - gfx90a: - Spill/Stack Total Cycles: - avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - Spill/Stack Coalesced Read: - avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - Spill/Stack Coalesced Write: - avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - gfx941: - Spill/Stack Total Cycles: - avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - Spill/Stack Coalesced Read: - avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - Spill/Stack Coalesced Write: - avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - gfx940: - Spill/Stack Total Cycles: - avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - Spill/Stack Coalesced Read: - avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - Spill/Stack Coalesced Write: - 
avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - gfx942: - Spill/Stack Total Cycles: - avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - Spill/Stack Coalesced Read: - avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - Spill/Stack Coalesced Write: - avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - gfx950: - Spill/Stack Total Cycles: - avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - Spill/Stack Coalesced Read: - avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - max: MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - Spill/Stack Coalesced Write: - avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - gfx908: - Spill/Stack Total Cycles: - avg: AVG((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - max: MAX((TA_BUFFER_TOTAL_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - Spill/Stack Coalesced Read: - avg: AVG((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - max: 
MAX((TA_BUFFER_COALESCED_READ_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - Spill/Stack Coalesced Write: - avg: AVG((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - min: MIN((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - max: MAX((TA_BUFFER_COALESCED_WRITE_CYCLES_sum / $denom)) - unit: (Cycles + $normUnit) - - metric_table: - id: 1504 - title: Vector L1 data-return path or Texture Data (TD) - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - metric: - gfx90a: - Data-Return Busy: - avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - "Cache RAM \u2192 Data-Return Stall": - avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - "Workgroup manager \u2192 Data-Return Stall": - avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - Coalescable Instructions: - avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - Read Instructions: - avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - unit: (Instructions + $normUnit) - Write Instructions: - avg: 
AVG((TD_STORE_WAVEFRONT_sum / $denom)) - min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) - max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - Atomic Instructions: - avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) - min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom)) - max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - gfx941: - Data-Return Busy: - avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - "Cache RAM \u2192 Data-Return Stall": - avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - "Workgroup manager \u2192 Data-Return Stall": - avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - Coalescable Instructions: - avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - Read Instructions: - avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - unit: (Instructions + $normUnit) - Write Instructions: - avg: AVG((TD_STORE_WAVEFRONT_sum / $denom)) - min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) - max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) - unit: 
(Instructions + $normUnit) - Atomic Instructions: - avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) - min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom)) - max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - gfx940: - Data-Return Busy: - avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - "Cache RAM \u2192 Data-Return Stall": - avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - "Workgroup manager \u2192 Data-Return Stall": - avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - Coalescable Instructions: - avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - Read Instructions: - avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - unit: (Instructions + $normUnit) - Write Instructions: - avg: AVG((TD_STORE_WAVEFRONT_sum / $denom)) - min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) - max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - Atomic Instructions: - avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) - min: MIN((TD_ATOMIC_WAVEFRONT_sum / 
$denom)) - max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - gfx942: - Data-Return Busy: - avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - "Cache RAM \u2192 Data-Return Stall": - avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - "Workgroup manager \u2192 Data-Return Stall": - avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - Coalescable Instructions: - avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - Read Instructions: - avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - unit: (Instructions + $normUnit) - Write Instructions: - avg: AVG((TD_STORE_WAVEFRONT_sum / $denom)) - min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) - max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - Atomic Instructions: - avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) - min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom)) - max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - gfx950: - Data-Return Busy: - avg: AVG(((100 * 
TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - "Cache RAM \u2192 Data-Return Stall": - avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - "Workgroup manager \u2192 Data-Return Stall": - avg: AVG(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TD_SPI_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - Coalescable Instructions: - avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - Read Instructions: - avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - unit: (Instructions + $normUnit) - Write Instructions: - avg: AVG((TD_STORE_WAVEFRONT_sum / $denom)) - min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) - max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - Atomic Instructions: - avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) - min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom)) - max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - Write Ack Instructions: - avg: AVG((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) - min: MIN((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) - max: MAX((TD_WRITE_ACKT_WAVEFRONT_sum / $denom)) - unit: 
(Instructions + $normUnit) - gfx908: - Data-Return Busy: - avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - "Cache RAM \u2192 Data-Return Stall": - avg: AVG(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - min: MIN(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - max: MAX(((100 * TD_TC_STALL_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))) - unit: pct - Coalescable Instructions: - avg: AVG((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - min: MIN((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - max: MAX((TD_COALESCABLE_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - Read Instructions: - avg: AVG((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - min: MIN((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - max: MAX((((TD_LOAD_WAVEFRONT_sum - TD_STORE_WAVEFRONT_sum) - TD_ATOMIC_WAVEFRONT_sum) - / $denom)) - unit: (Instructions + $normUnit) - Write Instructions: - avg: AVG((TD_STORE_WAVEFRONT_sum / $denom)) - min: MIN((TD_STORE_WAVEFRONT_sum / $denom)) - max: MAX((TD_STORE_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - Atomic Instructions: - avg: AVG((TD_ATOMIC_WAVEFRONT_sum / $denom)) - min: MIN((TD_ATOMIC_WAVEFRONT_sum / $denom)) - max: MAX((TD_ATOMIC_WAVEFRONT_sum / $denom)) - unit: (Instructions + $normUnit) - metrics_description: - Address Processing Unit Busy: - plain: Percent of the total CU cycles the address processor was busy - rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor - was busy - unit: Percent - Address Stall: - plain: Percent of the total CU cycles the address processor was stalled from - sending address requests further into the vL1D pipeline. 
- rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor - was stalled from sending address requests further into the vL1D pipeline - unit: Percent - Data Stall: - plain: Percent of the total CU cycles the address processor was stalled from - sending write/atomic data further into the vL1D pipeline. - rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address processor - was stalled from sending write/atomic data further into the vL1D pipeline - unit: Percent - "Data-Processor \u2192 Address Stall": - plain: Percent of total CU cycles the address processor was stalled waiting - to send command data to the data processor. - rst: Percent of :ref:`total CU cycles <total-cu-cycles>` the address processor was - stalled waiting to send command data to the :ref:`data processor <desc-td>` - unit: Percent - Total Instructions: - plain: The total number of memory instructions executed by the address processer - over all compute units on the accelerator, per normalization unit. - rst: The total number of memory instructions executed by the address processer - over all compute units on the accelerator, per normalization unit. - unit: Instructions per normalization unit - Global/Generic Instructions: - plain: The total number of global & generic memory instructions executed on - all compute units on the accelerator, per normalization unit. - rst: The total number of global & generic memory instructions executed on all :doc:`compute - units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`. - unit: Instructions per normalization unit - Global/Generic Read Instructions: - plain: The total number of global & generic memory read instructions executed - on all compute units on the accelerator, per normalization unit. 
- rst: The total number of global & generic memory read instructions executed - on all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization - unit <normalization-units>`. - unit: Instructions per normalization unit - Global/Generic Write Instructions: - plain: The total number of global & generic memory write instructions executed - on all compute units on the accelerator, per normalization unit. - rst: The total number of global & generic memory write instructions executed on - all :doc:`compute units <compute-unit>` on the accelerator, per :ref:`normalization - unit <normalization-units>`. - unit: Instructions per normalization unit - Global/Generic Atomic Instructions: - plain: The total number of global & generic memory atomic (with and without - return) instructions executed on all compute units on the accelerator, per - normalization unit. - rst: The total number of global & generic memory atomic (with and without return) - instructions executed on all :doc:`compute units <compute-unit>` on the accelerator, - per :ref:`normalization unit <normalization-units>`. - unit: Instructions per normalization unit - Spill/Stack Instructions: - plain: The total number of spill/stack memory instructions executed on all compute - units on the accelerator, per normalization unit. - rst: The total number of spill/stack memory instructions executed on all :doc:`compute - units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`. - unit: Instructions per normalization unit - Spill/Stack Read Instructions: - plain: The total number of spill/stack memory read instructions executed on - all compute units on the accelerator, per normalization unit. - rst: The total number of spill/stack memory read instructions executed on all :doc:`compute - units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`. 
- unit: Instructions per normalization unit - Spill/Stack Write Instructions: - plain: The total number of spill/stack memory write instructions executed on - all compute units on the accelerator, per normalization unit. - rst: The total number of spill/stack memory write instructions executed on all :doc:`compute - units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`. - unit: Instructions per normalization unit - Spill/Stack Atomic Instructions: - plain: The total number of spill/stack memory atomic (with and without return) - instructions executed on all compute units on the accelerator, per normalization - unit. Typically unused as these memory operations are typically used to implement - thread-local storage. - rst: The total number of spill/stack memory atomic (with and without return) instructions - executed on all :doc:`compute units <compute-unit>` on the accelerator, per - :ref:`normalization unit <normalization-units>`. Typically unused as these - memory operations are typically used to implement thread-local storage. - unit: Instructions per normalization unit - Spill/Stack Total Cycles: - plain: The number of cycles the address processing unit spent working on spill/stack - instructions, per normalization unit. - rst: The number of cycles the address processing unit spent working on spill/stack - instructions, per :ref:`normalization unit <normalization-units>`. - unit: Cycles per normalization unit - Spill/Stack Coalesced Read: - plain: The number of cycles the address processing unit spent working on coalesced - spill/stack read instructions, per normalization unit. - rst: The number of cycles the address processing unit spent working on coalesced - spill/stack read instructions, per :ref:`normalization unit <normalization-units>`. 
- unit: Cycles per normalization unit - Spill/Stack Coalesced Write: - plain: The number of cycles the address processing unit spent working on coalesced - spill/stack write instructions, per normalization unit. - rst: The number of cycles the address processing unit spent working on coalesced - spill/stack write instructions, per :ref:`normalization unit <normalization-units>`. - unit: Cycles per normalization unit - Data-Return Busy: - plain: Percent of the total CU cycles the data-return unit was busy processing - or waiting on data to return to the CU. - rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return unit - was busy processing or waiting on data to return to the :doc:`CU <compute-unit>`. - unit: Percent - "Cache RAM \u2192 Data-Return Stall": - plain: Percent of the total CU cycles the data-return unit was stalled on data - to be returned from the vL1D Cache RAM. - rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return unit - was stalled on data to be returned from the :ref:`vL1D Cache RAM <desc-tc>`. - unit: Percent - "Workgroup manager \u2192 Data-Return Stall": - plain: Percent of the total CU cycles the data-return unit was stalled by the - workgroup manager due to initialization of registers as a part of launching - new workgroups. - rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return unit - was stalled by the :ref:`workgroup manager <desc-spi>` due to initialization - of registers as a part of launching new workgroups. - unit: Percent - Coalescable Instructions: - plain: The number of instructions submitted to the data-return unit by the address - processor that were found to be coalescable, per normalization unit. - rst: The number of instructions submitted to the :ref:`data-return unit <desc-td>` - by the :ref:`address processor <desc-ta>` that were found to be coalescable, - per :ref:`normalization unit <normalization-units>`. 
- unit: Instructions per normalization unit - Read Instructions: - plain: The number of read instructions submitted to the data-return unit by - the address processor summed over all compute units on the accelerator, per - normalization unit. This is expected to be the sum of global/generic and spill/stack - reads in the address processor. - rst: The number of read instructions submitted to the :ref:`data-return unit - <desc-td>` by the :ref:`address processor <desc-ta>` summed over all :doc:`compute - units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`. - This is expected to be the sum of global/generic and spill/stack reads in - the :ref:`address processor <desc-ta>`. - unit: Instructions per normalization unit - Write Instructions: - plain: The number of store instructions submitted to the data-return unit by - the address processor summed over all compute units on the accelerator, per - normalization unit. This is expected to be the sum of global/generic and spill/stack - stores in the address processor. - rst: The number of store instructions submitted to the :ref:`data-return unit - <desc-td>` by the :ref:`address processor <desc-ta>` summed over all :doc:`compute - units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`. - This is expected to be the sum of global/generic and spill/stack stores counted - by the :ref:`vL1D cache-front-end <ta-instruction-counts>`. - unit: Instructions per normalization unit - Atomic Instructions: - plain: The number of atomic instructions submitted to the data-return unit by - the address processor summed over all compute units on the accelerator, per - normalization unit. This is expected to be the sum of global/generic and spill/stack - atomics in the address processor. 
- rst: The number of atomic instructions submitted to the :ref:`data-return unit - <desc-td>` by the :ref:`address processor <desc-ta>` summed over all :doc:`compute - units <compute-unit>` on the accelerator, per :ref:`normalization unit <normalization-units>`. - This is expected to be the sum of global/generic and spill/stack atomics - in the :ref:`address processor <desc-ta>`. - unit: Instructions per normalization unit - Write Ack Instructions: - plain: The total number of write acknowledgements submitted by data-return - unit to SQ, summed over all compute units on the accelerator, per normalization - unit. - rst: The total number of write acknowledgements submitted by :ref:`data-return unit <desc-td>` - to SQ, summed over all compute units on the accelerator, per normalization unit. - unit: Instructions per normalization unit -- id: 1600 - title: Vector L1 Data Cache - data source: - - metric_table: - id: 1601 - title: vL1D Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - metric: - gfx90a: - Hit rate: - value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None)) - unit: Pct of Peak - Bandwidth Utilization: - value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk / 1000) * 64) * $cu_per_gpu)) - unit: Pct of Peak - Utilization: - value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None)) - unit: Pct of Peak - Coalescing: - value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum - * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) - unit: Pct of Peak - gfx941: - Hit rate: - value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / 
TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None)) - unit: Pct of Peak - Bandwidth Utilization: - value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) - unit: Pct of Peak - Utilization: - value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None)) - unit: Pct of Peak - Coalescing: - value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum - * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) - unit: Pct of Peak - gfx940: - Hit rate: - value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None)) - unit: Pct of Peak - Bandwidth Utilization: - value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) - unit: Pct of Peak - Utilization: - value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None)) - unit: Pct of Peak - Coalescing: - value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum - * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) - unit: Pct of Peak - gfx942: - Hit rate: - value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None)) - unit: Pct of Peak - Bandwidth Utilization: - value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) - unit: Pct of Peak - Utilization: - value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None)) - unit: Pct of Peak - 
Coalescing: - value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum - * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) - unit: Pct of Peak - gfx950: - Hit rate: - value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None)) - unit: Pct of Peak - Bandwidth Utilization: - value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu)) - unit: Pct of Peak - Utilization: - value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None)) - unit: Pct of Peak - Coalescing: - value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum - * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) - unit: Pct of Peak - gfx908: - Hit rate: - value: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None)) - unit: Pct of Peak - Bandwidth Utilization: - value: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - - Start_Timestamp)))) / ((($max_sclk / 1000) * 64) * $cu_per_gpu)) - unit: Pct of Peak - Utilization: - value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None)) - unit: Pct of Peak - Coalescing: - value: AVG(((((TA_TOTAL_WAVEFRONTS_sum * 64) * 100) / (TCP_TOTAL_ACCESSES_sum - * 4)) if (TCP_TOTAL_ACCESSES_sum != 0) else None)) - unit: Pct of Peak - comparable: false - cli_style: simple_bar - tui_style: simple_bar - - metric_table: - id: 1602 - title: vL1D cache stall metrics - header: - metric: Metric - expr: Expression - metric: - gfx90a: - Stalled on L2 Data: - expr: (((100 * TCP_PENDING_STALL_CYCLES_sum) / 
TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None) - Stalled on L2 Req: - expr: (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None) - Tag RAM Stall (Read): - expr: (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None) - Tag RAM Stall (Write): - expr: (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None) - Tag RAM Stall (Atomic): - expr: (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None) - gfx941: - Stalled on L2 Data: - expr: (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None) - Stalled on L2 Req: - expr: (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None) - Tag RAM Stall (Read): - expr: (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None) - Tag RAM Stall (Write): - expr: (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None) - Tag RAM Stall (Atomic): - expr: (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None) - gfx940: - Stalled on L2 Data: - expr: (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None) - Stalled on L2 Req: - expr: (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None) - Tag RAM Stall (Read): - expr: (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None) - Tag RAM Stall (Write): - expr: (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None) - Tag RAM Stall (Atomic): - expr: (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None) - 
gfx942: - Stalled on L2 Data: - expr: (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None) - Stalled on L2 Req: - expr: (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None) - Tag RAM Stall (Read): - expr: (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None) - Tag RAM Stall (Write): - expr: (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None) - Tag RAM Stall (Atomic): - expr: (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None) - gfx950: - Stalled on L2 Data: - expr: (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None) - Stalled on L2 Req: - expr: (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None) - Stalled on Address: - expr: (((100 * TCP_TCP_TA_ADDR_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if - (TCP_GATE_EN1_sum != 0) else None) - Stalled on Data: - expr: (((100 * TCP_TCP_TA_DATA_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if - (TCP_GATE_EN1_sum != 0) else None) - Stalled on Latency FIFO: - expr: (((100 * TCP_LFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None) - Stalled on Request FIFO: - expr: (((100 * TCP_RFIFO_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None) - Stalled on Read Return: - expr: (((100 * TCP_TCR_RDRET_STALL_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None) - Tag RAM Stall (Read): - expr: (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None) - Tag RAM Stall (Write): - expr: (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None) - Tag RAM Stall (Atomic): - expr: (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) 
- if (TCP_GATE_EN1_sum != 0) else None) - gfx908: - Stalled on L2 Data: - expr: (((100 * TCP_PENDING_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None) - Stalled on L2 Req: - expr: (((100 * TCP_TCR_TCP_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum - != 0) else None) - Tag RAM Stall (Read): - expr: (((100 * TCP_READ_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None) - Tag RAM Stall (Write): - expr: (((100 * TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None) - Tag RAM Stall (Atomic): - expr: (((100 * TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum) / TCP_GATE_EN1_sum) - if (TCP_GATE_EN1_sum != 0) else None) - cli_style: simple_box - tui_style: simple_box - - metric_table: - id: 1603 - title: vL1D cache access metrics - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - metric: - gfx90a: - Total Req: - avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom)) - min: MIN((TCP_TOTAL_ACCESSES_sum / $denom)) - max: MAX((TCP_TOTAL_ACCESSES_sum / $denom)) - unit: (Req + $normUnit) - Read Req: - avg: AVG((TCP_TOTAL_READ_sum / $denom)) - min: MIN((TCP_TOTAL_READ_sum / $denom)) - max: MAX((TCP_TOTAL_READ_sum / $denom)) - unit: (Req + $normUnit) - Write Req: - avg: AVG((TCP_TOTAL_WRITE_sum / $denom)) - min: MIN((TCP_TOTAL_WRITE_sum / $denom)) - max: MAX((TCP_TOTAL_WRITE_sum / $denom)) - unit: (Req + $normUnit) - Atomic Req: - avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)) - min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)) - max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)) - unit: (Req + $normUnit) - Cache BW: - avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp))) - min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp))) - max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / 
(End_Timestamp - Start_Timestamp))) - unit: Gbps - Cache Hit Rate: - avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None)) - min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None)) - max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None)) - unit: pct - Cache Accesses: - avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - unit: (Req + $normUnit) - Cache Hits: - avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - unit: (Req + $normUnit) - Invalidations: - avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - unit: (Req + $normUnit) - L1-L2 BW: - avg: AVG(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) - + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp))) - 
min: MIN(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) - + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp))) - max: MAX(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) - + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp))) - unit: Gbps - L1-L2 Read: - avg: AVG((TCP_TCC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - L1-L2 Write: - avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - L1-L2 Atomic: - avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - unit: (Req + $normUnit) - L1 Access Latency: - avg: AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum - != 0) else None)) - min: MIN(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum - != 0) else None)) - max: MAX(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum - != 0) else None)) - unit: Cycles - L1-L2 Read Latency: - avg: AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) - if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else - None)) - min: MIN(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) - if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else - None)) - max: MAX(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) - if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else - None)) 
- unit: Cycles - L1-L2 Write Latency: - avg: AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != - 0) else None)) - min: MIN(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != - 0) else None)) - max: MAX(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != - 0) else None)) - unit: Cycles - gfx941: - Total Req: - avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom)) - min: MIN((TCP_TOTAL_ACCESSES_sum / $denom)) - max: MAX((TCP_TOTAL_ACCESSES_sum / $denom)) - unit: (Req + $normUnit) - Read Req: - avg: AVG((TCP_TOTAL_READ_sum / $denom)) - min: MIN((TCP_TOTAL_READ_sum / $denom)) - max: MAX((TCP_TOTAL_READ_sum / $denom)) - unit: (Req + $normUnit) - Write Req: - avg: AVG((TCP_TOTAL_WRITE_sum / $denom)) - min: MIN((TCP_TOTAL_WRITE_sum / $denom)) - max: MAX((TCP_TOTAL_WRITE_sum / $denom)) - unit: (Req + $normUnit) - Atomic Req: - avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)) - min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)) - max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)) - unit: (Req + $normUnit) - Cache BW: - avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) - min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) - max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) - unit: Gbps - Cache Hit Rate: - avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 
0) else None)) - min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None)) - max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None)) - unit: pct - Cache Accesses: - avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - unit: (Req + $normUnit) - Cache Hits: - avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - unit: (Req + $normUnit) - Invalidations: - avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - unit: (Req + $normUnit) - L1-L2 BW: - avg: AVG(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / (End_Timestamp - Start_Timestamp))) - min: MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / (End_Timestamp - Start_Timestamp))) - max: MAX(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + - 
TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / (End_Timestamp - Start_Timestamp))) - unit: Gbps - L1-L2 Read: - avg: AVG((TCP_TCC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - L1-L2 Write: - avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - L1-L2 Atomic: - avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - unit: (Req + $normUnit) - gfx940: - Total Req: - avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom)) - min: MIN((TCP_TOTAL_ACCESSES_sum / $denom)) - max: MAX((TCP_TOTAL_ACCESSES_sum / $denom)) - unit: (Req + $normUnit) - Read Req: - avg: AVG((TCP_TOTAL_READ_sum / $denom)) - min: MIN((TCP_TOTAL_READ_sum / $denom)) - max: MAX((TCP_TOTAL_READ_sum / $denom)) - unit: (Req + $normUnit) - Write Req: - avg: AVG((TCP_TOTAL_WRITE_sum / $denom)) - min: MIN((TCP_TOTAL_WRITE_sum / $denom)) - max: MAX((TCP_TOTAL_WRITE_sum / $denom)) - unit: (Req + $normUnit) - Atomic Req: - avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)) - min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)) - max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)) - unit: (Req + $normUnit) - Cache BW: - avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) - min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) - max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) - unit: Gbps - Cache Hit Rate: - avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + 
TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None)) - min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None)) - max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None)) - unit: pct - Cache Accesses: - avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - unit: (Req + $normUnit) - Cache Hits: - avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - unit: (Req + $normUnit) - Invalidations: - avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - unit: (Req + $normUnit) - L1-L2 BW: - avg: AVG(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / (End_Timestamp - Start_Timestamp))) - min: MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum + 
TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / (End_Timestamp - Start_Timestamp))) - max: MAX(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / (End_Timestamp - Start_Timestamp))) - unit: Gbps - L1-L2 Read: - avg: AVG((TCP_TCC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - L1-L2 Write: - avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - L1-L2 Atomic: - avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - unit: (Req + $normUnit) - gfx942: - Total Req: - avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom)) - min: MIN((TCP_TOTAL_ACCESSES_sum / $denom)) - max: MAX((TCP_TOTAL_ACCESSES_sum / $denom)) - unit: (Req + $normUnit) - Read Req: - avg: AVG((TCP_TOTAL_READ_sum / $denom)) - min: MIN((TCP_TOTAL_READ_sum / $denom)) - max: MAX((TCP_TOTAL_READ_sum / $denom)) - unit: (Req + $normUnit) - Write Req: - avg: AVG((TCP_TOTAL_WRITE_sum / $denom)) - min: MIN((TCP_TOTAL_WRITE_sum / $denom)) - max: MAX((TCP_TOTAL_WRITE_sum / $denom)) - unit: (Req + $normUnit) - Atomic Req: - avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)) - min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)) - max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)) - unit: (Req + $normUnit) - Cache BW: - avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) - min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) - max: 
MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) - unit: Gbps - Cache Hit Rate: - avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None)) - min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None)) - max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None)) - unit: pct - Cache Accesses: - avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - unit: (Req + $normUnit) - Cache Hits: - avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - unit: (Req + $normUnit) - Invalidations: - avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - unit: (Req + $normUnit) - L1-L2 BW: - avg: AVG(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum + 
TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / (End_Timestamp - Start_Timestamp))) - min: MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / (End_Timestamp - Start_Timestamp))) - max: MAX(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / (End_Timestamp - Start_Timestamp))) - unit: Gbps - L1-L2 Read: - avg: AVG((TCP_TCC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - L1-L2 Write: - avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - L1-L2 Atomic: - avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - unit: (Req + $normUnit) - gfx950: - Total Req: - avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom)) - min: MIN((TCP_TOTAL_ACCESSES_sum / $denom)) - max: MAX((TCP_TOTAL_ACCESSES_sum / $denom)) - unit: (Req + $normUnit) - Read Req: - avg: AVG((TCP_TOTAL_READ_sum / $denom)) - min: MIN((TCP_TOTAL_READ_sum / $denom)) - max: MAX((TCP_TOTAL_READ_sum / $denom)) - unit: (Req + $normUnit) - Write Req: - avg: AVG((TCP_TOTAL_WRITE_sum / $denom)) - min: MIN((TCP_TOTAL_WRITE_sum / $denom)) - max: MAX((TCP_TOTAL_WRITE_sum / $denom)) - unit: (Req + $normUnit) - Atomic Req: - avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)) - min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)) - max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)) - unit: (Req + $normUnit) - Cache BW: - avg: 
AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) - min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) - max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp))) - unit: Gbps - Cache Hit Rate: - avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None)) - min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None)) - max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None)) - unit: pct - Cache Accesses: - avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - unit: (Req + $normUnit) - Cache Hits: - avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - unit: (Req + $normUnit) - Invalidations: - avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - 
unit: (Req + $normUnit) - L1-L2 BW: - avg: AVG(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / (End_Timestamp - Start_Timestamp))) - min: MIN(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / (End_Timestamp - Start_Timestamp))) - max: MAX(((128 * TCP_TCC_READ_REQ_sum + 64 * (TCP_TCC_WRITE_REQ_sum + - TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / (End_Timestamp - Start_Timestamp))) - unit: Gbps - Tag RAM 0 Req: - avg: AVG((TCP_TAGRAM0_REQ_sum / $denom)) - min: MIN((TCP_TAGRAM0_REQ_sum / $denom)) - max: MAX((TCP_TAGRAM0_REQ_sum / $denom)) - unit: (Req + $normUnit) - Tag RAM 1 Req: - avg: AVG((TCP_TAGRAM1_REQ_sum / $denom)) - min: MIN((TCP_TAGRAM1_REQ_sum / $denom)) - max: MAX((TCP_TAGRAM1_REQ_sum / $denom)) - unit: (Req + $normUnit) - Tag RAM 2 Req: - avg: AVG((TCP_TAGRAM2_REQ_sum / $denom)) - min: MIN((TCP_TAGRAM2_REQ_sum / $denom)) - max: MAX((TCP_TAGRAM2_REQ_sum / $denom)) - unit: (Req + $normUnit) - Tag RAM 3 Req: - avg: AVG((TCP_TAGRAM3_REQ_sum / $denom)) - min: MIN((TCP_TAGRAM3_REQ_sum / $denom)) - max: MAX((TCP_TAGRAM3_REQ_sum / $denom)) - unit: (Req + $normUnit) - L1-L2 Read: - avg: AVG((TCP_TCC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - L1-L2 Write: - avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - L1-L2 Atomic: - avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - unit: (Req + $normUnit) - L1 Access Latency: - avg: 
AVG((TCP_TCP_LATENCY_sum / $denom)) - min: MIN((TCP_TCP_LATENCY_sum / $denom)) - max: MAX((TCP_TCP_LATENCY_sum / $denom)) - unit: (Cycles + $normUnit) - L1-L2 Read Latency: - avg: AVG((TCP_TCC_READ_REQ_LATENCY_sum / $denom)) - min: MIN((TCP_TCC_READ_REQ_LATENCY_sum / $denom)) - max: MAX((TCP_TCC_READ_REQ_LATENCY_sum / $denom)) - unit: (Cycles + $normUnit) - L1-L2 Write Latency: - avg: AVG((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) - min: MIN((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) - max: MAX((TCP_TCC_WRITE_REQ_LATENCY_sum / $denom)) - unit: (Cycles + $normUnit) - gfx908: - Total Req: - avg: AVG((TCP_TOTAL_ACCESSES_sum / $denom)) - min: MIN((TCP_TOTAL_ACCESSES_sum / $denom)) - max: MAX((TCP_TOTAL_ACCESSES_sum / $denom)) - unit: (Req + $normUnit) - Read Req: - avg: AVG((TCP_TOTAL_READ_sum / $denom)) - min: MIN((TCP_TOTAL_READ_sum / $denom)) - max: MAX((TCP_TOTAL_READ_sum / $denom)) - unit: (Req + $normUnit) - Write Req: - avg: AVG((TCP_TOTAL_WRITE_sum / $denom)) - min: MIN((TCP_TOTAL_WRITE_sum / $denom)) - max: MAX((TCP_TOTAL_WRITE_sum / $denom)) - unit: (Req + $normUnit) - Atomic Req: - avg: AVG(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)) - min: MIN(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)) - max: MAX(((TCP_TOTAL_ATOMIC_WITH_RET_sum + TCP_TOTAL_ATOMIC_WITHOUT_RET_sum) - / $denom)) - unit: (Req + $normUnit) - Cache BW: - avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp))) - min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp))) - max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum * 64) / (End_Timestamp - Start_Timestamp))) - unit: Gbps - Cache Hit Rate: - avg: AVG(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None)) - min: MIN(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + 
TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None)) - max: MAX(((100 - ((100 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / TCP_TOTAL_CACHE_ACCESSES_sum)) if (TCP_TOTAL_CACHE_ACCESSES_sum != - 0) else None)) - unit: pct - Cache Accesses: - avg: AVG((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - min: MIN((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - max: MAX((TCP_TOTAL_CACHE_ACCESSES_sum / $denom)) - unit: (Req + $normUnit) - Cache Hits: - avg: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - min: MIN(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - max: MAX(((TCP_TOTAL_CACHE_ACCESSES_sum - (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) - + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - / $denom)) - unit: (Req + $normUnit) - Invalidations: - avg: AVG((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - min: MIN((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - max: MAX((TCP_TOTAL_WRITEBACK_INVALIDATES_sum / $denom)) - unit: (Req + $normUnit) - L1-L2 BW: - avg: AVG(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) - + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp))) - min: MIN(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) - + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp))) - max: MAX(((64 * (((TCP_TCC_READ_REQ_sum + TCP_TCC_WRITE_REQ_sum) + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) - + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) / (End_Timestamp - Start_Timestamp))) 
- unit: Gbps - L1-L2 Read: - avg: AVG((TCP_TCC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - L1-L2 Write: - avg: AVG((TCP_TCC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - L1-L2 Atomic: - avg: AVG(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - min: MIN(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - max: MAX(((TCP_TCC_ATOMIC_WITH_RET_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) - / $denom)) - unit: (Req + $normUnit) - L1 Access Latency: - avg: AVG(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum - != 0) else None)) - min: MIN(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum - != 0) else None)) - max: MAX(((TCP_TCP_LATENCY_sum / TCP_TA_TCP_STATE_READ_sum) if (TCP_TA_TCP_STATE_READ_sum - != 0) else None)) - unit: Cycles - L1-L2 Read Latency: - avg: AVG(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) - if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else - None)) - min: MIN(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) - if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else - None)) - max: MAX(((TCP_TCC_READ_REQ_LATENCY_sum / (TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum)) - if ((TCP_TCC_READ_REQ_sum + TCP_TCC_ATOMIC_WITH_RET_REQ_sum) != 0) else - None)) - unit: Cycles - L1-L2 Write Latency: - avg: AVG(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != - 0) else None)) - min: MIN(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - if 
((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != - 0) else None)) - max: MAX(((TCP_TCC_WRITE_REQ_LATENCY_sum / (TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum)) - if ((TCP_TCC_WRITE_REQ_sum + TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum) != - 0) else None)) - unit: Cycles - - metric_table: - id: 1604 - title: L1D - L2 Transactions - header: - metric: Metric - xfer: Xfer - coherency: Coherency - avg: Avg - min: Min - max: Max - unit: Unit - metric: - gfx90a: - NC - Read: - xfer: Read - coherency: NC - avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - UC - Read: - xfer: Read - coherency: UC - avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - CC - Read: - xfer: Read - coherency: CC - avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - RW - Read: - xfer: Read - coherency: RW - avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - RW - Write: - xfer: Write - coherency: RW - avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - NC - Write: - xfer: Write - coherency: NC - avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - UC - Write: - xfer: Write - coherency: UC - avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - CC - Write: - xfer: Write - coherency: 
CC - avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - NC - Atomic: - xfer: Atomic - coherency: NC - avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - UC - Atomic: - xfer: Atomic - coherency: UC - avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - CC - Atomic: - xfer: Atomic - coherency: CC - avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - RW - Atomic: - xfer: Atomic - coherency: RW - avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - gfx941: - NC - Read: - xfer: Read - coherency: NC - avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - UC - Read: - xfer: Read - coherency: UC - avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - CC - Read: - xfer: Read - coherency: CC - avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - RW - Read: - xfer: Read - coherency: RW - avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - RW - Write: - xfer: Write - coherency: RW - avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom)) - min: 
MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - NC - Write: - xfer: Write - coherency: NC - avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - UC - Write: - xfer: Write - coherency: UC - avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - CC - Write: - xfer: Write - coherency: CC - avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - NC - Atomic: - xfer: Atomic - coherency: NC - avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - UC - Atomic: - xfer: Atomic - coherency: UC - avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - CC - Atomic: - xfer: Atomic - coherency: CC - avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - RW - Atomic: - xfer: Atomic - coherency: RW - avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - gfx940: - NC - Read: - xfer: Read - coherency: NC - avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - UC - Read: - xfer: Read - coherency: UC - avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom)) - max: 
MAX((TCP_TCC_UC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - CC - Read: - xfer: Read - coherency: CC - avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - RW - Read: - xfer: Read - coherency: RW - avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - RW - Write: - xfer: Write - coherency: RW - avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - NC - Write: - xfer: Write - coherency: NC - avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - UC - Write: - xfer: Write - coherency: UC - avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - CC - Write: - xfer: Write - coherency: CC - avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - NC - Atomic: - xfer: Atomic - coherency: NC - avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - UC - Atomic: - xfer: Atomic - coherency: UC - avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - CC - Atomic: - xfer: Atomic - coherency: CC - avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - 
RW - Atomic: - xfer: Atomic - coherency: RW - avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - gfx942: - NC - Read: - xfer: Read - coherency: NC - avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - UC - Read: - xfer: Read - coherency: UC - avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - CC - Read: - xfer: Read - coherency: CC - avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - RW - Read: - xfer: Read - coherency: RW - avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - RW - Write: - xfer: Write - coherency: RW - avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - NC - Write: - xfer: Write - coherency: NC - avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - UC - Write: - xfer: Write - coherency: UC - avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - CC - Write: - xfer: Write - coherency: CC - avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - NC - Atomic: - xfer: Atomic - coherency: NC - avg: 
AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - UC - Atomic: - xfer: Atomic - coherency: UC - avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - CC - Atomic: - xfer: Atomic - coherency: CC - avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - RW - Atomic: - xfer: Atomic - coherency: RW - avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - gfx950: - NC - Read: - xfer: Read - coherency: NC - avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - UC - Read: - xfer: Read - coherency: UC - avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - CC - Read: - xfer: Read - coherency: CC - avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_CC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - RW - Read: - xfer: Read - coherency: RW - avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - RW - Write: - xfer: Write - coherency: RW - avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - NC - Write: - xfer: Write - coherency: NC - avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - min: 
MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - UC - Write: - xfer: Write - coherency: UC - avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - CC - Write: - xfer: Write - coherency: CC - avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - NC - Atomic: - xfer: Atomic - coherency: NC - avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - UC - Atomic: - xfer: Atomic - coherency: UC - avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - CC - Atomic: - xfer: Atomic - coherency: CC - avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - RW - Atomic: - xfer: Atomic - coherency: RW - avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - gfx908: - NC - Read: - xfer: Read - coherency: NC - avg: AVG((TCP_TCC_NC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_NC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - UC - Read: - xfer: Read - coherency: UC - avg: AVG((TCP_TCC_UC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_UC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - CC - Read: - xfer: Read - coherency: CC - avg: AVG((TCP_TCC_CC_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_READ_REQ_sum / $denom)) - max: 
MAX((TCP_TCC_CC_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - RW - Read: - xfer: Read - coherency: RW - avg: AVG((TCP_TCC_RW_READ_REQ_sum / $denom)) - min: MIN((TCP_TCC_RW_READ_REQ_sum / $denom)) - max: MAX((TCP_TCC_RW_READ_REQ_sum / $denom)) - unit: (Req + $normUnit) - RW - Write: - xfer: Write - coherency: RW - avg: AVG((TCP_TCC_RW_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_RW_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_RW_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - NC - Write: - xfer: Write - coherency: NC - avg: AVG((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - UC - Write: - xfer: Write - coherency: UC - avg: AVG((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_UC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - CC - Write: - xfer: Write - coherency: CC - avg: AVG((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - max: MAX((TCP_TCC_CC_WRITE_REQ_sum / $denom)) - unit: (Req + $normUnit) - NC - Atomic: - xfer: Atomic - coherency: NC - avg: AVG((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_NC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - UC - Atomic: - xfer: Atomic - coherency: UC - avg: AVG((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_UC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - CC - Atomic: - xfer: Atomic - coherency: CC - avg: AVG((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_CC_ATOMIC_REQ_sum / $denom)) - unit: (Req + $normUnit) - RW - Atomic: - xfer: Atomic - coherency: RW - avg: AVG((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - min: MIN((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - max: MAX((TCP_TCC_RW_ATOMIC_REQ_sum / $denom)) - unit: (Req + 
$normUnit) - - metric_table: - id: 1605 - title: L1 Unified Translation Cache (UTCL1) - header: - metric: Metric - avg: Avg - min: Min - max: Max - units: Unit - metric: - gfx90a: - Req: - avg: AVG((TCP_UTCL1_REQUEST_sum / $denom)) - min: MIN((TCP_UTCL1_REQUEST_sum / $denom)) - max: MAX((TCP_UTCL1_REQUEST_sum / $denom)) - units: (Req + $normUnit) - Hit Ratio: - avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) - if (TCP_UTCL1_REQUEST_sum != 0) else None)) - min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) - if (TCP_UTCL1_REQUEST_sum != 0) else None)) - max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) - if (TCP_UTCL1_REQUEST_sum != 0) else None)) - units: pct - Hits: - avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - units: (Req + $normUnit) - Translation Misses: - avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - units: (Req + $normUnit) - Permission Misses: - avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - units: (Req + $normUnit) - gfx941: - Req: - avg: AVG((TCP_UTCL1_REQUEST_sum / $denom)) - min: MIN((TCP_UTCL1_REQUEST_sum / $denom)) - max: MAX((TCP_UTCL1_REQUEST_sum / $denom)) - units: (Req + $normUnit) - Hit Ratio: - avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) - if (TCP_UTCL1_REQUEST_sum != 0) else None)) - min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) - if (TCP_UTCL1_REQUEST_sum != 0) else None)) - max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) - if (TCP_UTCL1_REQUEST_sum != 0) else None)) - units: pct - Hits: - avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - min: 
MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - units: (Req + $normUnit) - Translation Misses: - avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - units: (Req + $normUnit) - Permission Misses: - avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - units: (Req + $normUnit) - gfx940: - Req: - avg: AVG((TCP_UTCL1_REQUEST_sum / $denom)) - min: MIN((TCP_UTCL1_REQUEST_sum / $denom)) - max: MAX((TCP_UTCL1_REQUEST_sum / $denom)) - units: (Req + $normUnit) - Hit Ratio: - avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) - if (TCP_UTCL1_REQUEST_sum != 0) else None)) - min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) - if (TCP_UTCL1_REQUEST_sum != 0) else None)) - max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) - if (TCP_UTCL1_REQUEST_sum != 0) else None)) - units: pct - Hits: - avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - units: (Req + $normUnit) - Translation Misses: - avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - units: (Req + $normUnit) - Permission Misses: - avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - units: (Req + $normUnit) - gfx942: - Req: - avg: AVG((TCP_UTCL1_REQUEST_sum / $denom)) - min: MIN((TCP_UTCL1_REQUEST_sum / $denom)) - max: MAX((TCP_UTCL1_REQUEST_sum / $denom)) - units: (Req + $normUnit) - Hit Ratio: - avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) - if 
(TCP_UTCL1_REQUEST_sum != 0) else None)) - min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) - if (TCP_UTCL1_REQUEST_sum != 0) else None)) - max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) - if (TCP_UTCL1_REQUEST_sum != 0) else None)) - units: pct - Hits: - avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - units: (Req + $normUnit) - Translation Misses: - avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - units: (Req + $normUnit) - Permission Misses: - avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - units: (Req + $normUnit) - gfx950: - Req: - avg: AVG((TCP_UTCL1_REQUEST_sum / $denom)) - min: MIN((TCP_UTCL1_REQUEST_sum / $denom)) - max: MAX((TCP_UTCL1_REQUEST_sum / $denom)) - units: (Req + $normUnit) - Inflight Req: - avg: AVG((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom)) - min: MIN((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom)) - max: MAX((TCP_CLIENT_UTCL1_INFLIGHT_sum / $denom)) - units: (Req + $normUnit) - Hit Ratio: - avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) - if (TCP_UTCL1_REQUEST_sum != 0) else None)) - min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) - if (TCP_UTCL1_REQUEST_sum != 0) else None)) - max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) - if (TCP_UTCL1_REQUEST_sum != 0) else None)) - units: pct - Hits: - avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - units: (Req + $normUnit) - Translation Misses: - avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / 
$denom)) - max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - units: (Req + $normUnit) - Misses under Translation Miss: - avg: AVG((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom)) - min: MIN((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom)) - max: MAX((TCP_UTCL1_TRANSLATION_MISS_UNDER_MISS_sum / $denom)) - units: (Req + $normUnit) - Permission Misses: - avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - units: (Req + $normUnit) - gfx908: - Req: - avg: AVG((TCP_UTCL1_REQUEST_sum / $denom)) - min: MIN((TCP_UTCL1_REQUEST_sum / $denom)) - max: MAX((TCP_UTCL1_REQUEST_sum / $denom)) - units: (Req + $normUnit) - Hit Ratio: - avg: AVG((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) - if (TCP_UTCL1_REQUEST_sum != 0) else None)) - min: MIN((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) - if (TCP_UTCL1_REQUEST_sum != 0) else None)) - max: MAX((((100 * TCP_UTCL1_TRANSLATION_HIT_sum) / TCP_UTCL1_REQUEST_sum) - if (TCP_UTCL1_REQUEST_sum != 0) else None)) - units: pct - Hits: - avg: AVG((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - min: MIN((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - max: MAX((TCP_UTCL1_TRANSLATION_HIT_sum / $denom)) - units: (Req + $normUnit) - Translation Misses: - avg: AVG((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - min: MIN((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - max: MAX((TCP_UTCL1_TRANSLATION_MISS_sum / $denom)) - units: (Req + $normUnit) - Permission Misses: - avg: AVG((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - min: MIN((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - max: MAX((TCP_UTCL1_PERMISSION_MISS_sum / $denom)) - units: (Req + $normUnit) - - metric_table: - id: 1606 - title: L1D Addr Translation Stalls - header: - metric: Metric - avg: Avg - min: Min - max: Max - units: Unit - metric: - gfx90a: {} - gfx941: {} - gfx940: {} - gfx942: {} - gfx950: - Cache Full Stall: - avg: 
AVG((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom)) - min: MIN((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom)) - max: MAX((TCP_UTCL1_STALL_INFLIGHT_MAX_sum / $denom)) - units: (Cycles + $normUnit) - Cache Miss Stall: - avg: AVG((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom)) - min: MIN((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom)) - max: MAX((TCP_UTCL1_STALL_MULTI_MISS_sum / $denom)) - units: (Cycles + $normUnit) - Serialization Stall: - avg: AVG((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom)) - min: MIN((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom)) - max: MAX((TCP_UTCL1_SERIALIZATION_STALL_sum / $denom)) - units: (Cycles + $normUnit) - Thrashing Stall: - avg: AVG((TCP_UTCL1_THRASHING_STALL_sum / $denom)) - min: MIN((TCP_UTCL1_THRASHING_STALL_sum / $denom)) - max: MAX((TCP_UTCL1_THRASHING_STALL_sum / $denom)) - units: (Cycles + $normUnit) - Latency FIFO Stall: - avg: AVG((TCP_UTCL1_LFIFO_FULL_sum / $denom)) - min: MIN((TCP_UTCL1_LFIFO_FULL_sum / $denom)) - max: MAX((TCP_UTCL1_LFIFO_FULL_sum / $denom)) - units: (Cycles + $normUnit) - Resident Page Full Stall: - avg: AVG((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom)) - min: MIN((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom)) - max: MAX((TCP_UTCL1_STALL_LFIFO_NO_RES_sum / $denom)) - units: (Cycles + $normUnit) - UTCL2 Stall: - avg: AVG((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom)) - min: MIN((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom)) - max: MAX((TCP_UTCL1_STALL_UTCL2_REQ_OUT_OF_CREDITS_sum / $denom)) - units: (Cycles + $normUnit) - gfx908: {} - metrics_description: - Hit rate: - plain: The ratio of the number of vL1D cache line requests that hit in vL1D - cache over the total number of cache line requests to the vL1D Cache RAM. - rst: The ratio of the number of vL1D cache line requests that hit [#vl1d-hit]_ in - vL1D cache over the total number of cache line requests to the :ref:`vL1D - Cache RAM <desc-tc>`. 
- unit: Percent - Bandwidth Utilization: - plain: The number of bytes looked up in the vL1D cache as a result of VMEM instructions, - as a percent of the peak theoretical bandwidth achievable on the specific - accelerator. The number of bytes is calculated as the number of cache lines - requested multiplied by the cache line size. This value does not consider - partial requests, so for instance, if only a single value is requested in - a cache line, the data movement will still be counted as a full cache line. - rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM - <desc-vmem>` instructions, as a percent of the peak theoretical bandwidth - achievable on the specific accelerator. The number of bytes is calculated - as the number of cache lines requested multiplied by the cache line size. - This value does not consider partial requests, so for instance, if only a - single value is requested in a cache line, the data movement will still be - counted as a full cache line. - unit: Percent - Utilization: - plain: Indicates how busy the vL1D Cache RAM was during the kernel execution. - The number of cycles where the vL1D Cache RAM is actively processing any request - divided by the number of cycles where the vL1D is active. - rst: Indicates how busy the :ref:`vL1D Cache RAM <desc-tc>` was during the kernel - execution. The number of cycles where the vL1D Cache RAM is actively processing - any request divided by the number of cycles where the vL1D is active [#vl1d-activity]_. - unit: Percent - Coalescing: - plain: Indicates how well memory instructions were coalesced by the address - processing unit, ranging from uncoalesced (25%) to fully coalesced (100%). - Calculated as the average number of thread-requests generated per instruction - divided by the ideal number of thread-requests per instruction. 
- rst: Indicates how well memory instructions were coalesced by the :ref:`address - processing unit <desc-ta>`, ranging from uncoalesced (25%) to fully coalesced - (100%). Calculated as the average number of :ref:`thread-requests <thread-requests>` - generated per instruction divided by the ideal number of thread-requests - per instruction. - unit: Percent - Stalled on L2 Data: - plain: The ratio of the number of cycles where the vL1D is stalled waiting for - requested data to return from the L2 cache divided by the number of cycles - where the vL1D is active. - rst: The ratio of the number of cycles where the vL1D is stalled waiting for requested - data to return from the :doc:`L2 cache <l2-cache>` divided by the number - of cycles where the vL1D is active [#vl1d-activity]_. - unit: Percent - Stalled on L2 Req: - plain: The ratio of the number of cycles where the vL1D is stalled waiting to - issue a request for data to the L2 cache divided by the number of cycles where - the vL1D is active. - rst: The ratio of the number of cycles where the vL1D is stalled waiting to issue - a request for data to the :doc:`L2 cache <l2-cache>` divided by the number - of cycles where the vL1D is active [#vl1d-activity]_. - unit: Percent - Tag RAM Stall (Read): - plain: The ratio of the number of cycles where the vL1D is stalled due to Read - requests with conflicting tags being looked up concurrently, divided by the - number of cycles where the vL1D is active. - rst: The ratio of the number of cycles where the vL1D is stalled due to Read - requests with conflicting tags being looked up concurrently, divided by the - number of cycles where the vL1D is active [#vl1d-activity]_. - unit: Percent - Tag RAM Stall (Write): - plain: The ratio of the number of cycles where the vL1D is stalled due to Write - requests with conflicting tags being looked up concurrently, divided by the - number of cycles where the vL1D is active. 
- rst: The ratio of the number of cycles where the vL1D is stalled due to Write - requests with conflicting tags being looked up concurrently, divided by the - number of cycles where the vL1D is active [#vl1d-activity]_. - unit: Percent - Tag RAM Stall (Atomic): - plain: The ratio of the number of cycles where the vL1D is stalled due to Atomic - requests with conflicting tags being looked up concurrently, divided by the - number of cycles where the vL1D is active. - rst: The ratio of the number of cycles where the vL1D is stalled due to Atomic - requests with conflicting tags being looked up concurrently, divided by the - number of cycles where the vL1D is active [#vl1d-activity]_. - unit: Percent - Total Req: - plain: The total number of incoming requests from the address processing unit - after coalescing. - rst: The total number of incoming requests from the :ref:`address processing - unit <desc-ta>` after coalescing. - unit: Requests - Read Req: - plain: The total number of incoming read requests from the address processing - unit after coalescing per normalization unit. - rst: The total number of incoming read requests from the :ref:`address processing - unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>` - unit: Requests per normalization unit - Write Req: - plain: The total number of incoming write requests from the address processing - unit after coalescing per normalization unit. - rst: The total number of incoming write requests from the :ref:`address processing - unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>` - unit: Requests per normalization unit - Atomic Req: - plain: The total number of incoming atomic requests from the address processing - unit after coalescing per normalization unit. 
- rst: The total number of incoming atomic requests from the :ref:`address processing - unit <desc-ta>` after coalescing per :ref:`normalization unit <normalization-units>` - unit: Requests per normalization unit - Cache BW: - plain: The number of bytes looked up in the vL1D cache as a result of VMEM instructions - divided by total duration. The number of bytes is calculated as the number of - cache lines requested multiplied by the cache line size. This value does - not consider partial requests, so for instance, if only a single value is - requested in a cache line, the data movement will still be counted as a full - cache line. - rst: The number of bytes looked up in the vL1D cache as a result of :ref:`VMEM - <desc-vmem>` instructions divided by total duration. The - number of bytes is calculated as the number of cache lines requested multiplied - by the cache line size. This value does not consider partial requests, so - for instance, if only a single value is requested in a cache line, the data movement - will still be counted as a full cache line. - unit: Gbps - Cache Hit Rate: - plain: The ratio of the number of vL1D cache line requests that hit in vL1D - cache over the total number of cache line requests to the vL1D Cache RAM. - rst: The ratio of the number of vL1D cache line requests that hit in vL1D cache - over the total number of cache line requests to the :ref:`vL1D Cache RAM - <desc-tc>`. - unit: Percent - Cache Accesses: - plain: The total number of cache line lookups in the vL1D. - rst: The total number of cache line lookups in the vL1D. - unit: Cache lines - Cache Hits: - plain: The number of cache accesses minus the number of outgoing requests to - the L2 cache, that is, the number of cache line requests serviced by the vL1D - Cache RAM per normalization unit. 
- rst: The number of cache accesses minus the number of outgoing requests to the :doc:`L2 - cache <l2-cache>`, that is, the number of cache line requests serviced by - the :ref:`vL1D Cache RAM <desc-tc>` per :ref:`normalization unit <normalization-units>`. - unit: Cache lines per normalization unit - Invalidations: - plain: The number of times the vL1D was issued a write-back invalidate command - during the kernel's execution per normalization unit. This may be triggered - by, for instance, the buffer_wbinvl1 instruction. - rst: The number of times the vL1D was issued a write-back invalidate command during - the kernel's execution per :ref:`normalization unit <normalization-units>`. This - may be triggered by, for instance, the ``buffer_wbinvl1`` instruction. - unit: Invalidations per normalization unit - L1-L2 BW: - plain: The number of bytes transferred across the vL1D-L2 interface as a result - of VMEM instructions, divided by total duration. The number of bytes is calculated - as the number of cache lines requested multiplied by the cache line size. - This value does not consider partial requests, so for instance, if only a - single value is requested in a cache line, the data movement will still be - counted as a full cache line. - rst: The number of bytes transferred across the vL1D-L2 interface as a result of - :ref:`VMEM <desc-vmem>` instructions, divided by total duration. - The number of bytes is calculated as the number of cache lines requested - multiplied by the cache line size. This value does not consider partial requests, - so for instance, if only a single value is requested in a cache line, the - data movement will still be counted as a full cache line. - unit: Gbps - L1-L2 Read: - plain: The number of read requests for a vL1D cache line that were not satisfied - by the vL1D and must be retrieved from the L2 Cache per normalization - unit.
- rst: The number of read requests for a vL1D cache line that were not satisfied by - the vL1D and must be retrieved from the :doc:`L2 Cache <l2-cache>` - per :ref:`normalization unit <normalization-units>`. - unit: Requests per normalization unit - L1-L2 Write: - plain: The number of write requests to a vL1D cache line that were sent through - the vL1D to the L2 cache, per normalization unit. - rst: The number of write requests to a vL1D cache line that were sent through the - vL1D to the :doc:`L2 cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`. - unit: Requests per normalization unit - L1-L2 Atomic: - plain: The number of atomic requests that are sent through the vL1D to the L2 - cache, per normalization unit. This includes requests for atomics with, and - without return. - rst: The number of atomic requests that are sent through the vL1D to the :doc:`L2 - cache <l2-cache>`, per :ref:`normalization unit <normalization-units>`. This - includes requests for atomics with, and without return. - unit: Requests per normalization unit - L1 Access Latency: - plain: Calculated as the average number of cycles that a vL1D cache line request - spent in the vL1D cache pipeline. - rst: Calculated as the average number of cycles that a vL1D cache line request - spent in the vL1D cache pipeline. - unit: Cycles - L1-L2 Read Latency: - plain: Calculated as the average number of cycles that the vL1D cache took to - issue and receive read requests from the L2 Cache. This number also includes - requests for atomics with return values. - rst: Calculated as the average number of cycles that the vL1D cache took to issue - and receive read requests from the :doc:`L2 Cache <l2-cache>`. This number - also includes requests for atomics with return values. - unit: Cycles - L1-L2 Write Latency: - plain: Calculated as the average number of cycles that the vL1D cache took to - issue and receive acknowledgement of a write request to the L2 Cache.
This - number also includes requests for atomics without return values. - rst: Calculated as the average number of cycles that the vL1D cache took to issue - and receive acknowledgement of a write request to the :doc:`L2 Cache <l2-cache>`. - This number also includes requests for atomics without return values. - unit: Cycles - NC - Read: - plain: Total read requests with NC mtype from this TCP to all TCCs, summed over - TCP instances, per normalization unit. - rst: Total read requests with NC mtype from this TCP to all TCCs, summed over TCP - instances, per normalization unit. - unit: Requests per normalization unit - UC - Read: - plain: Total read requests with UC mtype from this TCP to all TCCs, summed over - TCP instances, per normalization unit. - rst: Total read requests with UC mtype from this TCP to all TCCs, summed over TCP - instances, per normalization unit. - unit: Requests per normalization unit - CC - Read: - plain: Total read requests with CC mtype from this TCP to all TCCs, summed over - TCP instances, per normalization unit. - rst: Total read requests with CC mtype from this TCP to all TCCs, summed over TCP - instances, per normalization unit. - unit: Requests per normalization unit - RW - Read: - plain: Total read requests with RW mtype from this TCP to all TCCs, summed over - TCP instances, per normalization unit. - rst: Total read requests with RW mtype from this TCP to all TCCs, summed over - TCP instances, per normalization unit. - unit: Requests per normalization unit - RW - Write: - plain: Total write requests with RW mtype from this TCP to all TCCs, summed over - TCP instances, per normalization unit. - rst: Total write requests with RW mtype from this TCP to all TCCs, summed over TCP - instances, per normalization unit. - unit: Requests per normalization unit - NC - Write: - plain: Total write requests with NC mtype from this TCP to all TCCs, summed over - TCP instances, per normalization unit.
- rst: Total write requests with NC mtype from this TCP to all TCCs, summed over TCP - instances, per normalization unit. - unit: Requests per normalization unit - UC - Write: - plain: Total write requests with UC mtype from this TCP to all TCCs, summed over - TCP instances, per normalization unit. - rst: Total write requests with UC mtype from this TCP to all TCCs, summed over TCP - instances, per normalization unit. - unit: Requests per normalization unit - CC - Write: - plain: Total write requests with CC mtype from this TCP to all TCCs, summed over - TCP instances, per normalization unit. - rst: Total write requests with CC mtype from this TCP to all TCCs, summed over TCP - instances, per normalization unit. - unit: Requests per normalization unit - NC - Atomic: - plain: Total atomic requests with NC mtype from this TCP to all TCCs, summed over - TCP instances, per normalization unit. - rst: Total atomic requests with NC mtype from this TCP to all TCCs, summed over - TCP instances, per normalization unit. - unit: Requests per normalization unit - UC - Atomic: - plain: Total atomic requests with UC mtype from this TCP to all TCCs, summed over - TCP instances, per normalization unit. - rst: Total atomic requests with UC mtype from this TCP to all TCCs, summed over - TCP instances, per normalization unit. - unit: Requests per normalization unit - CC - Atomic: - plain: Total atomic requests with CC mtype from this TCP to all TCCs, summed over - TCP instances, per normalization unit. - rst: Total atomic requests with CC mtype from this TCP to all TCCs, summed over - TCP instances, per normalization unit. - unit: Requests per normalization unit - RW - Atomic: - plain: Total atomic requests with RW mtype from this TCP to all TCCs, summed over - TCP instances, per normalization unit. - rst: Total atomic requests with RW mtype from this TCP to all TCCs, summed over - TCP instances, per normalization unit.
- unit: Requests per normalization unit - Req: - plain: The number of translation requests made to the UTCL1 per normalization - unit. - rst: The number of translation requests made to the UTCL1 per normalization - unit. - unit: Requests per normalization unit - Hit Ratio: - plain: The ratio of the number of translation requests that hit in the UTCL1 - divided by the total number of translation requests made to the UTCL1. - rst: The ratio of the number of translation requests that hit in the UTCL1 divided - by the total number of translation requests made to the UTCL1. - unit: Percent - Hits: - plain: The number of translation requests that hit in the UTCL1, and could be - reused, per normalization unit. - rst: The number of translation requests that hit in the UTCL1, and could be - reused, per normalization unit. - unit: Requests per normalization unit - Translation Misses: - plain: The total number of translation requests that missed in the UTCL1 due - to translation not being present in the cache, per normalization unit. - rst: The total number of translation requests that missed in the UTCL1 due to translation - not being present in the cache, per :ref:`normalization unit <normalization-units>`. - unit: Requests per normalization unit - Permission Misses: - plain: |- - The total number of translation requests that missed in the UTCL1 due - to a permission error, per normalization unit. This is unused and expected - to be zero in most configurations for modern CDNA™ accelerators. - rst: |- - The total number of translation requests that missed in the UTCL1 due - to a permission error, per :ref:`normalization unit <normalization-units>`. - This is unused and expected to be zero in most configurations for modern - CDNA™ accelerators.
- unit: Requests per normalization unit -- id: 1700 - title: L2 Cache - data source: - - metric_table: - id: 1701 - title: L2 Speed-of-Light - header: - metric: Metric - value: Avg - unit: Unit - metric: - gfx90a: - Utilization: - value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD))) - unit: pct - Peak Bandwidth: - value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) - unit: pct - Hit Rate: - value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else 0)) - unit: pct - L2-Fabric Read BW: - value: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) - * 64)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - L2-Fabric Write and Atomic BW: - value: AVG((((TCC_EA_WRREQ_64B_sum * 64) + ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - HBM Bandwidth: - value: $hbmBandwidth - unit: GB/s - gfx941: - Utilization: - value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD))) - unit: pct - Peak Bandwidth: - value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) - unit: pct - Hit Rate: - value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else 0)) - unit: pct - L2-Fabric Read BW: - value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) - * 64)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - L2-Fabric Write and Atomic BW: - value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - HBM Bandwidth: - value: $hbmBandwidth - unit: GB/s - gfx940: - Utilization: - value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * 
$GRBM_GUI_ACTIVE_PER_XCD))) - unit: pct - Peak Bandwidth: - value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) - unit: pct - Hit Rate: - value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else 0)) - unit: pct - L2-Fabric Read BW: - value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) - * 64)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - L2-Fabric Write and Atomic BW: - value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - HBM Bandwidth: - value: $hbmBandwidth - unit: GB/s - gfx942: - Utilization: - value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD))) - unit: pct - Peak Bandwidth: - value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) - unit: pct - Hit Rate: - value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else 0)) - unit: pct - L2-Fabric Read BW: - value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - - Start_Timestamp)) - unit: GB/s - L2-Fabric Write and Atomic BW: - value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - HBM Bandwidth: - value: $hbmBandwidth - unit: GB/s - gfx950: - Utilization: - value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD))) - unit: pct - Peak Bandwidth: - value: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))) - unit: pct - Hit Rate: - value: AVG((((100 * TCC_HIT_sum) / 
(TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else 0)) - unit: pct - L2-Fabric Read BW: - value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) - + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - L2-Fabric Write and Atomic BW: - value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - HBM Bandwidth: - value: $hbmBandwidth - unit: GB/s - gfx908: - Utilization: - value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD))) - unit: pct - Peak Bandwidth: - value: ((100 * AVG(((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk / 1000) * 64) * TO_INT($total_l2_chan))) - unit: pct - Hit Rate: - value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else 0)) - unit: pct - L2-Fabric Read BW: - value: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) - * 64)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - L2-Fabric Write and Atomic BW: - value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - unit: GB/s - HBM Bandwidth: - value: $hbmBandwidth - unit: GB/s - - metric_table: - id: 1702 - title: L2-Fabric interface metrics - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - metric: - gfx90a: - Read BW: - avg: AVG((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) - * 64)) / (End_Timestamp - Start_Timestamp))) - min: MIN((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) - * 64)) / (End_Timestamp - Start_Timestamp))) - max: MAX((((TCC_EA_RDREQ_32B_sum * 32) + ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) - * 64)) / (End_Timestamp - Start_Timestamp))) - unit: Gbps - HBM Read Traffic: - avg: AVG((100 * 
(TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum - != 0) else None)) - min: MIN((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum - != 0) else None)) - max: MAX((100 * (TCC_EA_RDREQ_DRAM_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum - != 0) else None)) - unit: pct - Remote Read Traffic: - avg: AVG((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) - if (TCC_EA_RDREQ_sum != 0) else None)) - min: MIN((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) - if (TCC_EA_RDREQ_sum != 0) else None)) - max: MAX((100 * ((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum) / TCC_EA_RDREQ_sum) - if (TCC_EA_RDREQ_sum != 0) else None)) - unit: pct - Uncached Read Traffic: - avg: AVG((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum - != 0) else None)) - min: MIN((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum - != 0) else None)) - max: MAX((100 * (TCC_EA_RD_UNCACHED_32B_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum - != 0) else None)) - unit: pct - Write and Atomic BW: - avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / $denom)) - min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / $denom)) - max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / $denom)) - unit: (Bytes + $normUnit) - HBM Write and Atomic Traffic: - avg: AVG((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum - != 0) else None)) - min: MIN((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum - != 0) else None)) - max: MAX((100 * (TCC_EA_WRREQ_DRAM_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum - != 0) else None)) - unit: pct - Remote Write and Atomic Traffic: - avg: AVG((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) - if (TCC_EA_WRREQ_sum != 0) else None)) - min: MIN((100 * ((TCC_EA_WRREQ_sum - 
TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) - if (TCC_EA_WRREQ_sum != 0) else None)) - max: MAX((100 * ((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum) / TCC_EA_WRREQ_sum) - if (TCC_EA_WRREQ_sum != 0) else None)) - unit: pct - Atomic Traffic: - avg: AVG((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum - != 0) else None)) - min: MIN((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum - != 0) else None)) - max: MAX((100 * (TCC_EA_ATOMIC_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum - != 0) else None)) - unit: pct - Uncached Write and Atomic Traffic: - avg: AVG((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum - != 0) else None)) - min: MIN((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum - != 0) else None)) - max: MAX((100 * (TCC_EA_WR_UNCACHED_32B_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum - != 0) else None)) - unit: pct - Read Latency: - avg: AVG(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum - != 0) else None)) - min: MIN(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum - != 0) else None)) - max: MAX(((TCC_EA_RDREQ_LEVEL_sum / TCC_EA_RDREQ_sum) if (TCC_EA_RDREQ_sum - != 0) else None)) - unit: Cycles - Write and Atomic Latency: - avg: AVG(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum - != 0) else None)) - min: MIN(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum - != 0) else None)) - max: MAX(((TCC_EA_WRREQ_LEVEL_sum / TCC_EA_WRREQ_sum) if (TCC_EA_WRREQ_sum - != 0) else None)) - unit: Cycles - Atomic Latency: - avg: AVG(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum - != 0) else None)) - min: MIN(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum - != 0) else None)) - max: MAX(((TCC_EA_ATOMIC_LEVEL_sum / TCC_EA_ATOMIC_sum) if (TCC_EA_ATOMIC_sum - != 0) else None)) - unit: Cycles - gfx941: - Read BW: - avg: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - 
TCC_EA0_RDREQ_32B_sum) - * 64)) / (End_Timestamp - Start_Timestamp))) - min: MIN((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) - * 64)) / (End_Timestamp - Start_Timestamp))) - max: MAX((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) - * 64)) / (End_Timestamp - Start_Timestamp))) - unit: Gbps - HBM Read Traffic: - avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - unit: pct - Remote Read Traffic: - avg: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) / - TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - min: MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) / - TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - max: MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) / - TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - unit: pct - Uncached Read Traffic: - avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - unit: pct - Write and Atomic BW: - avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - unit: Gbps - HBM Write and Atomic Traffic: - avg: AVG((100 * 
(TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - unit: pct - Remote Write and Atomic Traffic: - avg: AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / - TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / - TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - max: MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / - TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - unit: pct - Atomic Traffic: - avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - unit: pct - Uncached Write and Atomic Traffic: - avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - unit: pct - Read Latency: - avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - unit: Cycles - Write and Atomic Latency: - avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - min: MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if 
(TCC_EA0_WRREQ_sum - != 0) else None)) - max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - unit: Cycles - Atomic Latency: - avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) else None)) - min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) else None)) - max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) else None)) - unit: Cycles - gfx940: - Read BW: - avg: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) - * 64)) / (End_Timestamp - Start_Timestamp))) - min: MIN((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) - * 64)) / (End_Timestamp - Start_Timestamp))) - max: MAX((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) - * 64)) / (End_Timestamp - Start_Timestamp))) - unit: Gbps - HBM Read Traffic: - avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - unit: pct - Remote Read Traffic: - avg: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) - / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - min: MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) - / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - max: MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) - / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - unit: pct - Uncached Read Traffic: - avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - max: MAX((100 * 
(TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - unit: pct - Write and Atomic BW: - avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - unit: Gbps - HBM Write and Atomic Traffic: - avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - unit: pct - Remote Write and Atomic Traffic: - avg: AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / - TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / - TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - max: MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / - TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - unit: pct - Atomic Traffic: - avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - unit: pct - Uncached Write and Atomic Traffic: - avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / 
TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - unit: pct - Read Latency: - avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - unit: Cycles - Write and Atomic Latency: - avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - min: MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - unit: Cycles - Atomic Latency: - avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) else None)) - min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) else None)) - max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) else None)) - unit: Cycles - gfx942: - Read BW: - avg: AVG(((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp))) - min: MIN(((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp))) - max: MAX(((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp))) - unit: Gbps - HBM Read Traffic: - avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - unit: pct - Remote Read 
Traffic: - avg: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) / - TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - min: MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) / - TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - max: MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum),0) / - TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - unit: pct - Uncached Read Traffic: - avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - unit: pct - Write and Atomic BW: - avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - unit: Gbps - HBM Write and Atomic Traffic: - avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - unit: pct - Remote Write and Atomic Traffic: - avg: AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / - TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / - TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - max: MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / - TCC_EA0_WRREQ_sum) if 
(TCC_EA0_WRREQ_sum != 0) else None)) - unit: pct - Atomic Traffic: - avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - unit: pct - Uncached Write and Atomic Traffic: - avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - unit: pct - Read Latency: - avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - unit: Cycles - Write and Atomic Latency: - avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - min: MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - unit: Cycles - Atomic Latency: - avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) else None)) - min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) else None)) - max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) else None)) - unit: Cycles - gfx950: - Read BW: - avg: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) - + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) - min: MIN((((TCC_EA0_RDREQ_32B_sum * 32) + 
(TCC_EA0_RDREQ_64B_sum * 64) - + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) - max: MAX((((TCC_EA0_RDREQ_32B_sum * 32) + (TCC_EA0_RDREQ_64B_sum * 64) - + (TCC_EA0_RDREQ_128B_sum * 128)) / (End_Timestamp - Start_Timestamp))) - unit: Gbps - HBM Read Traffic: - avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - unit: pct - Remote Read Traffic: - avg: AVG((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) - / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - min: MIN((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) - / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - max: MAX((100 * (MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) - / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum != 0) else None)) - unit: pct - Uncached Read Traffic: - avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - unit: pct - Write and Atomic BW: - avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / (End_Timestamp - Start_Timestamp))) - unit: Gbps - HBM Write and Atomic Traffic: - avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - 
min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - unit: pct - Remote Write and Atomic Traffic: - avg: AVG((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / - TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - min: MIN((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / - TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - max: MAX((100 * (MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum),0) / - TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum != 0) else None)) - unit: pct - Atomic Traffic: - avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - unit: pct - Uncached Write and Atomic Traffic: - avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - unit: pct - Read Latency: - avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - unit: Cycles - Write and Atomic Latency: - avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - min: MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) 
if (TCC_EA0_WRREQ_sum - != 0) else None)) - unit: Cycles - Atomic Latency: - avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) else None)) - min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) else None)) - max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) else None)) - unit: Cycles - Read Stall: - avg: AVG((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) - + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum - != 0) else None)) - min: MIN((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) - + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum - != 0) else None)) - max: MAX((((100 * ((TCC_EA0_RDREQ_IO_CREDIT_STALL_sum + TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum) - + TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum)) / TCC_BUSY_sum) if (TCC_BUSY_sum - != 0) else None)) - unit: pct - Write Stall: - avg: AVG(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum - != 0) else None)) - min: MIN(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum - != 0) else None)) - max: MAX(((100 * (TCC_EA0_WRREQ_STALL_sum) / TCC_BUSY_sum) if (TCC_BUSY_sum - != 0) else None)) - unit: pct - gfx908: - Read BW: - avg: AVG((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) - * 64)) / $denom)) - min: MIN((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) - * 64)) / $denom)) - max: MAX((((TCC_EA0_RDREQ_32B_sum * 32) + ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) - * 64)) / $denom)) - unit: (Bytes + $normUnit) - HBM Read Traffic: - avg: AVG((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - min: MIN((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - max: MAX((100 * (TCC_EA0_RDREQ_DRAM_sum / TCC_EA0_RDREQ_sum) if 
(TCC_EA0_RDREQ_sum - != 0) else None)) - unit: pct - Remote Read Traffic: - avg: AVG((100 * ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum) / TCC_EA0_RDREQ_sum) - if (TCC_EA0_RDREQ_sum != 0) else None)) - min: MIN((100 * ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum) / TCC_EA0_RDREQ_sum) - if (TCC_EA0_RDREQ_sum != 0) else None)) - max: MAX((100 * ((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum) / TCC_EA0_RDREQ_sum) - if (TCC_EA0_RDREQ_sum != 0) else None)) - unit: pct - Uncached Read Traffic: - avg: AVG((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - min: MIN((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - max: MAX((100 * (TCC_EA0_RD_UNCACHED_32B_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - unit: pct - Write and Atomic BW: - avg: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / $denom)) - min: MIN((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / $denom)) - max: MAX((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) - * 32)) / $denom)) - unit: (Bytes + $normUnit) - HBM Write and Atomic Traffic: - avg: AVG((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - min: MIN((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - max: MAX((100 * (TCC_EA0_WRREQ_DRAM_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - unit: pct - Remote Write and Atomic Traffic: - avg: AVG((100 * ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum) / TCC_EA0_WRREQ_sum) - if (TCC_EA0_WRREQ_sum != 0) else None)) - min: MIN((100 * ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum) / TCC_EA0_WRREQ_sum) - if (TCC_EA0_WRREQ_sum != 0) else None)) - max: MAX((100 * ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum) / TCC_EA0_WRREQ_sum) - if (TCC_EA0_WRREQ_sum != 0) else None)) - unit: pct 
- Atomic Traffic: - avg: AVG((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - min: MIN((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - max: MAX((100 * (TCC_EA0_ATOMIC_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - unit: pct - Uncached Write and Atomic Traffic: - avg: AVG((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - min: MIN((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - max: MAX((100 * (TCC_EA0_WR_UNCACHED_32B_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - unit: pct - Read Latency: - avg: AVG(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - min: MIN(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - max: MAX(((TCC_EA0_RDREQ_LEVEL_sum / TCC_EA0_RDREQ_sum) if (TCC_EA0_RDREQ_sum - != 0) else None)) - unit: Cycles - Write and Atomic Latency: - avg: AVG(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - min: MIN(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - max: MAX(((TCC_EA0_WRREQ_LEVEL_sum / TCC_EA0_WRREQ_sum) if (TCC_EA0_WRREQ_sum - != 0) else None)) - unit: Cycles - Atomic Latency: - avg: AVG(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) else None)) - min: MIN(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) else None)) - max: MAX(((TCC_EA0_ATOMIC_LEVEL_sum / TCC_EA0_ATOMIC_sum) if (TCC_EA0_ATOMIC_sum - != 0) else None)) - unit: Cycles - - metric_table: - id: 1703 - title: L2 Cache Accesses - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - metric: - gfx90a: - Bandwidth: - avg: AVG((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)) - min: MIN((TCC_REQ_sum * 128) / 
(End_Timestamp - Start_Timestamp)) - max: MAX((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)) - unit: Gbps - Req: - avg: AVG((TCC_REQ_sum / $denom)) - min: MIN((TCC_REQ_sum / $denom)) - max: MAX((TCC_REQ_sum / $denom)) - unit: (Req + $normUnit) - Read Req: - avg: AVG((TCC_READ_sum / $denom)) - min: MIN((TCC_READ_sum / $denom)) - max: MAX((TCC_READ_sum / $denom)) - unit: (Req + $normUnit) - Write Req: - avg: AVG((TCC_WRITE_sum / $denom)) - min: MIN((TCC_WRITE_sum / $denom)) - max: MAX((TCC_WRITE_sum / $denom)) - unit: (Req + $normUnit) - Atomic Req: - avg: AVG((TCC_ATOMIC_sum / $denom)) - min: MIN((TCC_ATOMIC_sum / $denom)) - max: MAX((TCC_ATOMIC_sum / $denom)) - unit: (Req + $normUnit) - Streaming Req: - avg: AVG((TCC_STREAMING_REQ_sum / $denom)) - min: MIN((TCC_STREAMING_REQ_sum / $denom)) - max: MAX((TCC_STREAMING_REQ_sum / $denom)) - unit: (Req + $normUnit) - Probe Req: - avg: AVG((TCC_PROBE_sum / $denom)) - min: MIN((TCC_PROBE_sum / $denom)) - max: MAX((TCC_PROBE_sum / $denom)) - unit: (Req + $normUnit) - Cache Hit: - avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - unit: pct - Hits: - avg: AVG((TCC_HIT_sum / $denom)) - min: MIN((TCC_HIT_sum / $denom)) - max: MAX((TCC_HIT_sum / $denom)) - unit: (Hits + $normUnit) - Misses: - avg: AVG((TCC_MISS_sum / $denom)) - min: MIN((TCC_MISS_sum / $denom)) - max: MAX((TCC_MISS_sum / $denom)) - unit: (Misses + $normUnit) - Writeback: - avg: AVG((TCC_WRITEBACK_sum / $denom)) - min: MIN((TCC_WRITEBACK_sum / $denom)) - max: MAX((TCC_WRITEBACK_sum / $denom)) - unit: (Cachelines + $normUnit) - Writeback (Internal): - avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) - min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) - max: 
MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) - unit: (Cachelines + $normUnit) - Writeback (vL1D Req): - avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - unit: (Cachelines + $normUnit) - Evict (Internal): - avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) - min: MIN((TCC_NORMAL_EVICT_sum / $denom)) - max: MAX((TCC_NORMAL_EVICT_sum / $denom)) - unit: (Cachelines + $normUnit) - Evict (vL1D Req): - avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - unit: (Cachelines + $normUnit) - NC Req: - avg: AVG((TCC_NC_REQ_sum / $denom)) - min: MIN((TCC_NC_REQ_sum / $denom)) - max: MAX((TCC_NC_REQ_sum / $denom)) - unit: (Req + $normUnit) - UC Req: - avg: AVG((TCC_UC_REQ_sum / $denom)) - min: MIN((TCC_UC_REQ_sum / $denom)) - max: MAX((TCC_UC_REQ_sum / $denom)) - unit: (Req + $normUnit) - CC Req: - avg: AVG((TCC_CC_REQ_sum / $denom)) - min: MIN((TCC_CC_REQ_sum / $denom)) - max: MAX((TCC_CC_REQ_sum / $denom)) - unit: (Req + $normUnit) - RW Req: - avg: AVG((TCC_RW_REQ_sum / $denom)) - min: MIN((TCC_RW_REQ_sum / $denom)) - max: MAX((TCC_RW_REQ_sum / $denom)) - unit: (Req + $normUnit) - gfx941: - Bandwidth: - avg: AVG((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)) - min: MIN((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)) - max: MAX((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)) - unit: Gbps - Req: - avg: AVG((TCC_REQ_sum / $denom)) - min: MIN((TCC_REQ_sum / $denom)) - max: MAX((TCC_REQ_sum / $denom)) - unit: (Req + $normUnit) - Read Req: - avg: AVG((TCC_READ_sum / $denom)) - min: MIN((TCC_READ_sum / $denom)) - max: MAX((TCC_READ_sum / $denom)) - unit: (Req + $normUnit) - Write Req: - avg: AVG((TCC_WRITE_sum / $denom)) - min: MIN((TCC_WRITE_sum / $denom)) - max: MAX((TCC_WRITE_sum / $denom)) - unit: (Req + $normUnit) - Atomic Req: - avg: AVG((TCC_ATOMIC_sum 
/ $denom)) - min: MIN((TCC_ATOMIC_sum / $denom)) - max: MAX((TCC_ATOMIC_sum / $denom)) - unit: (Req + $normUnit) - Streaming Req: - avg: AVG((TCC_STREAMING_REQ_sum / $denom)) - min: MIN((TCC_STREAMING_REQ_sum / $denom)) - max: MAX((TCC_STREAMING_REQ_sum / $denom)) - unit: (Req + $normUnit) - Probe Req: - avg: AVG((TCC_PROBE_sum / $denom)) - min: MIN((TCC_PROBE_sum / $denom)) - max: MAX((TCC_PROBE_sum / $denom)) - unit: (Req + $normUnit) - Cache Hit: - avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - unit: pct - Hits: - avg: AVG((TCC_HIT_sum / $denom)) - min: MIN((TCC_HIT_sum / $denom)) - max: MAX((TCC_HIT_sum / $denom)) - unit: (Hits + $normUnit) - Misses: - avg: AVG((TCC_MISS_sum / $denom)) - min: MIN((TCC_MISS_sum / $denom)) - max: MAX((TCC_MISS_sum / $denom)) - unit: (Misses + $normUnit) - Writeback: - avg: AVG((TCC_WRITEBACK_sum / $denom)) - min: MIN((TCC_WRITEBACK_sum / $denom)) - max: MAX((TCC_WRITEBACK_sum / $denom)) - unit: (Cachelines + $normUnit) - Writeback (Internal): - avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) - min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) - max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) - unit: (Cachelines + $normUnit) - Writeback (vL1D Req): - avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - unit: (Cachelines + $normUnit) - Evict (Internal): - avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) - min: MIN((TCC_NORMAL_EVICT_sum / $denom)) - max: MAX((TCC_NORMAL_EVICT_sum / $denom)) - unit: (Cachelines + $normUnit) - Evict (vL1D Req): - avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / 
$denom)) - max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - unit: (Cachelines + $normUnit) - NC Req: - avg: AVG((TCC_NC_REQ_sum / $denom)) - min: MIN((TCC_NC_REQ_sum / $denom)) - max: MAX((TCC_NC_REQ_sum / $denom)) - unit: (Req + $normUnit) - UC Req: - avg: AVG((TCC_UC_REQ_sum / $denom)) - min: MIN((TCC_UC_REQ_sum / $denom)) - max: MAX((TCC_UC_REQ_sum / $denom)) - unit: (Req + $normUnit) - CC Req: - avg: AVG((TCC_CC_REQ_sum / $denom)) - min: MIN((TCC_CC_REQ_sum / $denom)) - max: MAX((TCC_CC_REQ_sum / $denom)) - unit: (Req + $normUnit) - RW Req: - avg: AVG((TCC_RW_REQ_sum / $denom)) - min: MIN((TCC_RW_REQ_sum / $denom)) - max: MAX((TCC_RW_REQ_sum / $denom)) - unit: (Req + $normUnit) - gfx940: - Bandwidth: - avg: AVG((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)) - min: MIN((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)) - max: MAX((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)) - unit: Gbps - Req: - avg: AVG((TCC_REQ_sum / $denom)) - min: MIN((TCC_REQ_sum / $denom)) - max: MAX((TCC_REQ_sum / $denom)) - unit: (Req + $normUnit) - Read Req: - avg: AVG((TCC_READ_sum / $denom)) - min: MIN((TCC_READ_sum / $denom)) - max: MAX((TCC_READ_sum / $denom)) - unit: (Req + $normUnit) - Write Req: - avg: AVG((TCC_WRITE_sum / $denom)) - min: MIN((TCC_WRITE_sum / $denom)) - max: MAX((TCC_WRITE_sum / $denom)) - unit: (Req + $normUnit) - Atomic Req: - avg: AVG((TCC_ATOMIC_sum / $denom)) - min: MIN((TCC_ATOMIC_sum / $denom)) - max: MAX((TCC_ATOMIC_sum / $denom)) - unit: (Req + $normUnit) - Streaming Req: - avg: AVG((TCC_STREAMING_REQ_sum / $denom)) - min: MIN((TCC_STREAMING_REQ_sum / $denom)) - max: MAX((TCC_STREAMING_REQ_sum / $denom)) - unit: (Req + $normUnit) - Probe Req: - avg: AVG((TCC_PROBE_sum / $denom)) - min: MIN((TCC_PROBE_sum / $denom)) - max: MAX((TCC_PROBE_sum / $denom)) - unit: (Req + $normUnit) - Cache Hit: - avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - min: MIN((((100 * 
TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - unit: pct - Hits: - avg: AVG((TCC_HIT_sum / $denom)) - min: MIN((TCC_HIT_sum / $denom)) - max: MAX((TCC_HIT_sum / $denom)) - unit: (Hits + $normUnit) - Misses: - avg: AVG((TCC_MISS_sum / $denom)) - min: MIN((TCC_MISS_sum / $denom)) - max: MAX((TCC_MISS_sum / $denom)) - unit: (Misses + $normUnit) - Writeback: - avg: AVG((TCC_WRITEBACK_sum / $denom)) - min: MIN((TCC_WRITEBACK_sum / $denom)) - max: MAX((TCC_WRITEBACK_sum / $denom)) - unit: (Cachelines + $normUnit) - Writeback (Internal): - avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) - min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) - max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) - unit: (Cachelines + $normUnit) - Writeback (vL1D Req): - avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - unit: (Cachelines + $normUnit) - Evict (Internal): - avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) - min: MIN((TCC_NORMAL_EVICT_sum / $denom)) - max: MAX((TCC_NORMAL_EVICT_sum / $denom)) - unit: (Cachelines + $normUnit) - Evict (vL1D Req): - avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - unit: (Cachelines + $normUnit) - NC Req: - avg: AVG((TCC_NC_REQ_sum / $denom)) - min: MIN((TCC_NC_REQ_sum / $denom)) - max: MAX((TCC_NC_REQ_sum / $denom)) - unit: (Req + $normUnit) - UC Req: - avg: AVG((TCC_UC_REQ_sum / $denom)) - min: MIN((TCC_UC_REQ_sum / $denom)) - max: MAX((TCC_UC_REQ_sum / $denom)) - unit: (Req + $normUnit) - CC Req: - avg: AVG((TCC_CC_REQ_sum / $denom)) - min: MIN((TCC_CC_REQ_sum / $denom)) - max: MAX((TCC_CC_REQ_sum / $denom)) - unit: (Req + $normUnit) - RW Req: - avg: AVG((TCC_RW_REQ_sum / $denom)) - min: 
MIN((TCC_RW_REQ_sum / $denom)) - max: MAX((TCC_RW_REQ_sum / $denom)) - unit: (Req + $normUnit) - gfx942: - Bandwidth: - avg: AVG((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)) - min: MIN((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)) - max: MAX((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)) - unit: Gbps - Req: - avg: AVG((TCC_REQ_sum / $denom)) - min: MIN((TCC_REQ_sum / $denom)) - max: MAX((TCC_REQ_sum / $denom)) - unit: (Req + $normUnit) - Read Req: - avg: AVG((TCC_READ_sum / $denom)) - min: MIN((TCC_READ_sum / $denom)) - max: MAX((TCC_READ_sum / $denom)) - unit: (Req + $normUnit) - Write Req: - avg: AVG((TCC_WRITE_sum / $denom)) - min: MIN((TCC_WRITE_sum / $denom)) - max: MAX((TCC_WRITE_sum / $denom)) - unit: (Req + $normUnit) - Atomic Req: - avg: AVG((TCC_ATOMIC_sum / $denom)) - min: MIN((TCC_ATOMIC_sum / $denom)) - max: MAX((TCC_ATOMIC_sum / $denom)) - unit: (Req + $normUnit) - Streaming Req: - avg: AVG((TCC_STREAMING_REQ_sum / $denom)) - min: MIN((TCC_STREAMING_REQ_sum / $denom)) - max: MAX((TCC_STREAMING_REQ_sum / $denom)) - unit: (Req + $normUnit) - Probe Req: - avg: AVG((TCC_PROBE_sum / $denom)) - min: MIN((TCC_PROBE_sum / $denom)) - max: MAX((TCC_PROBE_sum / $denom)) - unit: (Req + $normUnit) - Cache Hit: - avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - unit: pct - Hits: - avg: AVG((TCC_HIT_sum / $denom)) - min: MIN((TCC_HIT_sum / $denom)) - max: MAX((TCC_HIT_sum / $denom)) - unit: (Hits + $normUnit) - Misses: - avg: AVG((TCC_MISS_sum / $denom)) - min: MIN((TCC_MISS_sum / $denom)) - max: MAX((TCC_MISS_sum / $denom)) - unit: (Misses + $normUnit) - Writeback: - avg: AVG((TCC_WRITEBACK_sum / $denom)) - min: 
MIN((TCC_WRITEBACK_sum / $denom)) - max: MAX((TCC_WRITEBACK_sum / $denom)) - unit: (Cachelines + $normUnit) - Writeback (Internal): - avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) - min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) - max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) - unit: (Cachelines + $normUnit) - Writeback (vL1D Req): - avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - unit: (Cachelines + $normUnit) - Evict (Internal): - avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) - min: MIN((TCC_NORMAL_EVICT_sum / $denom)) - max: MAX((TCC_NORMAL_EVICT_sum / $denom)) - unit: (Cachelines + $normUnit) - Evict (vL1D Req): - avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - unit: (Cachelines + $normUnit) - NC Req: - avg: AVG((TCC_NC_REQ_sum / $denom)) - min: MIN((TCC_NC_REQ_sum / $denom)) - max: MAX((TCC_NC_REQ_sum / $denom)) - unit: (Req + $normUnit) - UC Req: - avg: AVG((TCC_UC_REQ_sum / $denom)) - min: MIN((TCC_UC_REQ_sum / $denom)) - max: MAX((TCC_UC_REQ_sum / $denom)) - unit: (Req + $normUnit) - CC Req: - avg: AVG((TCC_CC_REQ_sum / $denom)) - min: MIN((TCC_CC_REQ_sum / $denom)) - max: MAX((TCC_CC_REQ_sum / $denom)) - unit: (Req + $normUnit) - RW Req: - avg: AVG((TCC_RW_REQ_sum / $denom)) - min: MIN((TCC_RW_REQ_sum / $denom)) - max: MAX((TCC_RW_REQ_sum / $denom)) - unit: (Req + $normUnit) - gfx950: - Bandwidth: - avg: AVG((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)) - min: MIN((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)) - max: MAX((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)) - unit: Gbps - Read Bandwidth: - avg: AVG(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) - max: MAX(TCC_READ_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) - 
unit: Gbps - Write Bandwidth: - avg: AVG(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) - max: MAX(TCC_WRITE_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) - unit: Gbps - Atomic Bandwidth: - avg: AVG(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) - max: MAX(TCC_ATOMIC_SECTORS_sum * 32/ (End_Timestamp - Start_Timestamp)) - unit: Gbps - Req: - avg: AVG((TCC_REQ_sum / $denom)) - min: MIN((TCC_REQ_sum / $denom)) - max: MAX((TCC_REQ_sum / $denom)) - unit: (Req + $normUnit) - Read Req: - avg: AVG((TCC_READ_sum / $denom)) - min: MIN((TCC_READ_sum / $denom)) - max: MAX((TCC_READ_sum / $denom)) - unit: (Req + $normUnit) - Write Req: - avg: AVG((TCC_WRITE_sum / $denom)) - min: MIN((TCC_WRITE_sum / $denom)) - max: MAX((TCC_WRITE_sum / $denom)) - unit: (Req + $normUnit) - Atomic Req: - avg: AVG((TCC_ATOMIC_sum / $denom)) - min: MIN((TCC_ATOMIC_sum / $denom)) - max: MAX((TCC_ATOMIC_sum / $denom)) - unit: (Req + $normUnit) - Streaming Req: - avg: AVG((TCC_STREAMING_REQ_sum / $denom)) - min: MIN((TCC_STREAMING_REQ_sum / $denom)) - max: MAX((TCC_STREAMING_REQ_sum / $denom)) - unit: (Req + $normUnit) - Bypasss Req: - avg: AVG((TCC_BYPASS_REQ_sum / $denom)) - min: MIN((TCC_BYPASS_REQ_sum / $denom)) - max: MAX((TCC_BYPASS_REQ_sum / $denom)) - unit: (Req + $normUnit) - Probe Req: - avg: AVG((TCC_PROBE_sum / $denom)) - min: MIN((TCC_PROBE_sum / $denom)) - max: MAX((TCC_PROBE_sum / $denom)) - unit: (Req + $normUnit) - Input Buffer Req: - avg: AVG((TCC_IB_REQ_sum / $denom)) - min: MIN((TCC_IB_REQ_sum / $denom)) - max: MAX((TCC_IB_REQ_sum / $denom)) - unit: (Req + $normUnit) - Cache Hit: - avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 
0) else None)) - max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - unit: pct - Hits: - avg: AVG((TCC_HIT_sum / $denom)) - min: MIN((TCC_HIT_sum / $denom)) - max: MAX((TCC_HIT_sum / $denom)) - unit: (Hits + $normUnit) - Misses: - avg: AVG((TCC_MISS_sum / $denom)) - min: MIN((TCC_MISS_sum / $denom)) - max: MAX((TCC_MISS_sum / $denom)) - unit: (Misses + $normUnit) - Writeback: - avg: AVG((TCC_WRITEBACK_sum / $denom)) - min: MIN((TCC_WRITEBACK_sum / $denom)) - max: MAX((TCC_WRITEBACK_sum / $denom)) - unit: (Cachelines + $normUnit) - Writeback (Internal): - avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) - min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) - max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) - unit: (Cachelines + $normUnit) - Writeback (vL1D Req): - avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - unit: (Cachelines + $normUnit) - Evict (Internal): - avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) - min: MIN((TCC_NORMAL_EVICT_sum / $denom)) - max: MAX((TCC_NORMAL_EVICT_sum / $denom)) - unit: (Cachelines + $normUnit) - Evict (vL1D Req): - avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - unit: (Cachelines + $normUnit) - NC Req: - avg: AVG((TCC_NC_REQ_sum / $denom)) - min: MIN((TCC_NC_REQ_sum / $denom)) - max: MAX((TCC_NC_REQ_sum / $denom)) - unit: (Req + $normUnit) - UC Req: - avg: AVG((TCC_UC_REQ_sum / $denom)) - min: MIN((TCC_UC_REQ_sum / $denom)) - max: MAX((TCC_UC_REQ_sum / $denom)) - unit: (Req + $normUnit) - CC Req: - avg: AVG((TCC_CC_REQ_sum / $denom)) - min: MIN((TCC_CC_REQ_sum / $denom)) - max: MAX((TCC_CC_REQ_sum / $denom)) - unit: (Req + $normUnit) - RW Req: - avg: AVG((TCC_RW_REQ_sum / $denom)) - min: MIN((TCC_RW_REQ_sum / $denom)) - max: MAX((TCC_RW_REQ_sum / $denom)) - unit: (Req 
+ $normUnit) - gfx908: - Bandwidth: - avg: AVG((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp)) - min: MIN((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp)) - max: MAX((TCC_REQ_sum * 64) / (End_Timestamp - Start_Timestamp)) - unit: Gbps - Req: - avg: AVG((TCC_REQ_sum / $denom)) - min: MIN((TCC_REQ_sum / $denom)) - max: MAX((TCC_REQ_sum / $denom)) - unit: (Req + $normUnit) - Read Req: - avg: AVG((TCC_READ_sum / $denom)) - min: MIN((TCC_READ_sum / $denom)) - max: MAX((TCC_READ_sum / $denom)) - unit: (Req + $normUnit) - Write Req: - avg: AVG((TCC_WRITE_sum / $denom)) - min: MIN((TCC_WRITE_sum / $denom)) - max: MAX((TCC_WRITE_sum / $denom)) - unit: (Req + $normUnit) - Atomic Req: - avg: AVG((TCC_ATOMIC_sum / $denom)) - min: MIN((TCC_ATOMIC_sum / $denom)) - max: MAX((TCC_ATOMIC_sum / $denom)) - unit: (Req + $normUnit) - Streaming Req: - avg: AVG((TCC_STREAMING_REQ_sum / $denom)) - min: MIN((TCC_STREAMING_REQ_sum / $denom)) - max: MAX((TCC_STREAMING_REQ_sum / $denom)) - unit: (Req + $normUnit) - Probe Req: - avg: AVG((TCC_PROBE_sum / $denom)) - min: MIN((TCC_PROBE_sum / $denom)) - max: MAX((TCC_PROBE_sum / $denom)) - unit: (Req + $normUnit) - Cache Hit: - avg: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - min: MIN((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - max: MAX((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum - + TCC_MISS_sum) != 0) else None)) - unit: pct - Hits: - avg: AVG((TCC_HIT_sum / $denom)) - min: MIN((TCC_HIT_sum / $denom)) - max: MAX((TCC_HIT_sum / $denom)) - unit: (Hits + $normUnit) - Misses: - avg: AVG((TCC_MISS_sum / $denom)) - min: MIN((TCC_MISS_sum / $denom)) - max: MAX((TCC_MISS_sum / $denom)) - unit: (Misses + $normUnit) - Writeback: - avg: AVG((TCC_WRITEBACK_sum / $denom)) - min: MIN((TCC_WRITEBACK_sum / $denom)) - max: MAX((TCC_WRITEBACK_sum / $denom)) - unit: (Cachelines + 
$normUnit) - Writeback (Internal): - avg: AVG((TCC_NORMAL_WRITEBACK_sum / $denom)) - min: MIN((TCC_NORMAL_WRITEBACK_sum / $denom)) - max: MAX((TCC_NORMAL_WRITEBACK_sum / $denom)) - unit: (Cachelines + $normUnit) - Writeback (vL1D Req): - avg: AVG((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - max: MAX((TCC_ALL_TC_OP_WB_WRITEBACK_sum / $denom)) - unit: (Cachelines + $normUnit) - Evict (Internal): - avg: AVG((TCC_NORMAL_EVICT_sum / $denom)) - min: MIN((TCC_NORMAL_EVICT_sum / $denom)) - max: MAX((TCC_NORMAL_EVICT_sum / $denom)) - unit: (Cachelines + $normUnit) - Evict (vL1D Req): - avg: AVG((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - min: MIN((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - max: MAX((TCC_ALL_TC_OP_INV_EVICT_sum / $denom)) - unit: (Cachelines + $normUnit) - NC Req: - avg: AVG((TCC_NC_REQ_sum / $denom)) - min: MIN((TCC_NC_REQ_sum / $denom)) - max: MAX((TCC_NC_REQ_sum / $denom)) - unit: (Req + $normUnit) - UC Req: - avg: AVG((TCC_UC_REQ_sum / $denom)) - min: MIN((TCC_UC_REQ_sum / $denom)) - max: MAX((TCC_UC_REQ_sum / $denom)) - unit: (Req + $normUnit) - CC Req: - avg: AVG((TCC_CC_REQ_sum / $denom)) - min: MIN((TCC_CC_REQ_sum / $denom)) - max: MAX((TCC_CC_REQ_sum / $denom)) - unit: (Req + $normUnit) - RW Req: - avg: AVG((TCC_RW_REQ_sum / $denom)) - min: MIN((TCC_RW_REQ_sum / $denom)) - max: MAX((TCC_RW_REQ_sum / $denom)) - unit: (Req + $normUnit) - - metric_table: - id: 1704 - title: L2 Cache Stalls - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - metric: - gfx90a: {} - gfx941: {} - gfx940: {} - gfx942: {} - gfx950: - Stalled on Latency FIFO: - avg: AVG(TCC_LATENCY_FIFO_FULL_sum / $denom) - min: MIN(TCC_LATENCY_FIFO_FULL_sum / $denom) - max: MAX(TCC_LATENCY_FIFO_FULL_sum / $denom) - unit: (Cycles + $normUnit) - Stalled on Write Data FIFO: - avg: AVG(TCC_SRC_FIFO_FULL_sum / $denom) - min: MIN(TCC_SRC_FIFO_FULL_sum / $denom) - max: MAX(TCC_SRC_FIFO_FULL_sum / $denom) - unit: (Cycles + 
$normUnit) - Input Buffer Stalled on L2: - avg: AVG(TCC_IB_STALL_sum / $denom) - min: MIN(TCC_IB_STALL_sum / $denom) - max: MAX(TCC_IB_STALL_sum / $denom) - unit: (Cycles + $normUnit) - gfx908: {} - - metric_table: - id: 1705 - title: L2 - Fabric Interface stalls - header: - metric: Metric - type: Type - transaction: Transaction - avg: Avg - min: Min - max: Max - unit: Unit - style: - type: simple_multi_bar - metric: - gfx90a: - Write - Credit Starvation: - type: Credit Starvation - transaction: Write - avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if - (TCC_BUSY_sum != 0) else None)) - min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if - (TCC_BUSY_sum != 0) else None)) - max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if - (TCC_BUSY_sum != 0) else None)) - unit: pct - gfx941: - Write - Credit Starvation: - type: Credit Starvation - transaction: Write - avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if - (TCC_BUSY_sum != 0) else None)) - min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if - (TCC_BUSY_sum != 0) else None)) - max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if - (TCC_BUSY_sum != 0) else None)) - unit: pct - gfx940: - Write - Credit Starvation: - type: Credit Starvation - transaction: Write - avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if - (TCC_BUSY_sum != 0) else None)) - min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if - (TCC_BUSY_sum != 0) else None)) - max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if - (TCC_BUSY_sum != 0) else None)) - unit: pct - gfx942: - Write - Credit Starvation: - type: Credit Starvation - transaction: Write - avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if - (TCC_BUSY_sum != 0) else None)) - min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if - (TCC_BUSY_sum != 0) else None)) - max: MAX(((100 * 
(TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if - (TCC_BUSY_sum != 0) else None)) - unit: pct - gfx950: - Read - PCIe Stall: - type: PCIe Stall - transaction: Read - avg: AVG(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if - (TCC_BUSY_sum != 0) else None)) - min: MIN(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if - (TCC_BUSY_sum != 0) else None)) - max: MAX(((100 * (TCC_EA0_RDREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if - (TCC_BUSY_sum != 0) else None)) - unit: pct - Read - Infinity Fabric Stall: - type: "Infinity Fabric\u2122 Stall" - transaction: Read - avg: AVG(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) - if (TCC_BUSY_sum != 0) else None)) - min: MIN(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) - if (TCC_BUSY_sum != 0) else None)) - max: MAX(((100 * (TCC_EA0_RDREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) - if (TCC_BUSY_sum != 0) else None)) - unit: pct - Read - HBM Stall: - type: HBM Stall - transaction: Read - avg: AVG(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) - if (TCC_BUSY_sum != 0) else None)) - min: MIN(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) - if (TCC_BUSY_sum != 0) else None)) - max: MAX(((100 * (TCC_EA0_RDREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) - if (TCC_BUSY_sum != 0) else None)) - unit: pct - Write - PCIe Stall: - type: PCIe Stall - transaction: Write - avg: AVG(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if - (TCC_BUSY_sum != 0) else None)) - min: MIN(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if - (TCC_BUSY_sum != 0) else None)) - max: MAX(((100 * (TCC_EA0_WRREQ_IO_CREDIT_STALL_sum / TCC_BUSY_sum)) if - (TCC_BUSY_sum != 0) else None)) - unit: pct - Write - Infinity Fabric Stall: - type: "Infinity Fabric\u2122 Stall" - transaction: Write - avg: AVG(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) - if (TCC_BUSY_sum != 0) else None)) - min: MIN(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / 
TCC_BUSY_sum)) - if (TCC_BUSY_sum != 0) else None)) - max: MAX(((100 * (TCC_EA0_WRREQ_GMI_CREDIT_STALL_sum / TCC_BUSY_sum)) - if (TCC_BUSY_sum != 0) else None)) - unit: pct - Write - HBM Stall: - type: HBM Stall - transaction: Write - avg: AVG(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) - if (TCC_BUSY_sum != 0) else None)) - min: MIN(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) - if (TCC_BUSY_sum != 0) else None)) - max: MAX(((100 * (TCC_EA0_WRREQ_DRAM_CREDIT_STALL_sum / TCC_BUSY_sum)) - if (TCC_BUSY_sum != 0) else None)) - unit: pct - Write - Credit Starvation: - type: Credit Starvation - transaction: Write - avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if - (TCC_BUSY_sum != 0) else None)) - min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if - (TCC_BUSY_sum != 0) else None)) - max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if - (TCC_BUSY_sum != 0) else None)) - unit: pct - gfx908: - Write - Credit Starvation: - type: Credit Starvation - transaction: Write - avg: AVG(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if - (TCC_BUSY_sum != 0) else None)) - min: MIN(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if - (TCC_BUSY_sum != 0) else None)) - max: MAX(((100 * (TCC_TOO_MANY_EA_WRREQS_STALL_sum / TCC_BUSY_sum)) if - (TCC_BUSY_sum != 0) else None)) - unit: pct - - metric_table: - id: 1706 - title: L2 - Fabric interface detailed metrics - header: - metric: Metric - avg: Avg - min: Min - max: Max - unit: Unit - metric: - gfx90a: - Read (32B): - avg: AVG((TCC_EA_RDREQ_32B_sum / $denom)) - min: MIN((TCC_EA_RDREQ_32B_sum / $denom)) - max: MAX((TCC_EA_RDREQ_32B_sum / $denom)) - unit: (Req + $normUnit) - Read (64B): - avg: AVG(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) - min: MIN(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) - max: MAX(((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_32B_sum) / $denom)) - unit: (Req + $normUnit) - Read (Uncached): 
- avg: AVG((TCC_EA_RD_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA_RD_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA_RD_UNCACHED_32B_sum / $denom)) - unit: (Req + $normUnit) - HBM Read: - avg: AVG((TCC_EA_RDREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA_RDREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA_RDREQ_DRAM_sum / $denom)) - unit: (Req + $normUnit) - Remote Read: - avg: AVG((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA_RDREQ_sum - TCC_EA_RDREQ_DRAM_sum), 0) / $denom)) - unit: (Req + $normUnit) - Write and Atomic (32B): - avg: AVG(MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom), 0)) - min: MIN(MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom), 0)) - max: MAX(MAX(((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_64B_sum) / $denom), 0)) - unit: (Req + $normUnit) - Write and Atomic (Uncached): - avg: AVG((TCC_EA_WR_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA_WR_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA_WR_UNCACHED_32B_sum / $denom)) - unit: (Req + $normUnit) - Write and Atomic (64B): - avg: AVG((TCC_EA_WRREQ_64B_sum / $denom)) - min: MIN((TCC_EA_WRREQ_64B_sum / $denom)) - max: MAX((TCC_EA_WRREQ_64B_sum / $denom)) - unit: (Req + $normUnit) - HBM Write and Atomic: - avg: AVG((TCC_EA_WRREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA_WRREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA_WRREQ_DRAM_sum / $denom)) - unit: (Req + $normUnit) - Remote Write and Atomic: - avg: AVG((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA_WRREQ_sum - TCC_EA_WRREQ_DRAM_sum), 0) / $denom)) - unit: (Req + $normUnit) - Atomic: - avg: AVG((TCC_EA_ATOMIC_sum / $denom)) - min: MIN((TCC_EA_ATOMIC_sum / $denom)) - max: MAX((TCC_EA_ATOMIC_sum / $denom)) - unit: (Req + $normUnit) - gfx941: - Read (32B): - avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom)) - min: MIN((TCC_EA0_RDREQ_32B_sum / 
$denom)) - max: MAX((TCC_EA0_RDREQ_32B_sum / $denom)) - unit: (Req + $normUnit) - Read (64B): - avg: AVG(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) - min: MIN(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) - max: MAX(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) - unit: (Req + $normUnit) - Read (Uncached): - avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) - unit: (Req + $normUnit) - HBM Read: - avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom)) - unit: (Req + $normUnit) - Remote Read: - avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - unit: (Req + $normUnit) - Write and Atomic (32B): - avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) - min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) - max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) - unit: (Req + $normUnit) - Write and Atomic (Uncached): - avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) - unit: (Req + $normUnit) - Write and Atomic (64B): - avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom)) - min: MIN((TCC_EA0_WRREQ_64B_sum / $denom)) - max: MAX((TCC_EA0_WRREQ_64B_sum / $denom)) - unit: (Req + $normUnit) - HBM Write and Atomic: - avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom)) - unit: (Req + $normUnit) - Remote Write and Atomic: - avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / 
$denom)) - max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - unit: (Req + $normUnit) - Atomic: - avg: AVG((TCC_EA0_ATOMIC_sum / $denom)) - min: MIN((TCC_EA0_ATOMIC_sum / $denom)) - max: MAX((TCC_EA0_ATOMIC_sum / $denom)) - unit: (Req + $normUnit) - gfx940: - Read (32B): - avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom)) - min: MIN((TCC_EA0_RDREQ_32B_sum / $denom)) - max: MAX((TCC_EA0_RDREQ_32B_sum / $denom)) - unit: (Req + $normUnit) - Read (64B): - avg: AVG(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) - min: MIN(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) - max: MAX(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) - unit: (Req + $normUnit) - Read (Uncached): - avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) - unit: (Req + $normUnit) - HBM Read: - avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom)) - unit: (Req + $normUnit) - Remote Read: - avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - unit: (Req + $normUnit) - Write and Atomic (32B): - avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) - min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) - max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) - unit: (Req + $normUnit) - Write and Atomic (Uncached): - avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) - unit: (Req + $normUnit) - Write and Atomic (64B): - avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom)) - min: MIN((TCC_EA0_WRREQ_64B_sum / $denom)) - max: MAX((TCC_EA0_WRREQ_64B_sum / $denom)) - unit: 
(Req + $normUnit) - HBM Write and Atomic: - avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom)) - unit: (Req + $normUnit) - Remote Write and Atomic: - avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - unit: (Req + $normUnit) - Atomic: - avg: AVG((TCC_EA0_ATOMIC_sum / $denom)) - min: MIN((TCC_EA0_ATOMIC_sum / $denom)) - max: MAX((TCC_EA0_ATOMIC_sum / $denom)) - unit: (Req + $normUnit) - gfx942: - Read (32B): - avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom)) - min: MIN((TCC_EA0_RDREQ_32B_sum / $denom)) - max: MAX((TCC_EA0_RDREQ_32B_sum / $denom)) - unit: (Req + $normUnit) - Read (64B): - avg: AVG(MAX(((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) / - $denom), 0)) - min: MIN(MAX(((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) / - $denom), 0)) - max: MAX(MAX(((TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum) / - $denom), 0)) - unit: (Req + $normUnit) - Read (128B): - avg: AVG(((TCC_BUBBLE_sum) / $denom)) - min: MIN(((TCC_BUBBLE_sum) / $denom)) - max: MAX(((TCC_BUBBLE_sum) / $denom)) - unit: (Req + $normUnit) - Read (Uncached): - avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) - unit: (Req + $normUnit) - HBM Read: - avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom)) - unit: (Req + $normUnit) - Remote Read: - avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - unit: (Req + $normUnit) - Write and Atomic (32B): - avg: 
AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) - min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) - max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) - unit: (Req + $normUnit) - Write and Atomic (Uncached): - avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) - unit: (Req + $normUnit) - Write and Atomic (64B): - avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom)) - min: MIN((TCC_EA0_WRREQ_64B_sum / $denom)) - max: MAX((TCC_EA0_WRREQ_64B_sum / $denom)) - unit: (Req + $normUnit) - HBM Write and Atomic: - avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom)) - unit: (Req + $normUnit) - Remote Write and Atomic: - avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - unit: (Req + $normUnit) - Atomic: - avg: AVG((TCC_EA0_ATOMIC_sum / $denom)) - min: MIN((TCC_EA0_ATOMIC_sum / $denom)) - max: MAX((TCC_EA0_ATOMIC_sum / $denom)) - unit: (Req + $normUnit) - gfx950: - Read (32B): - avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom)) - min: MIN((TCC_EA0_RDREQ_32B_sum / $denom)) - max: MAX((TCC_EA0_RDREQ_32B_sum / $denom)) - unit: (Req + $normUnit) - Read (64B): - avg: AVG((TCC_EA0_RDREQ_64B_sum / $denom)) - min: MIN((TCC_EA0_RDREQ_64B_sum / $denom)) - max: MAX((TCC_EA0_RDREQ_64B_sum / $denom)) - unit: (Req + $normUnit) - Read (128B): - avg: AVG((TCC_EA0_RDREQ_128B_sum / $denom)) - min: MIN((TCC_EA0_RDREQ_128B_sum / $denom)) - max: MAX((TCC_EA0_RDREQ_128B_sum / $denom)) - unit: (Req + $normUnit) - Read (Uncached): - avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) - unit: (Req + 
$normUnit) - HBM Read: - avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom)) - unit: (Req + $normUnit) - Remote Read: - avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - unit: (Req + $normUnit) - Read Bandwidth - PCIe: - avg: AVG(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - max: MAX(TCC_EA0_RDREQ_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - unit: Gbps - "Read Bandwidth - Infinity Fabric\u2122": - avg: AVG(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - max: MAX(TCC_EA0_RDREQ_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - unit: Gbps - Read Bandwidth - HBM: - avg: AVG(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - max: MAX(TCC_EA0_RDREQ_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - unit: Gbps - Write and Atomic (32B): - avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) - min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) - max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) - unit: (Req + $normUnit) - Write and Atomic (Uncached): - avg: AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) - unit: (Req + $normUnit) - Write and Atomic (64B): - avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom)) - min: MIN((TCC_EA0_WRREQ_64B_sum / $denom)) - max: MAX((TCC_EA0_WRREQ_64B_sum / $denom)) - unit: (Req + $normUnit) - HBM Write and Atomic: - avg: 
AVG((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) - min: MIN((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) - max: MAX((TCC_EA0_WRREQ_WRITE_DRAM_sum / $denom)) - unit: (Req + $normUnit) - Remote Write and Atomic: - avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - unit: (Req + $normUnit) - Write Bandwidth - PCIe: - avg: AVG(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - max: MAX(TCC_EA0_WRREQ_WRITE_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - unit: Gbps - "Write Bandwidth - Infinity Fabric\u2122": - avg: AVG(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - max: MAX(TCC_EA0_WRREQ_WRITE_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - unit: Gbps - Write Bandwidth - HBM: - avg: AVG(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - max: MAX(TCC_EA0_WRREQ_WRITE_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - unit: Gbps - Atomic: - avg: AVG((TCC_EA0_ATOMIC_sum / $denom)) - min: MIN((TCC_EA0_ATOMIC_sum / $denom)) - max: MAX((TCC_EA0_ATOMIC_sum / $denom)) - unit: (Req + $normUnit) - Atomic - HBM: - avg: AVG((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom)) - min: MIN((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom)) - max: MAX((TCC_EA0_WRREQ_ATOMIC_DRAM_sum / $denom)) - unit: (Req + $normUnit) - Atomic Bandwidth - PCIe: - avg: AVG(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - max: MAX(TCC_EA0_WRREQ_ATOMIC_IO_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - unit: 
Gbps - "Atomic Bandwidth - Infinity Fabric\u2122": - avg: AVG(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - max: MAX(TCC_EA0_WRREQ_ATOMIC_GMI_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - unit: Gbps - Atomic Bandwidth - HBM: - avg: AVG(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - min: MIN(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - max: MAX(TCC_EA0_WRREQ_ATOMIC_DRAM_32B_sum * 32/ (End_Timestamp - Start_Timestamp)) - unit: Gbps - gfx908: - Read (32B): - avg: AVG((TCC_EA0_RDREQ_32B_sum / $denom)) - min: MIN((TCC_EA0_RDREQ_32B_sum / $denom)) - max: MAX((TCC_EA0_RDREQ_32B_sum / $denom)) - unit: (Req + $normUnit) - Read (64B): - avg: AVG(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) - min: MIN(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) - max: MAX(((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_32B_sum) / $denom)) - unit: (Req + $normUnit) - Read (Uncached): - avg: AVG((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA0_RD_UNCACHED_32B_sum / $denom)) - unit: (Req + $normUnit) - HBM Read: - avg: AVG((TCC_EA0_RDREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA0_RDREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA0_RDREQ_DRAM_sum / $denom)) - unit: (Req + $normUnit) - Remote Read: - avg: AVG((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA0_RDREQ_sum - TCC_EA0_RDREQ_DRAM_sum), 0) / $denom)) - unit: (Req + $normUnit) - Write and Atomic (32B): - avg: AVG(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) - min: MIN(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) - max: MAX(MAX(((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum) / $denom), 0)) - unit: (Req + $normUnit) - Write and Atomic (Uncached): - avg: 
AVG((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) - min: MIN((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) - max: MAX((TCC_EA0_WR_UNCACHED_32B_sum / $denom)) - unit: (Req + $normUnit) - Write and Atomic (64B): - avg: AVG((TCC_EA0_WRREQ_64B_sum / $denom)) - min: MIN((TCC_EA0_WRREQ_64B_sum / $denom)) - max: MAX((TCC_EA0_WRREQ_64B_sum / $denom)) - unit: (Req + $normUnit) - HBM Write and Atomic: - avg: AVG((TCC_EA0_WRREQ_DRAM_sum / $denom)) - min: MIN((TCC_EA0_WRREQ_DRAM_sum / $denom)) - max: MAX((TCC_EA0_WRREQ_DRAM_sum / $denom)) - unit: (Req + $normUnit) - Remote Write and Atomic: - avg: AVG((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - min: MIN((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - max: MAX((MAX((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_DRAM_sum), 0) / $denom)) - unit: (Req + $normUnit) - Atomic: - avg: AVG((TCC_EA0_ATOMIC_sum / $denom)) - min: MIN((TCC_EA0_ATOMIC_sum / $denom)) - max: MAX((TCC_EA0_ATOMIC_sum / $denom)) - unit: (Req + $normUnit) - metrics_description: - Utilization: - plain: The ratio of the number of cycles an L2 channel was active, summed over - all L2 channels on the accelerator over the total L2 cycles. - rst: The ratio of the :ref:`number of cycles an L2 channel was active, summed - over all L2 channels on the accelerator <total-active-l2-cycles>` over the - :ref:`total L2 cycles <total-l2-cycles>`. - unit: Percent - Peak Bandwidth: - plain: The number of bytes looked up in the L2 cache, as a percent of the peak - theoretical bandwidth achievable on the specific accelerator. The number of - bytes is calculated as the number of cache lines requested multiplied by the - cache line size. This value does not consider partial requests, so e.g., if - only a single value is requested in a cache line, the data movement will still - be counted as a full cache line. - rst: The number of bytes looked up in the L2 cache, as a percent of the peak theoretical - bandwidth achievable on the specific accelerator. 
The number of bytes is - calculated as the number of cache lines requested multiplied by the cache - line size. This value does not consider partial requests, so e.g., if only - a single value is requested in a cache line, the data movement will still - be counted as a full cache line. - unit: Percent - Hit Rate: - plain: The ratio of the number of L2 cache line requests that hit in the L2 - cache over the total number of incoming cache line requests to the L2 cache. - rst: The ratio of the number of L2 cache line requests that hit in the L2 cache - over the total number of incoming cache line requests to the L2 cache. - unit: Percent - L2-Fabric Read BW: - plain: The number of bytes read by the L2 over the Infinity Fabric interface - per unit time. - rst: The number of bytes read by the L2 over the :ref:`Infinity Fabric interface - <l2-fabric>` per unit time. - unit: GB/s - L2-Fabric Write and Atomic BW: - plain: The number of bytes sent by the L2 over the Infinity Fabric interface - by write and atomic operations per unit time. - rst: The number of bytes sent by the L2 over the :ref:`Infinity Fabric interface - <l2-fabric>` by write and atomic operations per unit time. - unit: GB/s - HBM Bandwidth: - plain: Maximum theoretical bandwidth of the accelerator's local high-bandwidth - memory (HBM) per unit time. This value is calculated as the number of HBM - channels multiplied by the HBM channel width multiplied by the HBM clock frequency. - rst: Maximum theoretical bandwidth of the accelerator's local high-bandwidth - memory (HBM) per unit time. This value is calculated as the number of HBM - channels multiplied by the HBM channel width multiplied by the HBM clock frequency. - unit: GB/s - Read BW: - plain: The total number of bytes read by the L2 cache from Infinity Fabric divided by total duration. - rst: The total number of bytes read by the L2 cache from Infinity Fabric divided by total duration. 
- unit: Gbps - HBM Read Traffic: - plain: The percent of read requests generated by the L2 cache that are routed - to the accelerator's local high-bandwidth memory (HBM). This breakdown does - not consider the size of the request (meaning that 32B and 64B requests are - both counted as a single request), so this metric only approximates the percent - of the L2-Fabric Read bandwidth directed to the local HBM. - rst: The percent of read requests generated by the L2 cache that are routed - to the accelerator's local high-bandwidth memory (HBM). This breakdown does not - consider the *size* of the request (meaning that 32B and 64B requests are - both counted as a single request), so this metric only *approximates* the - percent of the L2-Fabric Read bandwidth directed to the local HBM. - unit: Percent - Remote Read Traffic: - plain: The percent of read requests generated by the L2 cache that are routed - to any memory location other than the accelerator's local high-bandwidth memory - (HBM) - for example, the CPU's DRAM or a remote accelerator's HBM. This breakdown - does not consider the size of the request (meaning that 32B and 64B requests - are both counted as a single request), so this metric only approximates the - percent of the L2-Fabric Read bandwidth directed to a remote location. - rst: The percent of read requests generated by the L2 cache that are routed - to any memory location other than the accelerator's local high-bandwidth memory - (HBM) -- for example, the CPU's DRAM or a remote accelerator's HBM. This - breakdown does not consider the *size* of the request (meaning that 32B and - 64B requests are both counted as a single request), so this metric only *approximates* - the percent of the L2-Fabric Read bandwidth directed to a remote location. - unit: Percent - Uncached Read Traffic: - plain: The percent of read requests generated by the L2 cache that are reading - from an uncached memory allocation. 
Note, as described in the request flow - section, a single 64B read request is typically counted as two uncached read - requests. So, it is possible for the Uncached Read Traffic to reach up to - 200% of the total number of read requests. This breakdown does not consider - the size of the request (i.e., 32B and 64B requests are both counted as a - single request), so this metric only approximates the percent of the L2-Fabric - read bandwidth directed to an uncached memory location. - rst: The percent of read requests generated by the L2 cache that are reading from - an :ref:`uncached memory allocation <memory-type>`. Note, as described in - the :ref:`request flow <l2-request-flow>` section, a single 64B read request - is typically counted as two uncached read requests. So, it is possible for - the Uncached Read Traffic to reach up to 200% of the total number of read - requests. This breakdown does not consider the *size* of the request (i.e., - 32B and 64B requests are both counted as a single request), so this metric - only *approximates* the percent of the L2-Fabric read bandwidth directed - to an uncached memory location. - unit: Percent - Write and Atomic BW: - plain: The total number of bytes written by the L2 over Infinity Fabric by write - and atomic operations divided by total duration. Note that on current CDNA accelerators, - such as the MI2XX, requests are only considered atomic by Infinity Fabric - if they are targeted at non-write-cacheable memory, for example, fine-grained - memory allocations or uncached memory allocations on the MI2XX. - rst: The total number of bytes written by the L2 over Infinity Fabric by write and - atomic operations divided by total duration. 
Note - that on current CDNA accelerators, such as the :ref:`MI2XX <mixxx-note>`, - requests are only considered *atomic* by Infinity Fabric if they are targeted - at non-write-cacheable memory, for example, :ref:`fine-grained memory <memory-type>` - allocations or :ref:`uncached memory <memory-type>` allocations on the MI2XX. - unit: Gbps - HBM Write and Atomic Traffic: - plain: The percent of write and atomic requests generated by the L2 cache that - are routed to the accelerator's local high-bandwidth memory (HBM). This breakdown - does not consider the size of the request (meaning that 32B and 64B requests - are both counted as a single request), so this metric only approximates the - percent of the L2-Fabric Write and Atomic bandwidth directed to the local - HBM. Note that on current CDNA accelerators, such as the MI2XX, requests are - only considered atomic by Infinity Fabric if they are targeted at fine-grained - memory allocations or uncached memory allocations. - rst: The percent of write and atomic requests generated by the L2 cache that are - routed to the accelerator's local high-bandwidth memory (HBM). This breakdown - does not consider the *size* of the request (meaning that 32B and 64B requests - are both counted as a single request), so this metric only *approximates* - the percent of the L2-Fabric Write and Atomic bandwidth directed to the local - HBM. Note that on current CDNA accelerators, such as the :ref:`MI2XX <mixxx-note>`, - requests are only considered *atomic* by Infinity Fabric if they are targeted - at :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached - memory <memory-type>` allocations. - unit: Percent - Remote Write and Atomic Traffic: - plain: The percent of write and atomic requests generated by the L2 cache that are routed - to any memory location other than the accelerator's local high-bandwidth memory - (HBM) - for example, the CPU's DRAM or a remote accelerator's HBM.
This breakdown - does not consider the size of the request (meaning that 32B and 64B requests - are both counted as a single request), so this metric only approximates the - percent of the L2-Fabric Write and Atomic bandwidth directed to a remote location. Note - that on current CDNA accelerators, such as the MI2XX, requests are only considered - atomic by Infinity Fabric if they are targeted at fine-grained memory allocations - or uncached memory allocations. - rst: The percent of write and atomic requests generated by the L2 cache that are routed - to any memory location other than the accelerator's local high-bandwidth memory - (HBM) -- for example, the CPU's DRAM or a remote accelerator's HBM. This - breakdown does not consider the *size* of the request (meaning that 32B and - 64B requests are both counted as a single request), so this metric only *approximates* - the percent of the L2-Fabric Write and Atomic bandwidth directed to a remote location. - Note that on current CDNA accelerators, such as the :ref:`MI2XX <mixxx-note>`, - requests are only considered *atomic* by Infinity Fabric if they are targeted - at :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached - memory <memory-type>` allocations. - unit: Percent - Atomic Traffic: - plain: The percent of write requests generated by the L2 cache that are atomic - requests to any memory location. This breakdown does not consider the size - of the request (meaning that 32B and 64B requests are both counted as a single - request), so this metric only approximates the percent of the L2-Fabric Read - bandwidth directed to a remote location. Note that on current CDNA accelerators, - such as the MI2XX, requests are only considered atomic by Infinity Fabric - if they are targeted at fine-grained memory allocations or uncached memory - allocations. - rst: The percent of write requests generated by the L2 cache that are atomic requests - to *any* memory location.
This breakdown does not consider the *size* of - the request (meaning that 32B and 64B requests are both counted as a single - request), so this metric only *approximates* the percent of the L2-Fabric - Read bandwidth directed to a remote location. Note that on current CDNA accelerators, - such as the :ref:`MI2XX <mixxx-note>`, requests are only considered *atomic* - by Infinity Fabric if they are targeted at :ref:`fine-grained memory <memory-type>` - allocations or :ref:`uncached memory <memory-type>` allocations. - unit: Percent - Uncached Write and Atomic Traffic: - plain: The percent of write and atomic requests generated by the L2 cache that - are targeting uncached memory allocations. This breakdown does not consider - the size of the request (meaning that 32B and 64B requests are both counted - as a single request), so this metric only approximates the percent of the - L2-Fabric Write and Atomic bandwidth directed to uncached memory allocations. - rst: The percent of write and atomic requests generated by the L2 cache that are - targeting :ref:`uncached memory allocations <memory-type>`. This breakdown - does not consider the *size* of the request (meaning that 32B and 64B requests - are both counted as a single request), so this metric only *approximates* - the percent of the L2-Fabric Write and Atomic bandwidth directed to uncached memory allocations. - unit: Percent - Read Latency: - plain: The time-averaged number of cycles read requests spent in Infinity Fabric - before data was returned to the L2. - rst: The time-averaged number of cycles read requests spent in Infinity Fabric before - data was returned to the L2. - unit: Cycles - Write and Atomic Latency: - plain: The time-averaged number of cycles write requests spent in Infinity Fabric - before a completion acknowledgement was returned to the L2. - rst: The time-averaged number of cycles write requests spent in Infinity Fabric - before a completion acknowledgement was returned to the L2.
- unit: Cycles - Atomic Latency: - plain: The time-averaged number of cycles atomic requests spent in Infinity - Fabric before a completion acknowledgement (atomic without return value) or - data (atomic with return value) was returned to the L2. - rst: The time-averaged number of cycles atomic requests spent in Infinity Fabric - before a completion acknowledgement (atomic without return value) or data - (atomic with return value) was returned to the L2. - unit: Cycles - Bandwidth: - plain: The number of bytes looked up in the L2 cache, divided by total duration. - The number of bytes is calculated as the number of cache lines requested multiplied - by the cache line size. This value does not consider partial requests, so - for example, if only a single value is requested in a cache line, the data - movement will still be counted as a full cache line. - rst: The number of bytes looked up in the L2 cache, divided by total duration. - The number of bytes is calculated as the number of cache lines requested - multiplied by the cache line size. This value does - not consider partial requests, so for example, if only a single value is - requested in a cache line, the data movement will still be counted as a full - cache line. - unit: Gbps - Read Bandwidth: - plain: Total number of bytes looked up in the L2 cache for read requests, - divided by total duration. - rst: Total number of bytes looked up in the L2 cache for read requests, - divided by total duration. - unit: Gbps - Write Bandwidth: - plain: Total number of bytes looked up in the L2 cache for write requests, - divided by total duration. - rst: Total number of bytes looked up in the L2 cache for write requests, - divided by total duration. - unit: Gbps - Atomic Bandwidth: - plain: Total number of bytes looked up in the L2 cache for atomic requests, - divided by total duration. - rst: Total number of bytes looked up in the L2 cache for atomic requests, - divided by total duration. 
- unit: Gbps - Req: - plain: The total number of incoming requests to the L2 from all clients for - all request types, per normalization unit. - rst: The total number of incoming requests to the L2 from all clients for all request - types, per :ref:`normalization unit <normalization-units>`. - unit: Requests per normalization unit - Read Req: - plain: The total number of read requests to the L2 from all clients. - rst: The total number of read requests to the L2 from all clients. - unit: Requests per normalization unit - Write Req: - plain: The total number of write requests to the L2 from all clients. - rst: The total number of write requests to the L2 from all clients. - unit: Requests per normalization unit - Atomic Req: - plain: The total number of atomic requests (with and without return) to the - L2 from all clients. - rst: The total number of atomic requests (with and without return) to the L2 - from all clients. - unit: Requests per normalization unit - Streaming Req: - plain: The total number of incoming requests to the L2 that are marked as streaming. - The exact meaning of this may differ depending on the targeted accelerator, - however on an MI2XX this corresponds to non-temporal load or stores. The L2 - cache attempts to evict streaming requests before normal requests when the - L2 is at capacity. - rst: The total number of incoming requests to the L2 that are marked as *streaming*. - The exact meaning of this may differ depending on the targeted accelerator, - however on an :ref:`MI2XX <mixxx-note>` this corresponds to `non-temporal - load or stores <https://clang.llvm.org/docs/LanguageExtensions.html#non-temporal-load-store-builtins>`_. The - L2 cache attempts to evict *streaming* requests before normal requests when - the L2 is at capacity. - unit: Requests per normalization unit - Probe Req: - plain: The number of coherence probe requests made to the L2 cache from outside - the accelerator. 
On an MI2XX, probe requests may be generated by, for example, - writes to fine-grained device memory or by writes to coarse-grained device - memory. - rst: The number of coherence probe requests made to the L2 cache from outside the - accelerator. On an :ref:`MI2XX <mixxx-note>`, probe requests may be generated - by, for example, writes to :ref:`fine-grained device <memory-type>` memory - or by writes to :ref:`coarse-grained <memory-type>` device memory. - unit: Requests per normalization unit - Cache Hit: - plain: The ratio of the number of L2 cache line requests that hit in the L2 - cache over the total number of incoming cache line requests to the L2 cache. - rst: The ratio of the number of L2 cache line requests that hit in the L2 cache - over the total number of incoming cache line requests to the L2 cache. - unit: Percent - Hits: - plain: The total number of requests to the L2 from all clients that hit in the - cache. As noted in the Speed-of-Light section, this includes hit-on-miss requests. - rst: The total number of requests to the L2 from all clients that hit in the cache. - As noted in the :ref:`Speed-of-Light <l2-sol>` section, this includes hit-on-miss - requests. - unit: Requests per normalization unit - Misses: - plain: The total number of requests to the L2 from all clients that miss in - the cache. As noted in the Speed-of-Light section, these do not include hit-on-miss - requests. - rst: The total number of requests to the L2 from all clients that miss in the cache. - As noted in the :ref:`Speed-of-Light <l2-sol>` section, these do not include - hit-on-miss requests. - unit: Requests per normalization unit - Writeback: - plain: The total number of L2 cache lines written back to memory for any reason. - Write-backs may occur due to user code (such as HIP kernel calls to __threadfence_system - or atomic built-ins), by the command processor's memory acquire/release fences, - or for other internal hardware reasons.
- rst: The total number of L2 cache lines written back to memory for any reason. Write-backs - may occur due to user code (such as HIP kernel calls to ``__threadfence_system`` - or atomic built-ins) by the :doc:`command processor <command-processor>`'s - memory acquire/release fences, or for other internal hardware reasons. - unit: Cache lines per normalization unit - Writeback (Internal): - plain: The total number of L2 cache lines written back to memory for internal - hardware reasons, per normalization unit. - rst: The total number of L2 cache lines written back to memory for internal hardware - reasons, per :ref:`normalization unit <normalization-units>`. - unit: Cache lines per normalization unit - Writeback (vL1D Req): - plain: The total number of L2 cache lines written back to memory due to requests - initiated by the vL1D cache, per normalization unit. - rst: The total number of L2 cache lines written back to memory due to requests initiated - by the :doc:`vL1D cache <vector-l1-cache>`, per :ref:`normalization unit - <normalization-units>`. - unit: Cache lines per normalization unit - Evict (Internal): - plain: The total number of L2 cache lines evicted from the cache due to capacity - limits, per normalization unit. - rst: The total number of L2 cache lines evicted from the cache due to capacity limits, - per :ref:`normalization unit <normalization-units>`. - unit: Cache lines per normalization unit - Evict (vL1D Req): - plain: The total number of L2 cache lines evicted from the cache due to invalidation - requests initiated by the vL1D cache, per normalization unit. - rst: The total number of L2 cache lines evicted from the cache due to invalidation - requests initiated by the :doc:`vL1D cache <vector-l1-cache>`, per :ref:`normalization - unit <normalization-units>`. - unit: Cache lines per normalization unit - NC Req: - plain: The total number of requests to the L2 to Not-hardware-Coherent (NC) - memory allocations, per normalization unit. 
- rst: The total number of requests to the L2 to Not-hardware-Coherent (NC) memory - allocations, per :ref:`normalization unit <normalization-units>`. See the - :ref:`memory-type` for more information. - unit: Requests per normalization unit - UC Req: - plain: The total number of requests to the L2 that go to Uncached (UC) memory - allocations. - rst: The total number of requests to the L2 that go to Uncached (UC) memory allocations. - See the :ref:`memory-type` for more information. - unit: Requests per normalization unit - CC Req: - plain: The total number of requests to the L2 that go to Coherently Cacheable - (CC) memory allocations. - rst: The total number of requests to the L2 that go to Coherently Cacheable - (CC) memory allocations. See the :ref:`memory-type` for more information. - unit: Requests per normalization unit - RW Req: - plain: The total number of requests to the L2 that go to Read-Write coherent - memory (RW) allocations. - rst: The total number of requests to the L2 that go to Read-Write coherent memory (RW) - allocations. See the :ref:`memory-type` for more information. - unit: Requests per normalization unit - Write - Credit Starvation: - plain: The number of cycles the L2-Fabric interface was stalled on write or - atomic requests to any memory location because too many write/atomic requests - were currently in flight, as a percent of the total active L2 cycles. - rst: The number of cycles the L2-Fabric interface was stalled on write or atomic - requests to any memory location because too many write/atomic requests were - currently in flight, as a percent of the :ref:`total active L2 cycles <total-active-l2-cycles>`. - unit: Percent - Read (32B): - plain: The total number of L2 requests to Infinity Fabric to read 32B of data - from any memory location, per normalization unit. - rst: The total number of L2 requests to Infinity Fabric to read 32B of data from - any memory location, per :ref:`normalization unit <normalization-units>`. 
- See :ref:`l2-request-flow` for more detail. Typically unused on CDNA accelerators. - unit: Requests per normalization unit - Read (64B): - plain: The total number of L2 requests to Infinity Fabric to read 64B of data - from any memory location, per normalization unit. - rst: The total number of L2 requests to Infinity Fabric to read 64B of data from - any memory location, per :ref:`normalization unit <normalization-units>`. - See :ref:`l2-request-flow` for more detail. - unit: Requests per normalization unit - Read (Uncached): - plain: The total number of L2 requests to Infinity Fabric to read uncached data - from any memory location, per normalization unit. 64B requests for uncached - data are counted as two 32B uncached data requests. - rst: The total number of L2 requests to Infinity Fabric to read :ref:`uncached - data <memory-type>` from any memory location, per :ref:`normalization unit - <normalization-units>`. 64B requests for uncached data are counted as two - 32B uncached data requests. See :ref:`l2-request-flow` for more detail. - unit: Requests per normalization unit - HBM Read: - plain: The total number of L2 requests to Infinity Fabric to read 32B or 64B - of data from the accelerator's local HBM, per normalization unit. - rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of data - from the accelerator's local HBM, per :ref:`normalization unit <normalization-units>`. - See :ref:`l2-request-flow` for more detail. - unit: Requests per normalization unit - Remote Read: - plain: The total number of L2 requests to Infinity Fabric to read 32B or 64B - of data from any source other than the accelerator's local HBM, per normalization - unit. - rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of data - from any source other than the accelerator's local HBM, per :ref:`normalization - unit <normalization-units>`. See :ref:`l2-request-flow` for more detail. 
- unit: Requests per normalization unit - Read Bandwidth - PCIe: - plain: Total number of bytes due to L2 read requests due to PCIe traffic, divided by total duration. - rst: Total number of bytes due to L2 read requests due to PCIe traffic, divided by total duration. - unit: Gbps - "Read Bandwidth - Infinity Fabric\u2122": - plain: Total number of bytes due to L2 read requests due to Infinity Fabric traffic, divided by total duration. - rst: Total number of bytes due to L2 read requests due to Infinity Fabric traffic, divided by total duration. - unit: Gbps - Read Bandwidth - HBM: - plain: Total number of bytes due to L2 read requests due to HBM traffic, divided by total duration. - rst: Total number of bytes due to L2 read requests due to HBM traffic, divided by total duration. - unit: Gbps - Write and Atomic (32B): - plain: The total number of L2 requests to Infinity Fabric to write or atomically - update 32B of data to any memory location, per normalization unit. - rst: The total number of L2 requests to Infinity Fabric to write or atomically update - 32B of data to any memory location, per :ref:`normalization unit <normalization-units>`. - See :ref:`l2-request-flow` for more detail. - unit: Requests per normalization unit - Write and Atomic (Uncached): - plain: The total number of L2 requests to Infinity Fabric to write or atomically - update 32B or 64B of uncached data, per normalization unit. - rst: The total number of L2 requests to Infinity Fabric to write or atomically update - 32B or 64B of :ref:`uncached data <memory-type>`, per :ref:`normalization - unit <normalization-units>`. See :ref:`l2-request-flow` for more detail. - unit: Requests per normalization unit - Write and Atomic (64B): - plain: The total number of L2 requests to Infinity Fabric to write or atomically - update 64B of data in any memory location, per normalization unit. 
- rst: The total number of L2 requests to Infinity Fabric to write or atomically update - 64B of data in any memory location, per :ref:`normalization unit <normalization-units>`. - See :ref:`l2-request-flow` for more detail. - unit: Requests per normalization unit - HBM Write and Atomic: - plain: The total number of L2 requests to Infinity Fabric to write or atomically - update 32B or 64B of data in the accelerator's local HBM, per normalization - unit. - rst: The total number of L2 requests to Infinity Fabric to write or atomically update - 32B or 64B of data in the accelerator's local HBM, per :ref:`normalization - unit <normalization-units>`. See :ref:`l2-request-flow` for more detail. - unit: Requests per normalization unit - Remote Write and Atomic: - plain: The total number of L2 requests to Infinity Fabric to write or atomically - update 32B or 64B of data in any memory location other than the accelerator's - local HBM, per normalization unit. - rst: The total number of L2 requests to Infinity Fabric to write or atomically update - 32B or 64B of data in any memory location other than the accelerator's local - HBM, per :ref:`normalization unit <normalization-units>`. See :ref:`l2-request-flow` - for more detail. - unit: Requests per normalization unit - Write Bandwidth - PCIe: - plain: Total number of bytes due to L2 write requests due to PCIe traffic, divided by total duration. - rst: Total number of bytes due to L2 write requests due to PCIe traffic, divided by total duration. - unit: Gbps - "Write Bandwidth - Infinity Fabric\u2122": - plain: Total number of bytes due to L2 write requests due to Infinity Fabric traffic, divided by total duration. - rst: Total number of bytes due to L2 write requests due to Infinity Fabric traffic, divided by total duration. - unit: Gbps - Write Bandwidth - HBM: - plain: Total number of bytes due to L2 write requests due to HBM traffic, divided by total duration.
- rst: Total number of bytes due to L2 write requests due to HBM traffic, divided by total duration. - unit: Gbps - Atomic Bandwidth - PCIe: - plain: Total number of bytes due to L2 atomic requests due to PCIe traffic, divided by total duration. - rst: Total number of bytes due to L2 atomic requests due to PCIe traffic, divided by total duration. - unit: Gbps - "Atomic Bandwidth - Infinity Fabric\u2122": - plain: Total number of bytes due to L2 atomic requests due to Infinity Fabric traffic, divided by total duration. - rst: Total number of bytes due to L2 atomic requests due to Infinity Fabric traffic, divided by total duration. - unit: Gbps - Atomic Bandwidth - HBM: - plain: Total number of bytes due to L2 atomic requests due to HBM traffic, divided by total duration. - rst: Total number of bytes due to L2 atomic requests due to HBM traffic, divided by total duration. - unit: Gbps - Atomic: - plain: The total number of L2 requests to Infinity Fabric to atomically update - 32B or 64B of data in any memory location, per normalization unit. See Request - flow for more detail. Note that on current CDNA accelerators, such as the - MI2XX, requests are only considered atomic by Infinity Fabric if they are - targeted at non-write-cacheable memory, such as fine-grained memory allocations - or uncached memory allocations on the MI2XX. - rst: The total number of L2 requests to Infinity Fabric to atomically update 32B - or 64B of data in any memory location, per :ref:`normalization unit <normalization-units>`. - See :ref:`l2-request-flow` for more detail. Note that on current CDNA accelerators, - such as the :ref:`MI2XX <mixxx-note>`, requests are only considered *atomic* - by Infinity Fabric if they are targeted at non-write-cacheable memory, such - as :ref:`fine-grained memory <memory-type>` allocations or :ref:`uncached - memory <memory-type>` allocations on the MI2XX. 
- unit: Requests per normalization unit - Read Stall: - plain: |- - The ratio of the total number of cycles the L2-Fabric interface was - stalled on a read request to any destination (local HBM, remote PCIe® - connected accelerator or CPU, or remote Infinity Fabric connected accelerator - or CPU) over the total active L2 cycles. - rst: |- - The ratio of the total number of cycles the L2-Fabric interface was stalled - on a read request to any destination (local HBM, remote PCIe® connected - accelerator or CPU, or remote Infinity Fabric connected accelerator [#inf]_ - or CPU) over the :ref:`total active L2 cycles <total-active-l2-cycles>`. - unit: Percent - Write Stall: - plain: The ratio of the total number of cycles the L2-Fabric interface was stalled - on a write or atomic request to any destination (local HBM, remote accelerator - or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected - accelerator or CPU) over the total active L2 cycles. - rst: The ratio of the total number of cycles the L2-Fabric interface was stalled - on a write or atomic request to any destination (local HBM, remote accelerator - or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected - accelerator [#inf]_ or CPU) over the :ref:`total active L2 cycles <total-active-l2-cycles>`. - unit: Percent - Read - PCIe Stall: - plain: The number of cycles the L2-Fabric interface was stalled on read requests - to remote PCIe connected accelerators or CPUs as a percent of the total active - L2 cycles. - rst: The number of cycles the L2-Fabric interface was stalled on read requests - to remote PCIe connected accelerators [#inf]_ or CPUs as a percent of the - :ref:`total active L2 cycles <total-active-l2-cycles>`. - unit: Percent - Read - Infinity Fabric Stall: - plain: The number of cycles the L2-Fabric interface was stalled on read requests - to remote Infinity Fabric connected accelerators or CPUs as a percent of the - total active L2 cycles.
- rst: The number of cycles the L2-Fabric interface was stalled on read requests - to remote Infinity Fabric connected accelerators [#inf]_ or CPUs as a percent - of the :ref:`total active L2 cycles <total-active-l2-cycles>`. - unit: Percent - Read - HBM Stall: - plain: The number of cycles the L2-Fabric interface was stalled on read requests - to the accelerator's local HBM as a percent of the total active L2 cycles. - rst: The number of cycles the L2-Fabric interface was stalled on read requests - to the accelerator's local HBM as a percent of the :ref:`total active L2 cycles - <total-active-l2-cycles>`. - unit: Percent - Write - PCIe Stall: - plain: The number of cycles the L2-Fabric interface was stalled on write or - atomic requests to remote PCIe connected accelerators or CPUs as a percent - of the total active L2 cycles. - rst: The number of cycles the L2-Fabric interface was stalled on write or atomic - requests to remote PCIe connected accelerators [#inf]_ or CPUs as a percent - of the :ref:`total active L2 cycles <total-active-l2-cycles>`. - unit: Percent - Write - Infinity Fabric Stall: - plain: The number of cycles the L2-Fabric interface was stalled on write or - atomic requests to remote Infinity Fabric connected accelerators or CPUs as - a percent of the total active L2 cycles. - rst: The number of cycles the L2-Fabric interface was stalled on write or atomic - requests to remote Infinity Fabric connected accelerators [#inf]_ or CPUs - as a percent of the :ref:`total active L2 cycles <total-active-l2-cycles>`. - unit: Percent - Write - HBM Stall: - plain: The number of cycles the L2-Fabric interface was stalled on write or - atomic requests to accelerator's local HBM as a percent of the total active - L2 cycles. - rst: The number of cycles the L2-Fabric interface was stalled on write or atomic - requests to accelerator's local HBM as a percent of the total active L2 cycles. 
- unit: Percent -- id: 1800 - title: L2 Cache (per Channel) - data source: - - metric_table: - id: 1801 - title: Aggregate Stats (All channels) - header: - metric: Metric - avg: Avg - std dev: Std Dev - min: Min - max: Max - unit: Unit - metric: - gfx90a: - L2 Cache Hit Rate: - avg: AVG(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 * - TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) - + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + - (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 - * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 - * TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100 - * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100 - * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) + (100 - * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100 - * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100 - * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) - + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] - + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) - + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] - + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) - + 
(TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30] - + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) - + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] - + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) - + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] - + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) - + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] - + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) - std dev: STD(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 - * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * - TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) - + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + - (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + - (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + - (100 * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) + - (100 * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) + - (100 * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + - (100 * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + - (100 * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) 
/ - ((((((((((((((((((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] - + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) - + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] - + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) - + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] - + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) - + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) + (TCC_MISS[16] - + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + (TCC_MISS[18] + TCC_HIT[18])) - + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + TCC_HIT[20])) + (TCC_MISS[21] - + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + (TCC_MISS[23] + TCC_HIT[23])) - + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + TCC_HIT[25])) + (TCC_MISS[26] - + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + (TCC_MISS[28] + TCC_HIT[28])) - + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30] + TCC_HIT[30])) + (TCC_MISS[31] - + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) - + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] - + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) - + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] - + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) - + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] - + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) - + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + (TCC_MISS[18] - + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + TCC_HIT[20])) - + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + (TCC_MISS[23] - + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + TCC_HIT[25])) - + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + 
(TCC_MISS[28] - + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30] + TCC_HIT[30])) - + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) - min: MIN(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 * - TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) - + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + - (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 - * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 - * TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100 - * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100 - * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) + (100 - * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100 - * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100 - * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) - + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] - + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) - + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] - + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) - + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30] - + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if 
(((((((((((((((((((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) - + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] - + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) - + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] - + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) - + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30] - + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) - max: MAX(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 * - TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) - + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + - (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 - * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 - * TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100 - * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100 - * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) + (100 - * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100 - * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100 - * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + 
(TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) - + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] - + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) - + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] - + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) - + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30] - + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) - + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] - + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) - + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] - + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) - + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30] - + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 
0) else None)) - unit: pct - gfx941: - L2 Cache Hit Rate: - avg: AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 - * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) - + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) - + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) - + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] - + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) - + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] - + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) - + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] - + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) - if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) - + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] - + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) - + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] - + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) - + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] - + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) - std dev: STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + - (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 - * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * - TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) - + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) - + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) - + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + 
TCC_HIT[2])) + (TCC_MISS[3] - + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) - + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] - + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) - + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] - + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) - if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) - + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] - + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) - + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] - + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) - + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] - + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) - min: MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 - * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) - + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) - + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) - + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] - + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) - + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] - + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) - + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] - + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) - if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) - + (TCC_MISS[2] + TCC_HIT[2])) + 
(TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] - + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) - + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] - + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) - + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] - + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) - max: MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 - * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) - + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) - + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) - + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] - + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) - + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] - + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) - + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] - + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) - if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) - + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] - + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) - + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] - + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) - + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] - + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) - unit: pct - gfx940: - L2 Cache Hit Rate: - avg: AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 - * 
TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) - + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) - + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) - + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] - + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) - + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] - + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) - + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] - + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) - if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) - + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] - + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) - + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] - + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) - + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] - + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) - std dev: STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + - (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 - * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * - TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) - + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) - + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) - + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] - + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) - + (TCC_MISS[6] + TCC_HIT[6])) + 
(TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] - + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) - + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] - + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) - if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) - + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] - + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) - + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] - + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) - + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] - + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) - min: MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 - * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) - + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) - + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) - + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] - + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) - + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] - + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) - + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] - + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) - if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) - + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] - + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) - + (TCC_MISS[7] + 
TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] - + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) - + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] - + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) - max: MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 - * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) - + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) - + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) - + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] - + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) - + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] - + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) - + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] - + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) - if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) - + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] - + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) - + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] - + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) - + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] - + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) - unit: pct - gfx942: - L2 Cache Hit Rate: - avg: AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 - * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * 
TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) - + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) - + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) - + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] - + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) - + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] - + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) - + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] - + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) - if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) - + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] - + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) - + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] - + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) - + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] - + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) - std dev: STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + - (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 - * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * - TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) - + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) - + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) - + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] - + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) - + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] - + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) - + 
(TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] - + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) - if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) - + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] - + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) - + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] - + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) - + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] - + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) - min: MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 - * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) - + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) - + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) - + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] - + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) - + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] - + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) - + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] - + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) - if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) - + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] - + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) - + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] - + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + 
TCC_HIT[11])) - + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] - + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) - max: MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 - * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) - + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) - + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) - + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] - + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) - + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] - + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) - + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] - + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) - if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) - + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] - + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) - + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] - + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) - + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] - + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) - unit: pct - gfx950: - L2 Cache Hit Rate: - avg: AVG(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 - * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) - + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) 
+ (100 * TCC_HIT[14])) - + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) - + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] - + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) - + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] - + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) - + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] - + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) - if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) - + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] - + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) - + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] - + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) - + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] - + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) - std dev: STD(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + - (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 - * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * - TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) - + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) - + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) - + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] - + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) - + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] - + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) - + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] - + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + 
(TCC_MISS[15] + TCC_HIT[15]))) - if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) - + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] - + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) - + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] - + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) - + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] - + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) - min: MIN(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 - * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) - + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) - + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) - + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] - + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) - + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] - + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) - + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] - + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) - if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) - + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] - + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) - + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] - + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) - + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] - + TCC_HIT[14])) + (TCC_MISS[15] + 
TCC_HIT[15])) != 0) else None) - max: MAX(((((((((((((((((100 * TCC_HIT[0]) + (100 * TCC_HIT[1])) + (100 - * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) + (100 * - TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + (100 * TCC_HIT[8])) - + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 * TCC_HIT[11])) - + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 * TCC_HIT[14])) - + (100 * TCC_HIT[15])) / (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) - + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] - + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) - + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] - + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) - + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] - + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15]))) - if (((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) - + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] - + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) - + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] - + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) - + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] - + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) != 0) else None) - unit: pct - gfx908: - L2 Cache Hit Rate: - avg: AVG(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 * - TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) - + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + - (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 - * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 - * TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100 - * TCC_HIT[17])) + (100 * 
TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100 - * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) + (100 - * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100 - * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100 - * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) - + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] - + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) - + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] - + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) - + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30] - + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) - + 
(TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] - + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) - + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] - + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) - + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[29] + TCC_HIT[29])) + (TCC_MISS[30] - + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) - std dev: STD(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 - * TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * - TCC_HIT[4])) + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) - + (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + - (100 * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + - (100 * TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + - (100 * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) + - (100 * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) + - (100 * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + - (100 * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + - (100 * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / - ((((((((((((((((((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) + (TCC_MISS[1] - + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] + TCC_HIT[3])) - + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) + (TCC_MISS[6] - + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] + TCC_HIT[8])) - + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) + (TCC_MISS[11] - + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] + TCC_HIT[13])) - + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) + (TCC_MISS[16] - + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + (TCC_MISS[18] + TCC_HIT[18])) - + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + TCC_HIT[20])) + (TCC_MISS[21] - + TCC_HIT[21])) 
+ (TCC_MISS[22] + TCC_HIT[22])) + (TCC_MISS[23] + TCC_HIT[23])) - + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + TCC_HIT[25])) + (TCC_MISS[26] - + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + (TCC_MISS[28] + TCC_HIT[28])) - + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30] + TCC_HIT[30])) + (TCC_MISS[31] - + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] + TCC_HIT[0]) - + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) + (TCC_MISS[3] - + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] + TCC_HIT[5])) - + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) + (TCC_MISS[8] - + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] + TCC_HIT[10])) - + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) + (TCC_MISS[13] - + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] + TCC_HIT[15])) - + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) + (TCC_MISS[18] - + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] + TCC_HIT[20])) - + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) + (TCC_MISS[23] - + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] + TCC_HIT[25])) - + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) + (TCC_MISS[28] - + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30] + TCC_HIT[30])) - + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) - min: MIN(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 * - TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) - + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + - (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 - * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 - * TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100 - * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100 - * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) + (100 - * 
TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100 - * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100 - * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) - + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] - + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) - + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] - + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) - + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30] - + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) - + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] - + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + 
(TCC_MISS[22] + TCC_HIT[22])) - + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] - + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) - + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30] - + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) - max: MAX(((((((((((((((((((((((((((((((((((100 * TCC_HIT[0]) + (100 * - TCC_HIT[1])) + (100 * TCC_HIT[2])) + (100 * TCC_HIT[3])) + (100 * TCC_HIT[4])) - + (100 * TCC_HIT[5])) + (100 * TCC_HIT[6])) + (100 * TCC_HIT[7])) + - (100 * TCC_HIT[8])) + (100 * TCC_HIT[9])) + (100 * TCC_HIT[10])) + (100 - * TCC_HIT[11])) + (100 * TCC_HIT[12])) + (100 * TCC_HIT[13])) + (100 - * TCC_HIT[14])) + (100 * TCC_HIT[15])) + (100 * TCC_HIT[16])) + (100 - * TCC_HIT[17])) + (100 * TCC_HIT[18])) + (100 * TCC_HIT[19])) + (100 - * TCC_HIT[20])) + (100 * TCC_HIT[21])) + (100 * TCC_HIT[22])) + (100 - * TCC_HIT[23])) + (100 * TCC_HIT[24])) + (100 * TCC_HIT[25])) + (100 - * TCC_HIT[26])) + (100 * TCC_HIT[27])) + (100 * TCC_HIT[28])) + (100 - * TCC_HIT[29])) + (100 * TCC_HIT[30])) + (100 * TCC_HIT[31])) / ((((((((((((((((((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) - + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] - + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) - + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] - + TCC_HIT[25])) + 
(TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) - + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30] - + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31]))) if (((((((((((((((((((((((((((((((((TCC_MISS[0] - + TCC_HIT[0]) + (TCC_MISS[1] + TCC_HIT[1])) + (TCC_MISS[2] + TCC_HIT[2])) - + (TCC_MISS[3] + TCC_HIT[3])) + (TCC_MISS[4] + TCC_HIT[4])) + (TCC_MISS[5] - + TCC_HIT[5])) + (TCC_MISS[6] + TCC_HIT[6])) + (TCC_MISS[7] + TCC_HIT[7])) - + (TCC_MISS[8] + TCC_HIT[8])) + (TCC_MISS[9] + TCC_HIT[9])) + (TCC_MISS[10] - + TCC_HIT[10])) + (TCC_MISS[11] + TCC_HIT[11])) + (TCC_MISS[12] + TCC_HIT[12])) - + (TCC_MISS[13] + TCC_HIT[13])) + (TCC_MISS[14] + TCC_HIT[14])) + (TCC_MISS[15] - + TCC_HIT[15])) + (TCC_MISS[16] + TCC_HIT[16])) + (TCC_MISS[17] + TCC_HIT[17])) - + (TCC_MISS[18] + TCC_HIT[18])) + (TCC_MISS[19] + TCC_HIT[19])) + (TCC_MISS[20] - + TCC_HIT[20])) + (TCC_MISS[21] + TCC_HIT[21])) + (TCC_MISS[22] + TCC_HIT[22])) - + (TCC_MISS[23] + TCC_HIT[23])) + (TCC_MISS[24] + TCC_HIT[24])) + (TCC_MISS[25] - + TCC_HIT[25])) + (TCC_MISS[26] + TCC_HIT[26])) + (TCC_MISS[27] + TCC_HIT[27])) - + (TCC_MISS[28] + TCC_HIT[28])) + (TCC_MISS[28] + TCC_HIT[29])) + (TCC_MISS[30] - + TCC_HIT[30])) + (TCC_MISS[31] + TCC_HIT[31])) != 0) else None)) - unit: pct - - metric_table: - id: 1802 - title: L2 Cache Hit Rate (pct) - header: - metric: Channel - expr: Expression - metric: - gfx90a: - ::_1: - expr: (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1] - + TCC_MISS[::_1]) != 0) else None) - placeholder_range: - ::_1: $total_l2_chan - gfx941: - ::_1: - expr: (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1] - + TCC_MISS[::_1]) != 0) else None) - placeholder_range: - ::_1: $total_l2_chan - gfx940: - ::_1: - expr: (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1] - + TCC_MISS[::_1]) != 0) else None) - placeholder_range: - ::_1: $total_l2_chan - gfx942: - ::_1: - expr: (((100 * 
TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1] - + TCC_MISS[::_1]) != 0) else None) - placeholder_range: - ::_1: $total_l2_chan - gfx950: - ::_1: - expr: (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1] - + TCC_MISS[::_1]) != 0) else None) - placeholder_range: - ::_1: $total_l2_chan - gfx908: - ::_1: - expr: (((100 * TCC_HIT[::_1]) / (TCC_HIT[::_1] + TCC_MISS[::_1])) if ((TCC_HIT[::_1] - + TCC_MISS[::_1]) != 0) else None) - placeholder_range: - ::_1: $total_l2_chan - cli_style: simple_box - tui_style: simple_box - - metric_table: - id: 1803 - title: L2 Requests (per normUnit) - header: - metric: Channel - expr: Expression - metric: - gfx90a: - ::_1: - expr: (TO_INT(TCC_REQ[::_1]) / $denom) - placeholder_range: - ::_1: $total_l2_chan - gfx941: - ::_1: - expr: (TO_INT(TCC_REQ[::_1]) / $denom) - placeholder_range: - ::_1: $total_l2_chan - gfx940: - ::_1: - expr: (TO_INT(TCC_REQ[::_1]) / $denom) - placeholder_range: - ::_1: $total_l2_chan - gfx942: - ::_1: - expr: (TO_INT(TCC_REQ[::_1]) / $denom) - placeholder_range: - ::_1: $total_l2_chan - gfx950: - ::_1: - expr: (TO_INT(TCC_REQ[::_1]) / $denom) - placeholder_range: - ::_1: $total_l2_chan - gfx908: - ::_1: - expr: (TO_INT(TCC_REQ[::_1]) / $denom) - placeholder_range: - ::_1: $total_l2_chan - cli_style: simple_box - tui_style: simple_box - - metric_table: - id: 1804 - title: L2 Requests (per normUnit) - header: - metric: Channel - read req: L2 Read - write req: L2 Write - atomic req: L2 Atomic - metric: - gfx90a: - ::_1: - read req: AVG((TO_INT(TCC_READ[::_1]) / $denom)) - write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom)) - atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom)) - placeholder_range: - ::_1: $total_l2_chan - gfx941: - ::_1: - read req: AVG((TO_INT(TCC_READ[::_1]) / $denom)) - write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom)) - atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom)) - placeholder_range: - ::_1: $total_l2_chan - gfx940: - ::_1: - read req: 
AVG((TO_INT(TCC_READ[::_1]) / $denom)) - write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom)) - atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom)) - placeholder_range: - ::_1: $total_l2_chan - gfx942: - ::_1: - read req: AVG((TO_INT(TCC_READ[::_1]) / $denom)) - write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom)) - atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom)) - placeholder_range: - ::_1: $total_l2_chan - gfx950: - ::_1: - read req: AVG((TO_INT(TCC_READ[::_1]) / $denom)) - write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom)) - atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom)) - placeholder_range: - ::_1: $total_l2_chan - gfx908: - ::_1: - read req: AVG((TO_INT(TCC_READ[::_1]) / $denom)) - write req: AVG((TO_INT(TCC_WRITE[::_1]) / $denom)) - atomic req: AVG((TO_INT(TCC_ATOMIC[::_1]) / $denom)) - placeholder_range: - ::_1: $total_l2_chan - cli_style: simple_multiple_bar - tui_style: simple_multiple_bar - - metric_table: - id: 1805 - title: L2-Fabric Requests (per normUnit) - header: - metric: Channel - read req: L2-Fabric Read - write req: L2-Fabric Write and Atomic - atomic req: L2-Fabric Atomic - metric: - gfx90a: - ::_1: - read req: AVG((TO_INT(TCC_EA_RDREQ[::_1]) / $denom)) - write req: AVG((TO_INT(TCC_EA_WRREQ[::_1]) / $denom)) - atomic req: AVG((TO_INT(TCC_EA_ATOMIC[::_1]) / $denom)) - placeholder_range: - ::_1: $total_l2_chan - gfx941: - ::_1: - read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom)) - write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom)) - atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom)) - placeholder_range: - ::_1: $total_l2_chan - gfx940: - ::_1: - read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom)) - write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom)) - atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom)) - placeholder_range: - ::_1: $total_l2_chan - gfx942: - ::_1: - read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom)) - write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom)) - atomic req: 
AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom)) - placeholder_range: - ::_1: $total_l2_chan - gfx950: - ::_1: - read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom)) - write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom)) - atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom)) - placeholder_range: - ::_1: $total_l2_chan - gfx908: - ::_1: - read req: AVG((TO_INT(TCC_EA0_RDREQ[::_1]) / $denom)) - write req: AVG((TO_INT(TCC_EA0_WRREQ[::_1]) / $denom)) - atomic req: AVG((TO_INT(TCC_EA0_ATOMIC[::_1]) / $denom)) - placeholder_range: - ::_1: $total_l2_chan - cli_style: simple_multiple_bar - tui_style: simple_multiple_bar - - metric_table: - id: 1806 - title: L2-Fabric Read Latency (Cycles) - header: - metric: Channel - expr: Expression - metric: - gfx90a: - ::_1: - expr: ((TCC_EA_RDREQ_LEVEL[::_1] / TCC_EA_RDREQ[::_1]) if (TCC_EA_RDREQ[::_1] - != 0) else None) - placeholder_range: - ::_1: $total_l2_chan - gfx941: - ::_1: - expr: ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1] - != 0) else None) - placeholder_range: - ::_1: $total_l2_chan - gfx940: - ::_1: - expr: ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1] - != 0) else None) - placeholder_range: - ::_1: $total_l2_chan - gfx942: - ::_1: - expr: ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1] - != 0) else None) - placeholder_range: - ::_1: $total_l2_chan - gfx950: - ::_1: - expr: ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1] - != 0) else None) - placeholder_range: - ::_1: $total_l2_chan - gfx908: - ::_1: - expr: ((TCC_EA0_RDREQ_LEVEL[::_1] / TCC_EA0_RDREQ[::_1]) if (TCC_EA0_RDREQ[::_1] - != 0) else None) - placeholder_range: - ::_1: $total_l2_chan - cli_style: simple_box - tui_style: simple_box - - metric_table: - id: 1807 - title: L2-Fabric Write and Atomic Latency (Cycles) - header: - metric: Channel - expr: Expression - metric: - gfx90a: - ::_1: - expr: ((TCC_EA_WRREQ_LEVEL[::_1] / TCC_EA_WRREQ[::_1]) if 
(TCC_EA_WRREQ[::_1] - != 0) else None) - placeholder_range: - ::_1: $total_l2_chan - gfx941: - ::_1: - expr: ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1] - != 0) else None) - placeholder_range: - ::_1: $total_l2_chan - gfx940: - ::_1: - expr: ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1] - != 0) else None) - placeholder_range: - ::_1: $total_l2_chan - gfx942: - ::_1: - expr: ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1] - != 0) else None) - placeholder_range: - ::_1: $total_l2_chan - gfx950: - ::_1: - expr: ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1] - != 0) else None) - placeholder_range: - ::_1: $total_l2_chan - gfx908: - ::_1: - expr: ((TCC_EA0_WRREQ_LEVEL[::_1] / TCC_EA0_WRREQ[::_1]) if (TCC_EA0_WRREQ[::_1] - != 0) else None) - placeholder_range: - ::_1: $total_l2_chan - cli_style: simple_box - tui_style: simple_box - - metric_table: - id: 1808 - title: L2-Fabric Atomic Latency (Cycles) - header: - metric: Channel - expr: Expression - metric: - gfx90a: - ::_1: - expr: ((TCC_EA_ATOMIC_LEVEL[::_1] / TCC_EA_ATOMIC[::_1]) if (TCC_EA_ATOMIC[::_1] - != 0) else 0) - placeholder_range: - ::_1: $total_l2_chan - gfx941: - ::_1: - expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if (TCC_EA0_ATOMIC[::_1] - != 0) else 0) - placeholder_range: - ::_1: $total_l2_chan - gfx940: - ::_1: - expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if (TCC_EA0_ATOMIC[::_1] - != 0) else 0) - placeholder_range: - ::_1: $total_l2_chan - gfx942: - ::_1: - expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if (TCC_EA0_ATOMIC[::_1] - != 0) else 0) - placeholder_range: - ::_1: $total_l2_chan - gfx950: - ::_1: - expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if (TCC_EA0_ATOMIC[::_1] - != 0) else 0) - placeholder_range: - ::_1: $total_l2_chan - gfx908: - ::_1: - expr: ((TCC_EA0_ATOMIC_LEVEL[::_1] / TCC_EA0_ATOMIC[::_1]) if 
(TCC_EA0_ATOMIC[::_1] - != 0) else 0) - placeholder_range: - ::_1: $total_l2_chan - cli_style: simple_box - tui_style: simple_box - - metric_table: - id: 1809 - title: L2-Fabric Read Stall (Cycles per normUnit) - header: - metric: Channel - ea read stall - pcie: L2-Fabric Read Stall (PCIe) - ea read stall - if: "L2-Fabric Read Stall (Infinity Fabric\u2122)" - ea read stall - hbm: L2-Fabric Read Stall (HBM) - metric: - gfx90a: - ::_1: - ea read stall - pcie: None - ea read stall - if: None - ea read stall - hbm: None - placeholder_range: - ::_1: $total_l2_chan - gfx941: - ::_1: - ea read stall - pcie: None - ea read stall - if: None - ea read stall - hbm: None - placeholder_range: - ::_1: $total_l2_chan - gfx940: - ::_1: - ea read stall - pcie: None - ea read stall - if: None - ea read stall - hbm: None - placeholder_range: - ::_1: $total_l2_chan - gfx942: - ::_1: - ea read stall - pcie: None - ea read stall - if: None - ea read stall - hbm: None - placeholder_range: - ::_1: $total_l2_chan - gfx950: - ::_1: - ea read stall - pcie: AVG((TO_INT(TCC_EA0_RDREQ_IO_CREDIT_STALL[::_1]) - / $denom)) - ea read stall - if: AVG((TO_INT(TCC_EA0_RDREQ_GMI_CREDIT_STALL[::_1]) - / $denom)) - ea read stall - hbm: AVG((TO_INT(TCC_EA0_RDREQ_DRAM_CREDIT_STALL[::_1]) - / $denom)) - placeholder_range: - ::_1: $total_l2_chan - gfx908: - ::_1: - ea read stall - pcie: None - ea read stall - if: None - ea read stall - hbm: None - placeholder_range: - ::_1: $total_l2_chan - cli_style: simple_multiple_bar - tui_style: simple_multiple_bar - - metric_table: - id: 1810 - title: L2-Fabric Write and Atomic Stall (Cycles per normUnit) - header: - metric: Channel - ea write stall - pcie: L2-Fabric Write Stall (PCIe) - ea write stall - if: "L2-Fabric Write Stall (Infinity Fabric\u2122)" - ea write stall - hbm: L2-Fabric Write Stall (HBM) - ea write stall - starve: L2-Fabric Write Starve - metric: - gfx90a: - ::_1: - ea write stall - pcie: None - ea write stall - if: None - ea write stall - hbm: None 
- ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1]) - / $denom)) - placeholder_range: - ::_1: $total_l2_chan - gfx941: - ::_1: - ea write stall - pcie: None - ea write stall - if: None - ea write stall - hbm: None - ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1]) - / $denom)) - placeholder_range: - ::_1: $total_l2_chan - gfx940: - ::_1: - ea write stall - pcie: None - ea write stall - if: None - ea write stall - hbm: None - ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1]) - / $denom)) - placeholder_range: - ::_1: $total_l2_chan - gfx942: - ::_1: - ea write stall - pcie: None - ea write stall - if: None - ea write stall - hbm: None - ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1]) - / $denom)) - placeholder_range: - ::_1: $total_l2_chan - gfx950: - ::_1: - ea write stall - pcie: AVG((TO_INT(TCC_EA0_WRREQ_IO_CREDIT_STALL[::_1]) - / $denom)) - ea write stall - if: AVG((TO_INT(TCC_EA0_WRREQ_GMI_CREDIT_STALL[::_1]) - / $denom)) - ea write stall - hbm: AVG((TO_INT(TCC_EA0_WRREQ_DRAM_CREDIT_STALL[::_1]) - / $denom)) - ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1]) - / $denom)) - placeholder_range: - ::_1: $total_l2_chan - gfx908: - ::_1: - ea write stall - pcie: None - ea write stall - if: None - ea write stall - hbm: None - ea write stall - starve: AVG((TO_INT(TCC_TOO_MANY_EA_WRREQS_STALL[::_1]) - / $denom)) - placeholder_range: - ::_1: $total_l2_chan - cli_style: simple_multiple_bar - tui_style: simple_multiple_bar - - metric_table: - id: 1812 - title: L2-Fabric (128B read requests per normUnit) - header: - metric: Channel - expr: Expression - metric: - gfx90a: - ::_1: - expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom) - placeholder_range: - ::_1: $total_l2_chan - gfx941: - ::_1: - expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom) - placeholder_range: - ::_1: $total_l2_chan - gfx940: - ::_1: - expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom) - placeholder_range: - ::_1: 
$total_l2_chan - gfx942: - ::_1: - expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom) - placeholder_range: - ::_1: $total_l2_chan - gfx950: - ::_1: - expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom) - placeholder_range: - ::_1: $total_l2_chan - gfx908: - ::_1: - expr: (TO_INT(TCC_BUBBLE[::_1]) / $denom) - placeholder_range: - ::_1: $total_l2_chan - cli_style: simple_box - tui_style: simple_box - metrics_description: - L2 Cache Hit Rate: - plain: The percent of total number of requests to the L2 from all clients that - hit in the cache. As noted in the Speed-of-Light section, this includes hit-on-miss - requests. - rst: The total number of requests to the L2 from all clients that hit in the cache. - As noted in the :ref:`Speed-of-Light <l2-sol>` section, this includes hit-on-miss - requests. - unit: Percent -- id: 2100 - title: PC Sampling - data source: - - pc_sampling_table: - id: 2101 - title: PC Sampling - source: ps_file - comparable: false diff --git a/projects/rocprofiler-compute/tools/unified_sets.yaml b/projects/rocprofiler-compute/tools/unified_sets.yaml deleted file mode 100644 index f94aa22435..0000000000 --- a/projects/rocprofiler-compute/tools/unified_sets.yaml +++ /dev/null @@ -1,176 +0,0 @@ ---- -# Pre-defined sets containing a collection of relevant metrics that can be collected in a single pass. -# To profile customized set(s), append to this yaml file. 
- -sets: -- title: Compute Throughput Utilization - set_option: compute_thruput_util - description: Placeholder - metric: - gfx908: - - 11.2.2 - - 11.2.3 - gfx90a: - - 11.2.3 - - 11.2.4 - - 11.2.5 - - 11.2.6 - gfx940: - - 11.2.2 - - 11.2.3 - - 11.2.4 - - 11.2.5 - gfx941: - - 11.2.2 - - 11.2.3 - - 11.2.4 - - 11.2.5 - gfx942: - - 11.2.2 - - 11.2.3 - - 11.2.4 - - 11.2.5 - gfx950: - - 11.2.2 - - 11.2.3 - - 11.2.5 - - 11.2.6 - -- title: Compute Throughput FLOPS - set_option: compute_thruput_flops - description: Placeholder - metric: - gfx908: - - 2.1.2 - - 2.1.3 - - 2.1.4 - - 2.1.5 - - 2.1.6 - gfx90a: - - 2.1.2 - - 2.1.3 - - 2.1.4 - - 2.1.5 - - 2.1.6 - gfx940: - - 2.1.2 - - 2.1.3 - - 2.1.4 - - 2.1.5 - - 2.1.6 - - 2.1.7 - gfx941: - - 2.1.2 - - 2.1.3 - - 2.1.4 - - 2.1.5 - - 2.1.6 - - 2.1.7 - gfx942: - - 2.1.2 - - 2.1.3 - - 2.1.4 - - 2.1.5 - - 2.1.6 - - 2.1.7 - gfx950: - - 2.1.2 - - 2.1.3 - - 2.1.4 - - 2.1.5 - - 2.1.6 - - 2.1.8 - -- title: Memory Throughput - set_option: mem_thruput - description: Placeholder - metric: - gfx908: - - 2.1.16 - - 2.1.17 - - 16.1.2 - - 17.1.0 - gfx90a: - - 2.1.16 - - 2.1.17 - - 16.1.2 - - 17.1.0 - gfx940: - - 2.1.17 - - 2.1.18 - - 16.1.2 - - 17.1.0 - gfx941: - - 2.1.17 - - 2.1.18 - - 16.1.2 - - 17.1.0 - gfx942: - - 2.1.17 - - 2.1.18 - - 16.1.2 - - 17.1.0 - gfx950: - - 2.1.18 - - 2.1.19 - - 16.1.2 - - 17.1.0 - -- title: Launch Stats - set_option: launch_stats - description: Placeholder - metric: - gfx908: - - 7.1.0 - - 7.1.1 - - 7.1.2 - - 7.1.5 - - 7.1.6 - - 7.1.7 - - 7.1.8 - - 7.1.9 - gfx90a: - - 7.1.0 - - 7.1.1 - - 7.1.2 - - 7.1.5 - - 7.1.6 - - 7.1.7 - - 7.1.8 - - 7.1.9 - gfx940: - - 7.1.0 - - 7.1.1 - - 7.1.2 - - 7.1.5 - - 7.1.6 - - 7.1.7 - - 7.1.8 - - 7.1.9 - gfx941: - - 7.1.0 - - 7.1.1 - - 7.1.2 - - 7.1.5 - - 7.1.6 - - 7.1.7 - - 7.1.8 - - 7.1.9 - gfx942: - - 7.1.0 - - 7.1.1 - - 7.1.2 - - 7.1.5 - - 7.1.6 - - 7.1.7 - - 7.1.8 - - 7.1.9 - gfx950: - - 7.1.0 - - 7.1.1 - - 7.1.2 - - 7.1.5 - - 7.1.6 - - 7.1.7 - - 7.1.8 - - 7.1.9