From c778acdb702dbd748e06dab1bf84bf82547b920a Mon Sep 17 00:00:00 2001
From: xuchen-amd <xuchen@amd.com>
Date: Wed, 19 Nov 2025 10:46:02 -0500
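Subject: [PATCH] [rocprof-compute] update yamls for docs (#1887)

Switch the multi-line metric descriptions in the docs metrics_description
YAML and in the per-architecture analysis configs from literal block
scalars (`|-`) to folded block scalars (`>-`), so that wrapped source lines
are folded into a single paragraph instead of keeping hard line breaks.

Also remove the "AUTOGENERATED FILE" header comment from
docs/data/metrics_description.yaml.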
---
 .../docs/data/metrics_description.yaml        |  75 +++++----
 .../gfx908/0200_system_speed_of_light.yaml    |  18 +--
 .../gfx908/0300_memory_chart.yaml             |   8 +-
 .../gfx908/0400_roofline.yaml                 |  26 ++--
 .../0500_command_processor_cpc_cpf.yaml       |   2 +-
 .../gfx908/0600_workgroup_manager_spi.yaml    |   2 +-
 .../gfx908/0700_wavefront.yaml                |  12 +-
 .../gfx908/1200_local_data_share_lds.yaml     |   2 +-
 .../gfx908/1300_instruction_cache.yaml        |   2 +-
 .../gfx908/1400_scalar_l1_data_cache.yaml     |   6 +-
 .../gfx908/1600_vector_l1_data_cache.yaml     |   2 +-
 .../gfx90a/0200_system_speed_of_light.yaml    |  18 +--
 .../gfx90a/0300_memory_chart.yaml             |   8 +-
 .../gfx90a/0400_roofline.yaml                 |  26 ++--
 .../0500_command_processor_cpc_cpf.yaml       |   2 +-
 .../gfx90a/0600_workgroup_manager_spi.yaml    |   2 +-
 .../gfx90a/0700_wavefront.yaml                |  12 +-
 .../1000_compute_units_instruction_mix.yaml   |   2 +-
 .../1100_compute_units_compute_pipeline.yaml  |  14 +-
 .../gfx90a/1200_local_data_share_lds.yaml     |   2 +-
 .../gfx90a/1300_instruction_cache.yaml        |   2 +-
 .../gfx90a/1400_scalar_l1_data_cache.yaml     |   6 +-
 .../gfx90a/1600_vector_l1_data_cache.yaml     |   2 +-
 .../gfx940/0200_system_speed_of_light.yaml    |  18 +--
 .../gfx940/0300_memory_chart.yaml             |   8 +-
 .../gfx940/0400_roofline.yaml                 |  26 ++--
 .../0500_command_processor_cpc_cpf.yaml       |   2 +-
 .../gfx940/0600_workgroup_manager_spi.yaml    |   2 +-
 .../gfx940/0700_wavefront.yaml                |  12 +-
 .../1000_compute_units_instruction_mix.yaml   |   2 +-
 .../1100_compute_units_compute_pipeline.yaml  |  14 +-
 .../gfx940/1200_local_data_share_lds.yaml     |   2 +-
 .../gfx940/1300_instruction_cache.yaml        |   2 +-
 .../gfx940/1400_scalar_l1_data_cache.yaml     |   6 +-
 .../gfx940/1600_vector_l1_data_cache.yaml     |   2 +-
 .../gfx941/0200_system_speed_of_light.yaml    |  18 +--
 .../gfx941/0300_memory_chart.yaml             |   8 +-
 .../gfx941/0400_roofline.yaml                 |  26 ++--
 .../0500_command_processor_cpc_cpf.yaml       |   2 +-
 .../gfx941/0600_workgroup_manager_spi.yaml    |   2 +-
 .../gfx941/0700_wavefront.yaml                |  12 +-
 .../1000_compute_units_instruction_mix.yaml   |   2 +-
 .../1100_compute_units_compute_pipeline.yaml  |  14 +-
 .../gfx941/1200_local_data_share_lds.yaml     |   2 +-
 .../gfx941/1300_instruction_cache.yaml        |   2 +-
 .../gfx941/1400_scalar_l1_data_cache.yaml     |   6 +-
 .../gfx941/1600_vector_l1_data_cache.yaml     |   2 +-
 .../gfx942/0200_system_speed_of_light.yaml    |  18 +--
 .../gfx942/0300_memory_chart.yaml             |   8 +-
 .../gfx942/0400_roofline.yaml                 |  26 ++--
 .../0500_command_processor_cpc_cpf.yaml       |   2 +-
 .../gfx942/0600_workgroup_manager_spi.yaml    |   2 +-
 .../gfx942/0700_wavefront.yaml                |  12 +-
 .../1000_compute_units_instruction_mix.yaml   |   2 +-
 .../1100_compute_units_compute_pipeline.yaml  |  14 +-
 .../gfx942/1200_local_data_share_lds.yaml     |   2 +-
 .../gfx942/1300_instruction_cache.yaml        |   2 +-
 .../gfx942/1400_scalar_l1_data_cache.yaml     |   6 +-
 .../gfx942/1600_vector_l1_data_cache.yaml     |   2 +-
 .../gfx950/0200_system_speed_of_light.yaml    |  18 +--
 .../gfx950/0300_memory_chart.yaml             |   8 +-
 .../gfx950/0400_roofline.yaml                 |  28 ++--
 .../0500_command_processor_cpc_cpf.yaml       |   2 +-
 .../gfx950/0600_workgroup_manager_spi.yaml    |   2 +-
 .../gfx950/0700_wavefront.yaml                |  12 +-
 .../1000_compute_units_instruction_mix.yaml   |   2 +-
 .../1100_compute_units_compute_pipeline.yaml  |  14 +-
 .../gfx950/1200_local_data_share_lds.yaml     |   2 +-
 .../gfx950/1300_instruction_cache.yaml        |   2 +-
 .../gfx950/1400_scalar_l1_data_cache.yaml     |   6 +-
 .../gfx950/1600_vector_l1_data_cache.yaml     |   2 +-
 .../gfx950/1700_l2_cache.yaml                 |   2 +-
 .../tools/autogen_hash.yaml                   | 118 +--------------
 .../config_management/.config_hashes.json     | 142 +++++++++---------
 .../tools/config_management/README.md         |   4 +-
 .../metric_description_manager.py             |  27 +---
 .../tools/config_management/utils.py          |   2 +-
 .../gfx908_metrics_description.yaml           |  82 +++++-----
 .../gfx90a_metrics_description.yaml           | 102 ++++++-------
 .../gfx940_metrics_description.yaml           | 106 ++++++-------
 .../gfx941_metrics_description.yaml           | 106 ++++++-------
 .../gfx942_metrics_description.yaml           | 106 ++++++-------
 .../gfx950_metrics_description.yaml           | 110 +++++++-------
 83 files changed, 708 insertions(+), 836 deletions(-)

diff --git a/projects/rocprofiler-compute/docs/data/metrics_description.yaml b/projects/rocprofiler-compute/docs/data/metrics_description.yaml
index 25635f32b8..4c60cf24a1 100644
--- a/projects/rocprofiler-compute/docs/data/metrics_description.yaml
+++ b/projects/rocprofiler-compute/docs/data/metrics_description.yaml
@@ -1,7 +1,6 @@
-# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py
 Wavefront launch stats:
   AGPRs:
-    rst: |-
+    rst: >-
       The number of accumulation vector general-purpose registers allocated
       for the kernel, see :ref:`AGPRs <desc-agprs>`. Note: this may not exactly match
       the number of AGPRs requested by the compiler due to allocation granularity.
@@ -12,7 +11,7 @@ Wavefront launch stats:
       total workgroup (or, block) size.
     unit: Work-Items
   LDS Allocation:
-    rst: |-
+    rst: >-
       The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory)
       allocated for this kernel. Note: This may also be larger than what was requested
       at compile time due to both allocation granularity and dynamic per-dispatch
@@ -23,7 +22,7 @@ Wavefront launch stats:
       <https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr>`_.
     unit: Wavefronts
   SGPRs:
-    rst: |-
+    rst: >-
       The number of scalar general-purpose registers allocated for the kernel, see
       :ref:`SALU <desc-salu>`. Note: this may not exactly match the number of
       SGPRs requested by the compiler due to allocation granularity.
@@ -38,14 +37,14 @@ Wavefront launch stats:
       as well as for register spills and restores.
     unit: Bytes per work-item
   Total Wavefronts:
-    rst: |-
+    rst: >-
       The total number of wavefronts launched as part of the kernel dispatch.
       On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront
       size is always 64 work-items. Thus, the total number of wavefronts should
       be equivalent to the ceiling of grid size divided by 64.
     unit: Wavefronts
   VGPRs:
-    rst: |-
+    rst: >-
       The number of architected vector general-purpose registers allocated for the
       kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the
       number of VGPRs requested by the compiler due to allocation granularity.
@@ -98,14 +97,14 @@ Wavefront runtime stats:
     rst: The total duration of the executed kernel in cycles.
     unit: Cycles
   Wave Cycles:
-    rst: |-
+    rst: >-
       The number of cycles a wavefront in the kernel dispatch spent resident
       on a compute unit per :ref:`normalization unit <normalization-units>`. This is
       averaged over all wavefronts in a kernel dispatch. Note: this should not
       be directly compared to the kernel cycles above.
     unit: Cycles per normalization unit
   Wavefront Occupancy:
-    rst: |-
+    rst: >-
       The time-averaged number of wavefronts resident on the accelerator over the
       lifetime of the kernel. Note: this metric may be inaccurate for short-running
       kernels (less than 1ms).
@@ -148,7 +147,7 @@ Overall instruction mix:
     unit: Instructions
 VALU arithmetic instruction mix:
   Conversion:
-    rst: |-
+    rst: >-
       The total number of type conversion instructions (such as converting data
       to or from F32\u2194F64) issued to the VALU per :ref:`normalization unit
       <normalization-units>`.
@@ -240,7 +239,7 @@ MFMA instruction mix:
     unit: Instructions per normalization unit
 Compute Speed-of-Light:
   MFMA FLOPs (BF16):
-    rst: |-
+    rst: >-
       The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 16-bit brain floating
       point operations from :ref:`VALU <desc-valu>` instructions. This is also
@@ -248,7 +247,7 @@ Compute Speed-of-Light:
       on the specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F16):
-    rst: |-
+    rst: >-
       The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 16-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -256,7 +255,7 @@ Compute Speed-of-Light:
       specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F32):
-    rst: |-
+    rst: >-
       The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 32-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -264,7 +263,7 @@ Compute Speed-of-Light:
       specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F64):
-    rst: |-
+    rst: >-
       The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 64-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -276,21 +275,21 @@ Compute Speed-of-Light:
       achievable on the specific accelerator.
     unit: GFLOPs
   MFMA IOPs (INT8):
-    rst: |-
+    rst: >-
       The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
       per second. Note: this does not include any 8-bit integer operations from
       :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
       of the peak theoretical INT8 MFMA operations achievable on the specific accelerator.
     unit: GFLOPs
   VALU FLOPs:
-    rst: |-
+    rst: >-
       The total floating-point operations executed per second on the :ref:`VALU
       <desc-valu>`. This is also presented as a percent of the peak theoretical
       FLOPs achievable on the specific accelerator. Note: this does not include
       any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.
     unit: GFLOPs
   VALU IOPs:
-    rst: |-
+    rst: >-
       The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
       This is also presented as a percent of the peak theoretical IOPs achievable
       on the specific accelerator. Note: this does not include any integer operations
@@ -361,7 +360,7 @@ Pipeline statistics:
     unit: Percent
 Arithmetic operations:
   BF16 OPs:
-    rst: |-
+    rst: >-
       The total number of 16-bit brain floating-point operations executed on
       either the :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
       unit <normalization-units>`. Note: on current CDNA accelerators, the VALU
@@ -388,7 +387,7 @@ Arithmetic operations:
       <normalization-units>`.
     unit: FLOP per normalization unit
   INT8 OPs:
-    rst: |-
+    rst: >-
       The total number of 8-bit integer operations executed on either the :ref:`VALU
       <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
       <normalization-units>`. Note: on current CDNA accelerators, the VALU has
@@ -460,7 +459,7 @@ LDS Statistics:
       acknowledgment) required for an LDS instruction to complete.
     unit: Cycles
   Mem Violations:
-    rst: |-
+    rst: >-
       The total number of out-of-bounds accesses made to the LDS, per :ref:`normalization
       unit <normalization-units>`. This is unused and expected to be zero in
       most configurations for modern CDNA\u2122 accelerators.
@@ -586,7 +585,7 @@ L1 Unified Translation Cache (UTCL1):
       per normalization unit.
     unit: Requests per normalization unit
   Permission Misses:
-    rst: |-
+    rst: >-
       The total number of translation requests that missed in the UTCL1 due
       to a permission error, per :ref:`normalization unit <normalization-units>`.
       This is unused and expected to be zero in most configurations for modern
@@ -925,7 +924,7 @@ L2-Fabric interface metrics:
       before data was returned to the L2.
     unit: Cycles
   Read Stall:
-    rst: |-
+    rst: >-
       The ratio of the total number of cycles the L2-Fabric interface was stalled
       on a read request to any destination (local HBM, remote PCIe\xAE connected
       accelerator or CPU, or remote Infinity Fabric connected accelerator [#inf]_
@@ -1198,7 +1197,7 @@ Scalar L1D Cache - L2 Interface:
       per :ref:`normalization unit <normalization-units>`.
     unit: Requests per normalization unit
   Stall Cycles:
-    rst: |-
+    rst: >-
       The total number of cycles the sL1D\u2194 :doc:`L2 <l2-cache>` interface
       was stalled, per :ref:`normalization unit <normalization-units>`.
     unit: Cycles per normalization unit
@@ -1208,7 +1207,7 @@ Scalar L1D Cache - L2 Interface:
       CDNA accelerators.
     unit: Requests per normalization unit
   sL1D-L2 BW:
-    rst: |-
+    rst: >-
       The total number of bytes read from, written to, or atomically updated
       across the sL1D\u2194:doc:`L2 <l2-cache>` interface, divided by total duration.
       Note that sL1D writes and atomics are typically
@@ -1227,7 +1226,7 @@ L1I Speed-of-Light:
       over the number of all L1I requests.
     unit: Percent
   L1I-L2 Bandwidth Utilization:
-    rst: |-
+    rst: >-
       The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth
       achieved. Calculated as the ratio of the total number of requests from
       the L1I to the L2 cache over the :ref:`total L1I-L2 interface cycles <total-l1i-cycles>`.
@@ -1286,7 +1285,7 @@ Workgroup manager utilizations:
       not fully saturated by the kernel, or a potential load-imbalance issue.
     unit: Percent
   Scheduler-Pipe Utilization:
-    rst: |-
+    rst: >-
       The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
       in the kernel where the scheduler-pipes were actively doing any work. Note: this
       value is expected to range between 0% and 25%. See :ref:`desc-spi`.
@@ -1332,7 +1331,7 @@ Workgroup Manager - Resource Allocation:
       lack of available :ref:`waveslots <desc-valu>`.
     unit: Percent
   Not-scheduled Rate (Scheduler-Pipe):
-    rst: |-
+    rst: >-
       The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
       in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
       due to a bottleneck within the scheduler-pipes rather than a lack of a CU
@@ -1341,7 +1340,7 @@ Workgroup Manager - Resource Allocation:
       description.
     unit: Percent
   Not-scheduled Rate (Workgroup Manager):
-    rst: |-
+    rst: >-
       The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
       in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
       due to a bottleneck within the workgroup manager rather than a lack of a
@@ -1362,7 +1361,7 @@ Workgroup Manager - Resource Allocation:
       or newer accelerators (and small for previous accelerators).
     unit: Percent
   Scheduler-Pipe Stall Rate:
-    rst: |-
+    rst: >-
       The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
       in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
       due to occupancy limitations (like a lack of a CU or :ref:`SIMD <desc-valu>`
@@ -1464,7 +1463,7 @@ System Speed-of-Light:
       over the total number of incoming cache line requests to the L2 cache.
     unit: Percent
   L2-Fabric Read BW:
-    rst: |-
+    rst: >-
       The number of bytes read by the L2 over the :ref:`Infinity Fabric\u2122
       interface <l2-fabric>` per unit time. This is also presented as a percent
       of the peak theoretical bandwidth achievable on the specific accelerator.
@@ -1490,7 +1489,7 @@ System Speed-of-Light:
       Conflict Rate).
     unit: Conflicts/Access
   MFMA FLOPs (BF16):
-    rst: |-
+    rst: >-
       The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
       operations executed per second. Note: this does not include any 16-bit brain
       floating point operations from :ref:`VALU <desc-valu>` instructions. This
@@ -1498,7 +1497,7 @@ System Speed-of-Light:
       achievable on the specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F16):
-    rst: |-
+    rst: >-
       The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 16-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -1506,7 +1505,7 @@ System Speed-of-Light:
       specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F32):
-    rst: |-
+    rst: >-
       The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 32-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -1514,7 +1513,7 @@ System Speed-of-Light:
       specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F64):
-    rst: |-
+    rst: >-
       The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 64-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -1522,7 +1521,7 @@ System Speed-of-Light:
       specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F8):
-    rst: |-
+    rst: >-
       The total number of 8-bit brain floating point :ref:`MFMA <desc-mfma>`
       operations executed per second. Note: this does not include any 16-bit brain
       floating point operations from :ref:`VALU <desc-valu>` instructions. This
@@ -1531,7 +1530,7 @@ System Speed-of-Light:
       series and later only.
     unit: GFLOPs
   MFMA IOPs (Int8):
-    rst: |-
+    rst: >-
       The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
       per second. Note: this does not include any 8-bit integer operations from
       :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
@@ -1562,14 +1561,14 @@ System Speed-of-Light:
       time-averaged over all VALU instructions run on all wavefronts in the kernel.
     unit: Work-items
   VALU FLOPs:
-    rst: |-
+    rst: >-
       The total floating-point operations executed per second on the :ref:`VALU
       <desc-valu>`. This is also presented as a percent of the peak theoretical
       FLOPs achievable on the specific accelerator. Note: this does not include
       any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.
     unit: GFLOPs
   VALU IOPs:
-    rst: |-
+    rst: >-
       The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
       This is also presented as a percent of the peak theoretical IOPs achievable
       on the specific accelerator. Note: this does not include any integer operations
@@ -1590,7 +1589,7 @@ System Speed-of-Light:
       issuing VMEM instructions over the :ref:`total CU cycles <total-cu-cycles>`.
     unit: Percent
   Wavefront Occupancy:
-    rst: |-
+    rst: >-
       The time-averaged number of wavefronts resident on the accelerator over
       the lifetime of the kernel. Note: this metric may be inaccurate for short-running
       kernels (less than 1ms). This is also presented as a percent of the peak theoretical
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml
index ae059bc0cb..6fca0c579c 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml
@@ -200,37 +200,37 @@ Panel Config:
           pop: None
           coll_level: SQ_IFETCH_LEVEL
   metrics_description:
-    VALU FLOPs: |-
+    VALU FLOPs: >-
       The total floating-point operations executed per second on the VALU.
       This is also presented as a percent of the peak theoretical FLOPs achievable
       on the specific accelerator. Note: this does not include any floating-point
       operations from MFMA instructions.
-    VALU IOPs: |-
+    VALU IOPs: >-
       The total integer operations executed per second on the VALU. This is
       also presented as a percent of the peak theoretical IOPs achievable on the
       specific accelerator. Note: this does not include any integer operations from
       MFMA instructions.
-    MFMA FLOPs (BF16): |-
+    MFMA FLOPs (BF16): >-
       The total number of 16-bit brain floating point MFMA operations executed
       per second. Note: this does not include any 16-bit brain floating point operations
       from VALU instructions. This is also presented as a percent of the peak theoretical
       BF16 MFMA operations achievable on the specific accelerator.
-    MFMA FLOPs (F16): |-
+    MFMA FLOPs (F16): >-
       The total number of 16-bit floating point MFMA operations executed per
       second. Note: this does not include any 16-bit floating point operations from
       VALU instructions. This is also presented as a percent of the peak theoretical
       F16 MFMA operations achievable on the specific accelerator.
-    MFMA FLOPs (F32): |-
+    MFMA FLOPs (F32): >-
       The total number of 32-bit floating point MFMA operations executed per
       second. Note: this does not include any 32-bit floating point operations from
       VALU instructions. This is also presented as a percent of the peak theoretical
       F32 MFMA operations achievable on the specific accelerator.
-    MFMA FLOPs (F64): |-
+    MFMA FLOPs (F64): >-
       The total number of 64-bit floating point MFMA operations executed per
       second. Note: this does not include any 64-bit floating point operations from
       VALU instructions. This is also presented as a percent of the peak theoretical
       F64 MFMA operations achievable on the specific accelerator.
-    MFMA IOPs (Int8): |-
+    MFMA IOPs (Int8): >-
       The total number of 8-bit integer MFMA operations executed per second.
       Note: this does not include any 8-bit integer operations from VALU instructions.
       This is also presented as a percent of the peak theoretical INT8 MFMA operations
@@ -263,7 +263,7 @@ Panel Config:
     IPC: The ratio of the total number of instructions executed on the CU over the
       total active CU cycles. This is also presented as a percent of the peak theoretical
       bandwidth achievable on the specific accelerator.
-    Wavefront Occupancy: |-
+    Wavefront Occupancy: >-
       The time-averaged number of wavefronts resident on the accelerator over
       the lifetime of the kernel. Note: this metric may be inaccurate for short-running
       kernels (less than 1ms). This is also presented as a percent of the peak theoretical
@@ -296,7 +296,7 @@ Panel Config:
       if only a single value is requested in a cache line, the data movement will
       still be counted as a full cache line. This is also presented as a percent of
       the peak theoretical bandwidth achievable on the specific accelerator.
-    L2-Fabric Read BW: |-
+    L2-Fabric Read BW: >-
       The number of bytes read by the L2 over the Infinity Fabric\u2122 interface
       per unit time. This is also presented as a percent of the peak theoretical
       bandwidth achievable on the specific accelerator.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml
index d817e3e02f..db190f03ef 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml
@@ -170,15 +170,15 @@ Panel Config:
     Active CUs: Total number of active compute units (CUs) on the accelerator during
       the kernel execution.
     Num CUs: Total number of compute units (CUs) on the accelerator.
-    VGPR: |-
+    VGPR: >-
       The number of architected vector general-purpose registers allocated
       for the kernel, see VALU. Note: this may not exactly match the number of VGPRs
       requested by the compiler due to allocation granularity.
-    SGPR: |-
+    SGPR: >-
       The number of scalar general-purpose registers allocated for the kernel,
       see SALU. Note: this may not exactly match the number of SGPRs requested by
       the compiler due to allocation granularity.
-    LDS Allocation: |-
+    LDS Allocation: >-
       The number of bytes of LDS memory (or, shared memory) allocated for
       this kernel. Note: This may also be larger than what was requested at compile
       time due to both allocation granularity and dynamic per-dispatch LDS allocations.
@@ -266,7 +266,7 @@ Panel Config:
       or data (atomic with return value) was returned to the L2.
     HBM Rd: The total number of L2 requests to Infinity Fabric to read 32B or 64B
       of data from the accelerator's local HBM, per normalization unit.
-    HBM Wr: |-
+    HBM Wr: >-
       The total number of L2 requests to Infinity Fabric to write or atomically
       update 32B or 64B of data in the accelerator's local HBM, per normalization
       unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0400_roofline.yaml
index 6cf7344c4f..971414d5e7 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0400_roofline.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0400_roofline.yaml
@@ -134,48 +134,48 @@ Panel Config:
             / 1e9) ) / 1e9
           unit: GFLOP/s
   metrics_description:
-    VALU FLOPs (F16): |-
+    VALU FLOPs (F16): >-
       The total 16-bit floating-point operations executed per second on the VALU.
       This is presented with the value of the peak empirical F16 FLOPs achievable
       on the specific accelerator. Note: this does not include any F16 operations
       from MFMA instructions.
-    VALU FLOPs (F32): |-
+    VALU FLOPs (F32): >-
       The total 32-bit floating-point operations executed per second on the VALU.
       This is presented with the value of the peak empirical F32 FLOPs achievable
       on the specific accelerator. Note: this does not include any F32 operations
       from MFMA instructions.
-    VALU FLOPs (F64): |-
+    VALU FLOPs (F64): >-
       The total 64-bit floating-point operations executed per second on the VALU.
       This is presented with the value of the peak empirical F64 FLOPs achievable
       on the specific accelerator. Note: this does not include any F64 operations
       from MFMA instructions.
-    MFMA FLOPs (BF16): |-
+    MFMA FLOPs (BF16): >-
       The total number of 16-bit brain floating point MFMA operations executed
       per second. Note: this does not include any 16-bit brain floating point
       operations from VALU instructions. The peak empirically measured BF16 MFMA
       operations achievable on the specific accelerator is displayed alongside
       for comparison.
-    MFMA FLOPs (F16): |-
+    MFMA FLOPs (F16): >-
       The total number of 16-bit floating point MFMA operations executed per
       second. Note: this does not include any 16-bit floating point operations from
       VALU instructions. The peak empirically measured F16 MFMA operations
       achievable on the specific accelerator is displayed alongside for comparison.
-    MFMA FLOPs (F32): |-
+    MFMA FLOPs (F32): >-
       The total number of 32-bit floating point MFMA operations executed per
       second. Note: this does not include any 32-bit floating point operations from
       VALU instructions. The peak empirically measured F32 MFMA operations
       achievable on the specific accelerator is displayed alongside for comparison.
-    MFMA FLOPs (F64): |-
+    MFMA FLOPs (F64): >-
       The total number of 64-bit floating point MFMA operations executed per
       second. Note: this does not include any 64-bit floating point operations from
       VALU instructions. The peak empirically measured F64 MFMA operations
       achievable on the specific accelerator is displayed alongside for comparison.
-    MFMA IOPs (Int8): |-
+    MFMA IOPs (Int8): >-
       The total number of 8-bit integer MFMA operations executed per second.
       Note: this does not include any 8-bit integer operations from VALU instructions.
       The peak empirically measured INT8 MFMA operations achievable on the specific
       accelerator is displayed alongside for comparison.
-    HBM Bandwidth: |-
+    HBM Bandwidth: >-
       The total number of bytes read from and written to High-Bandwidth
       Memory (HBM) per second. The peak empirically measured bandwidth achievable
       on the specific accelerator is displayed alongside for comparison.
@@ -196,22 +196,22 @@ Panel Config:
       from, stored to, or atomically updated in the LDS per unit time (see LDS Bandwidth
       example for more detail). The peak empirically measured LDS bandwidth achievable
       on the specific accelerator is displayed alongside for comparison.
-    AI L1: |-
+    AI L1: >-
       The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
       of total floating-point operations (FLOPs) to total bytes transferred between
       the L1 cache and the processing units. This value is used as the x-coordinate
       for the L1 roofline.
-    AI L2: |-
+    AI L2: >-
       The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
       of total floating-point operations (FLOPs) to total bytes transferred between
       the L2 cache and the L1 cache. This value is used as the x-coordinate for
       the L2 roofline.
-    AI HBM: |-
+    AI HBM: >-
       The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
       It is the ratio of total floating-point operations (FLOPs) to total bytes
       transferred between HBM and the L2 cache. This value is used as the x-coordinate
       for the HBM roofline.
-    Performance (GFLOPs): |-
+    Performance (GFLOPs): >-
       The overall achieved performance, measured in GigaFLOPs
       per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
       operations divided by the total execution time. This value is used as the y-coordinate
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0500_command_processor_cpc_cpf.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0500_command_processor_cpc_cpf.yaml
index 118ce18331..411c4c803b 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0500_command_processor_cpc_cpf.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0500_command_processor_cpc_cpf.yaml
@@ -141,6 +141,6 @@ Panel Config:
       the CPC-L2 interface was active doing any work.
     CPC-UTCL1 Stall: Percent of CPC busy cycles where the CPC was stalled by address
       translation
-    CPC-UTCL2 Utilization: |-
+    CPC-UTCL2 Utilization: >-
       Percent of total cycles counted by the CPC's L2 address translation
       interface where the CPC was busy doing address translation work.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0600_workgroup_manager_spi.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0600_workgroup_manager_spi.yaml
index eb9845aa82..ca033cbdd7 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0600_workgroup_manager_spi.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0600_workgroup_manager_spi.yaml
@@ -168,7 +168,7 @@ Panel Config:
       in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck
       within the workgroup manager rather than a lack of a CU or SIMD with sufficient
       resources.
-    Not-scheduled Rate (Scheduler-Pipe): |-
+    Not-scheduled Rate (Scheduler-Pipe): >-
       The percent of total scheduler-pipe cycles in the kernel where a workgroup
       could not be scheduled to a CU due to a bottleneck within the scheduler-pipes
       rather than a lack of a CU or SIMD with sufficient resources.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0700_wavefront.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0700_wavefront.yaml
index e9e9407cfc..dfc968df65 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0700_wavefront.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0700_wavefront.yaml
@@ -121,26 +121,26 @@ Panel Config:
     Workgroup Size: The total number of work-items (or, threads) in each workgroup
       (or, block) launched as part of the kernel dispatch. In HIP, this is equivalent
       to the total block size.
-    Total Wavefronts: |-
+    Total Wavefronts: >-
       The total number of wavefronts launched as part of the kernel dispatch.
       On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront
       size is always 64 work-items. Thus, the total number of wavefronts should
       be equivalent to the ceiling of grid size divided by 64.
     Saved Wavefronts: The total number of wavefronts saved at a context-save.
     Restored Wavefronts: The total number of wavefronts restored from a context-save.
-    VGPRs: |-
+    VGPRs: >-
       The number of architected vector general-purpose registers allocated
       for the kernel, see VALU. Note: this may not exactly match the number of VGPRs
       requested by the compiler due to allocation granularity.
-    AGPRs: |-
+    AGPRs: >-
       The number of accumulation vector general-purpose registers allocated
       for the kernel, see AGPRs. Note: this may not exactly match the number of
       AGPRs requested by the compiler due to allocation granularity.
-    SGPRs: |-
+    SGPRs: >-
       The number of scalar general-purpose registers allocated for the kernel,
       see SALU. Note: this may not exactly match the number of SGPRs requested by
       the compiler due to allocation granularity.
-    LDS Allocation: |-
+    LDS Allocation: >-
       The number of bytes of LDS memory (or, shared memory) allocated for
       this kernel. Note: This may also be larger than what was requested at compile
       time due to both allocation granularity and dynamic per-dispatch LDS allocations.
@@ -173,7 +173,7 @@ Panel Config:
       rather than identification of a precise limiter. The sum of this metric, Issue
       Wait Cycles and Active Wait Cycles should be equal to the total Wave Cycles
       metric.
-    Wavefront Occupancy: |-
+    Wavefront Occupancy: >-
       The time-averaged number of wavefronts resident on the accelerator over
       the lifetime of the kernel. Note: this metric may be inaccurate for short-running
       kernels (less than 1ms).
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1200_local_data_share_lds.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1200_local_data_share_lds.yaml
index b7767fea16..8e602e9f83 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1200_local_data_share_lds.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1200_local_data_share_lds.yaml
@@ -140,7 +140,7 @@ Panel Config:
       unit.
     Unaligned Stall: The total number of cycles spent in the LDS scheduler due to
       stalls from non-dword aligned addresses per normalization unit.
-    Mem Violations: |-
+    Mem Violations: >-
       The total number of out-of-bounds accesses made to the LDS, per normalization
       unit. This is unused and expected to be zero in most configurations for
       modern CDNA\u2122 accelerators.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1300_instruction_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1300_instruction_cache.yaml
index 35808d9d96..ffc6e890b9 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1300_instruction_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1300_instruction_cache.yaml
@@ -92,7 +92,7 @@ Panel Config:
     Cache Hit Rate: The percent of L1I requests that hit [#l1i-cache]_ on a previously
-      loaded line the cache. Calculated as the ratio of the number of L1I requests
+      loaded line in the cache. Calculated as the ratio of the number of L1I requests
       that hit over the number of all L1I requests.
-    L1I-L2 Bandwidth Utilization: |-
+    L1I-L2 Bandwidth Utilization: >-
       The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth
       achieved. Calculated as the ratio of the total number of requests from the
       L1I to the L2 cache over the total L1I-L2 interface cycles.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1400_scalar_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1400_scalar_l1_data_cache.yaml
index 6b73164848..be2ce0db79 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1400_scalar_l1_data_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1400_scalar_l1_data_cache.yaml
@@ -154,7 +154,7 @@ Panel Config:
     sL1D-L2 BW Utilization: The percentage of the peak theoretical sL1D - L2 interface
-      bandwidth acheived. Calculated as total number of bytes read from, written to,
+      bandwidth achieved. Calculated as total number of bytes read from, written to,
       or atomically updated across the sL1D - L2 interface.
-    sL1D-L2 BW: |-
+    sL1D-L2 BW: >-
       The total number of bytes read from, written to, or atomically updated
       across the sL1D\u2194L2 interface, divided by total duration. Note that sL1D
       writes and atomics are typically unused on current CDNA accelerators, so
@@ -164,7 +164,7 @@ Panel Config:
       unit.
     Hits: The total number of sL1D requests that hit on a previously loaded cache
       line, per normalization unit.
-    Misses - Non Duplicated: |-
+    Misses - Non Duplicated: >-
       The total number of sL1D requests that missed on a cache line that was
       not already pending due to another request, per normalization unit.
     Misses- Duplicated: The total number of sL1D requests that missed on a cache line
@@ -187,6 +187,6 @@ Panel Config:
       unit.
     Write Req: The total number of write requests from sL1D to the L2, per normalization
       unit. Typically unused on current CDNA accelerators.
-    Stall Cycles: |-
+    Stall Cycles: >-
       The total number of cycles the sL1D\u2194L2 interface was stalled, per
       normalization unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1600_vector_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1600_vector_l1_data_cache.yaml
index 2be99f875f..55ba9d1447 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1600_vector_l1_data_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/1600_vector_l1_data_cache.yaml
@@ -436,7 +436,7 @@ Panel Config:
       per normalization unit.
     Translation Misses: The total number of translation requests that missed in the
       UTCL1 due to translation not being present in the cache, per normalization unit.
-    Permission Misses: |-
+    Permission Misses: >-
       The total number of translation requests that missed in the UTCL1 due
       to a permission error, per normalization unit. This is unused and expected
       to be zero in most configurations for modern CDNA\u2122 accelerators.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0200_system_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0200_system_speed_of_light.yaml
index b8bdb7e664..34eb6972ca 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0200_system_speed_of_light.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0200_system_speed_of_light.yaml
@@ -218,37 +218,37 @@ Panel Config:
           pop: None
           coll_level: SQ_IFETCH_LEVEL
   metrics_description:
-    VALU FLOPs: |-
+    VALU FLOPs: >-
       The total floating-point operations executed per second on the VALU.
       This is also presented as a percent of the peak theoretical FLOPs achievable
       on the specific accelerator. Note: this does not include any floating-point
       operations from MFMA instructions.
-    VALU IOPs: |-
+    VALU IOPs: >-
       The total integer operations executed per second on the VALU. This is
       also presented as a percent of the peak theoretical IOPs achievable on the
       specific accelerator. Note: this does not include any integer operations from
       MFMA instructions.
-    MFMA FLOPs (BF16): |-
+    MFMA FLOPs (BF16): >-
       The total number of 16-bit brain floating point MFMA operations executed
       per second. Note: this does not include any 16-bit brain floating point operations
       from VALU instructions. This is also presented as a percent of the peak theoretical
       BF16 MFMA operations achievable on the specific accelerator.
-    MFMA FLOPs (F16): |-
+    MFMA FLOPs (F16): >-
       The total number of 16-bit floating point MFMA operations executed per
       second. Note: this does not include any 16-bit floating point operations from
       VALU instructions. This is also presented as a percent of the peak theoretical
       F16 MFMA operations achievable on the specific accelerator.
-    MFMA FLOPs (F32): |-
+    MFMA FLOPs (F32): >-
       The total number of 32-bit floating point MFMA operations executed per
       second. Note: this does not include any 32-bit floating point operations from
       VALU instructions. This is also presented as a percent of the peak theoretical
       F32 MFMA operations achievable on the specific accelerator.
-    MFMA FLOPs (F64): |-
+    MFMA FLOPs (F64): >-
       The total number of 64-bit floating point MFMA operations executed per
       second. Note: this does not include any 64-bit floating point operations from
       VALU instructions. This is also presented as a percent of the peak theoretical
       F64 MFMA operations achievable on the specific accelerator.
-    MFMA IOPs (Int8): |-
+    MFMA IOPs (Int8): >-
       The total number of 8-bit integer MFMA operations executed per second.
       Note: this does not include any 8-bit integer operations from VALU instructions.
       This is also presented as a percent of the peak theoretical INT8 MFMA operations
@@ -281,7 +281,7 @@ Panel Config:
     IPC: The ratio of the total number of instructions executed on the CU over the
       total active CU cycles. This is also presented as a percent of the peak theoretical
       bandwidth achievable on the specific accelerator.
-    Wavefront Occupancy: |-
+    Wavefront Occupancy: >-
       The time-averaged number of wavefronts resident on the accelerator over
       the lifetime of the kernel. Note: this metric may be inaccurate for short-running
       kernels (less than 1ms). This is also presented as a percent of the peak theoretical
@@ -314,7 +314,7 @@ Panel Config:
       if only a single value is requested in a cache line, the data movement will
       still be counted as a full cache line. This is also presented as a percent of
       the peak theoretical bandwidth achievable on the specific accelerator.
-    L2-Fabric Read BW: |-
+    L2-Fabric Read BW: >-
       The number of bytes read by the L2 over the Infinity Fabric\u2122 interface
       per unit time. This is also presented as a percent of the peak theoretical
       bandwidth achievable on the specific accelerator.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0300_memory_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0300_memory_chart.yaml
index 1fd388cba1..8349f4f8fd 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0300_memory_chart.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0300_memory_chart.yaml
@@ -170,15 +170,15 @@ Panel Config:
     Active CUs: Total number of active compute units (CUs) on the accelerator during
       the kernel execution.
     Num CUs: Total number of compute units (CUs) on the accelerator.
-    VGPR: |-
+    VGPR: >-
       The number of architected vector general-purpose registers allocated
       for the kernel, see VALU. Note: this may not exactly match the number of VGPRs
       requested by the compiler due to allocation granularity.
-    SGPR: |-
+    SGPR: >-
       The number of scalar general-purpose registers allocated for the kernel,
       see SALU. Note: this may not exactly match the number of SGPRs requested by
       the compiler due to allocation granularity.
-    LDS Allocation: |-
+    LDS Allocation: >-
       The number of bytes of LDS memory (or, shared memory) allocated for
       this kernel. Note: This may also be larger than what was requested at compile
       time due to both allocation granularity and dynamic per-dispatch LDS allocations.
@@ -266,7 +266,7 @@ Panel Config:
       or data (atomic with return value) was returned to the L2.
     HBM Rd: The total number of L2 requests to Infinity Fabric to read 32B or 64B
       of data from the accelerator's local HBM, per normalization unit.
-    HBM Wr: |-
+    HBM Wr: >-
       The total number of L2 requests to Infinity Fabric to write or atomically
       update 32B or 64B of data in the accelerator's local HBM, per normalization
       unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline.yaml
index 4a8c962f3b..28089ad5d3 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline.yaml
@@ -132,48 +132,48 @@ Panel Config:
             / 1e9) ) / 1e9
           unit: GFLOP/s
   metrics_description:
-    VALU FLOPs (F16): |-
+    VALU FLOPs (F16): >-
       The total 16-bit floating-point operations executed per second on the VALU.
       This is presented with the value of the peak empirical F16 FLOPs achievable
       on the specific accelerator. Note: this does not include any F16 operations
       from MFMA instructions.
-    VALU FLOPs (F32): |-
+    VALU FLOPs (F32): >-
       The total 32-bit floating-point operations executed per second on the VALU.
       This is presented with the value of the peak empirical F32 FLOPs achievable
       on the specific accelerator. Note: this does not include any F32 operations
       from MFMA instructions.
-    VALU FLOPs (F64): |-
+    VALU FLOPs (F64): >-
       The total 64-bit floating-point operations executed per second on the VALU.
       This is presented with the value of the peak empirical F64 FLOPs achievable
       on the specific accelerator. Note: this does not include any F64 operations
       from MFMA instructions.
-    MFMA FLOPs (BF16): |-
+    MFMA FLOPs (BF16): >-
       The total number of 16-bit brain floating point MFMA operations executed
       per second. Note: this does not include any 16-bit brain floating point
       operations from VALU instructions. The peak empirically measured BF16 MFMA
       operations achievable on the specific accelerator is displayed alongside
       for comparison.
-    MFMA FLOPs (F16): |-
+    MFMA FLOPs (F16): >-
       The total number of 16-bit floating point MFMA operations executed per
       second. Note: this does not include any 16-bit floating point operations from
       VALU instructions. The peak empirically measured F16 MFMA operations
       achievable on the specific accelerator is displayed alongside for comparison.
-    MFMA FLOPs (F32): |-
+    MFMA FLOPs (F32): >-
       The total number of 32-bit floating point MFMA operations executed per
       second. Note: this does not include any 32-bit floating point operations from
       VALU instructions. The peak empirically measured F32 MFMA operations
       achievable on the specific accelerator is displayed alongside for comparison.
-    MFMA FLOPs (F64): |-
+    MFMA FLOPs (F64): >-
       The total number of 64-bit floating point MFMA operations executed per
       second. Note: this does not include any 64-bit floating point operations from
       VALU instructions. The peak empirically measured F64 MFMA operations
       achievable on the specific accelerator is displayed alongside for comparison.
-    MFMA IOPs (Int8): |-
+    MFMA IOPs (Int8): >-
       The total number of 8-bit integer MFMA operations executed per second.
       Note: this does not include any 8-bit integer operations from VALU instructions.
       The peak empirically measured INT8 MFMA operations achievable on the specific
       accelerator is displayed alongside for comparison.
-    HBM Bandwidth: |-
+    HBM Bandwidth: >-
       The total number of bytes read from and written to High-Bandwidth
       Memory (HBM) per second. The peak empirically measured bandwidth achievable
       on the specific accelerator is displayed alongside for comparison.
@@ -194,22 +194,22 @@ Panel Config:
       from, stored to, or atomically updated in the LDS per unit time (see LDS Bandwidth
       example for more detail). The peak empirically measured LDS bandwidth achievable
       on the specific accelerator is displayed alongside for comparison.
-    AI L1: |-
+    AI L1: >-
       The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
       of total floating-point operations (FLOPs) to total bytes transferred between
       the L1 cache and the processing units. This value is used as the x-coordinate
       for the L1 roofline.
-    AI L2: |-
+    AI L2: >-
       The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
       of total floating-point operations (FLOPs) to total bytes transferred between
       the L2 cache and the L1 cache. This value is used as the x-coordinate for
       the L2 roofline.
-    AI HBM: |-
+    AI HBM: >-
       The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
       It is the ratio of total floating-point operations (FLOPs) to total bytes
       transferred between HBM and the L2 cache. This value is used as the x-coordinate
       for the HBM roofline.
-    Performance (GFLOPs): |-
+    Performance (GFLOPs): >-
       The overall achieved performance, measured in GigaFLOPs
       per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
       operations divided by the total execution time. This value is used as the y-coordinate
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0500_command_processor_cpc_cpf.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0500_command_processor_cpc_cpf.yaml
index 118ce18331..411c4c803b 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0500_command_processor_cpc_cpf.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0500_command_processor_cpc_cpf.yaml
@@ -141,6 +141,6 @@ Panel Config:
       the CPC-L2 interface was active doing any work.
     CPC-UTCL1 Stall: Percent of CPC busy cycles where the CPC was stalled by address
       translation
-    CPC-UTCL2 Utilization: |-
+    CPC-UTCL2 Utilization: >-
       Percent of total cycles counted by the CPC's L2 address translation
       interface where the CPC was busy doing address translation work.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0600_workgroup_manager_spi.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0600_workgroup_manager_spi.yaml
index eb9845aa82..ca033cbdd7 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0600_workgroup_manager_spi.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0600_workgroup_manager_spi.yaml
@@ -168,7 +168,7 @@ Panel Config:
       in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck
       within the workgroup manager rather than a lack of a CU or SIMD with sufficient
       resources.
-    Not-scheduled Rate (Scheduler-Pipe): |-
+    Not-scheduled Rate (Scheduler-Pipe): >-
       The percent of total scheduler-pipe cycles in the kernel where a workgroup
       could not be scheduled to a CU due to a bottleneck within the scheduler-pipes
       rather than a lack of a CU or SIMD with sufficient resources.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0700_wavefront.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0700_wavefront.yaml
index e9e9407cfc..dfc968df65 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0700_wavefront.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0700_wavefront.yaml
@@ -121,26 +121,26 @@ Panel Config:
     Workgroup Size: The total number of work-items (or, threads) in each workgroup
       (or, block) launched as part of the kernel dispatch. In HIP, this is equivalent
       to the total block size.
-    Total Wavefronts: |-
+    Total Wavefronts: >-
       The total number of wavefronts launched as part of the kernel dispatch.
       On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront
       size is always 64 work-items. Thus, the total number of wavefronts should
       be equivalent to the ceiling of grid size divided by 64.
     Saved Wavefronts: The total number of wavefronts saved at a context-save.
     Restored Wavefronts: The total number of wavefronts restored from a context-save.
-    VGPRs: |-
+    VGPRs: >-
       The number of architected vector general-purpose registers allocated
       for the kernel, see VALU. Note: this may not exactly match the number of VGPRs
       requested by the compiler due to allocation granularity.
-    AGPRs: |-
+    AGPRs: >-
       The number of accumulation vector general-purpose registers allocated
       for the kernel, see AGPRs. Note: this may not exactly match the number of
       AGPRs requested by the compiler due to allocation granularity.
-    SGPRs: |-
+    SGPRs: >-
       The number of scalar general-purpose registers allocated for the kernel,
       see SALU. Note: this may not exactly match the number of SGPRs requested by
       the compiler due to allocation granularity.
-    LDS Allocation: |-
+    LDS Allocation: >-
       The number of bytes of LDS memory (or, shared memory) allocated for
       this kernel. Note: This may also be larger than what was requested at compile
       time due to both allocation granularity and dynamic per-dispatch LDS allocations.
@@ -173,7 +173,7 @@ Panel Config:
       rather than identification of a precise limiter. The sum of this metric, Issue
       Wait Cycles and Active Wait Cycles should be equal to the total Wave Cycles
       metric.
-    Wavefront Occupancy: |-
+    Wavefront Occupancy: >-
       The time-averaged number of wavefronts resident on the accelerator over
       the lifetime of the kernel. Note: this metric may be inaccurate for short-running
       kernels (less than 1ms).
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1000_compute_units_instruction_mix.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1000_compute_units_instruction_mix.yaml
index ecf7473688..e35145781b 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1000_compute_units_instruction_mix.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1000_compute_units_instruction_mix.yaml
@@ -268,7 +268,7 @@ Panel Config:
       floating-point operands issued to the VALU per normalization unit.
     F64-Trans: The total number of transcendental instructions (such as sqrt) operating
       on 64-bit floating-point operands issued to the VALU per normalization unit.
-    Conversion: |-
+    Conversion: >-
       The total number of type conversion instructions (such as converting
       data to or from F32\u2194F64) issued to the VALU per normalization unit.
     Global/Generic Instr: The total number of global & generic memory instructions
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1100_compute_units_compute_pipeline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1100_compute_units_compute_pipeline.yaml
index af4ff8ef77..f7536c0006 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1100_compute_units_compute_pipeline.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1100_compute_units_compute_pipeline.yaml
@@ -237,37 +237,37 @@ Panel Config:
           max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
           unit: (OPs + $normUnit)
   metrics_description:
-    VALU FLOPs: |-
+    VALU FLOPs: >-
       The total floating-point operations executed per second on the VALU.
       This is also presented as a percent of the peak theoretical FLOPs achievable
       on the specific accelerator. Note: this does not include any floating-point
       operations from MFMA instructions.
-    VALU IOPs: |-
+    VALU IOPs: >-
       The total integer operations executed per second on the VALU. This is
       also presented as a percent of the peak theoretical IOPs achievable on the
       specific accelerator. Note: this does not include any integer operations from
       MFMA instructions.
-    MFMA FLOPs (BF16): |-
+    MFMA FLOPs (BF16): >-
       The total number of 16-bit brain floating point MFMA operations executed
       per second. Note: this does not include any 16-bit brain floating point operations
       from VALU instructions. This is also presented as a percent of the peak theoretical
       BF16 MFMA operations achievable on the specific accelerator.
-    MFMA FLOPs (F16): |-
+    MFMA FLOPs (F16): >-
       The total number of 16-bit floating point MFMA operations executed per
       second. Note: this does not include any 16-bit floating point operations from
       VALU instructions. This is also presented as a percent of the peak theoretical
       F16 MFMA operations achievable on the specific accelerator.
-    MFMA FLOPs (F32): |-
+    MFMA FLOPs (F32): >-
       The total number of 32-bit floating point MFMA operations executed per
       second. Note: this does not include any 32-bit floating point operations from
       VALU instructions. This is also presented as a percent of the peak theoretical
       F32 MFMA operations achievable on the specific accelerator.
-    MFMA FLOPs (F64): |-
+    MFMA FLOPs (F64): >-
       The total number of 64-bit floating point MFMA operations executed per
       second. Note: this does not include any 64-bit floating point operations from
       VALU instructions. This is also presented as a percent of the peak theoretical
       F64 MFMA operations achievable on the specific accelerator.
-    MFMA IOPs (INT8): |-
+    MFMA IOPs (INT8): >-
       The total number of 8-bit integer MFMA operations executed per second.
       Note: this does not include any 8-bit integer operations from VALU instructions.
       This is also presented as a percent of the peak theoretical INT8 MFMA operations
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1200_local_data_share_lds.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1200_local_data_share_lds.yaml
index b7767fea16..8e602e9f83 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1200_local_data_share_lds.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1200_local_data_share_lds.yaml
@@ -140,7 +140,7 @@ Panel Config:
       unit.
     Unaligned Stall: The total number of cycles spent in the LDS scheduler due to
       stalls from non-dword aligned addresses per normalization unit.
-    Mem Violations: |-
+    Mem Violations: >-
       The total number of out-of-bounds accesses made to the LDS, per normalization
       unit. This is unused and expected to be zero in most configurations for
       modern CDNA\u2122 accelerators.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1300_instruction_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1300_instruction_cache.yaml
index 35808d9d96..ffc6e890b9 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1300_instruction_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1300_instruction_cache.yaml
@@ -92,7 +92,7 @@ Panel Config:
     Cache Hit Rate: The percent of L1I requests that hit [#l1i-cache]_ on a previously
-      loaded line the cache. Calculated as the ratio of the number of L1I requests
+      loaded line in the cache. Calculated as the ratio of the number of L1I requests
       that hit over the number of all L1I requests.
-    L1I-L2 Bandwidth Utilization: |-
+    L1I-L2 Bandwidth Utilization: >-
       The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth
       achieved. Calculated as the ratio of the total number of requests from the
       L1I to the L2 cache over the total L1I-L2 interface cycles.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1400_scalar_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1400_scalar_l1_data_cache.yaml
index 6b73164848..be2ce0db79 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1400_scalar_l1_data_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1400_scalar_l1_data_cache.yaml
@@ -154,7 +154,7 @@ Panel Config:
     sL1D-L2 BW Utilization: The percentage of the peak theoretical sL1D - L2 interface
-      bandwidth acheived. Calculated as total number of bytes read from, written to,
+      bandwidth achieved. Calculated as total number of bytes read from, written to,
       or atomically updated across the sL1D - L2 interface.
-    sL1D-L2 BW: |-
+    sL1D-L2 BW: >-
       The total number of bytes read from, written to, or atomically updated
       across the sL1D\u2194L2 interface, divided by total duration. Note that sL1D
       writes and atomics are typically unused on current CDNA accelerators, so
@@ -164,7 +164,7 @@ Panel Config:
       unit.
     Hits: The total number of sL1D requests that hit on a previously loaded cache
       line, per normalization unit.
-    Misses - Non Duplicated: |-
+    Misses - Non Duplicated: >-
       The total number of sL1D requests that missed on a cache line that was
       not already pending due to another request, per normalization unit.
     Misses- Duplicated: The total number of sL1D requests that missed on a cache line
@@ -187,6 +187,6 @@ Panel Config:
       unit.
     Write Req: The total number of write requests from sL1D to the L2, per normalization
       unit. Typically unused on current CDNA accelerators.
-    Stall Cycles: |-
+    Stall Cycles: >-
       The total number of cycles the sL1D\u2194L2 interface was stalled, per
       normalization unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1600_vector_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1600_vector_l1_data_cache.yaml
index 2be99f875f..55ba9d1447 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1600_vector_l1_data_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/1600_vector_l1_data_cache.yaml
@@ -436,7 +436,7 @@ Panel Config:
       per normalization unit.
     Translation Misses: The total number of translation requests that missed in the
       UTCL1 due to translation not being present in the cache, per normalization unit.
-    Permission Misses: |-
+    Permission Misses: >-
       The total number of translation requests that missed in the UTCL1 due
       to a permission error, per normalization unit. This is unused and expected
       to be zero in most configurations for modern CDNA\u2122 accelerators.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0200_system_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0200_system_speed_of_light.yaml
index 8aa72cb25d..5f76eb8937 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0200_system_speed_of_light.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0200_system_speed_of_light.yaml
@@ -227,12 +227,12 @@ Panel Config:
           pop: None
           coll_level: SQ_IFETCH_LEVEL
   metrics_description:
-    VALU FLOPs: |-
+    VALU FLOPs: >-
       The total floating-point operations executed per second on the VALU.
       This is also presented as a percent of the peak theoretical FLOPs achievable
       on the specific accelerator. Note: this does not include any floating-point
       operations from MFMA instructions.
-    VALU IOPs: |-
+    VALU IOPs: >-
       The total integer operations executed per second on the VALU. This is
       also presented as a percent of the peak theoretical IOPs achievable on the
       specific accelerator. Note: this does not include any integer operations from
@@ -242,27 +242,27 @@ Panel Config:
       from VALU instructions. This is also presented as a percent of the peak theoretical
       F8 MFMA operations achievable on the specific accelerator. It is supported on
       AMD Instinct MI300 series and later only.
-    MFMA FLOPs (BF16): |-
+    MFMA FLOPs (BF16): >-
       The total number of 16-bit brain floating point MFMA operations executed
       per second. Note: this does not include any 16-bit brain floating point operations
       from VALU instructions. This is also presented as a percent of the peak theoretical
       BF16 MFMA operations achievable on the specific accelerator.
-    MFMA FLOPs (F16): |-
+    MFMA FLOPs (F16): >-
       The total number of 16-bit floating point MFMA operations executed per
       second. Note: this does not include any 16-bit floating point operations from
       VALU instructions. This is also presented as a percent of the peak theoretical
       F16 MFMA operations achievable on the specific accelerator.
-    MFMA FLOPs (F32): |-
+    MFMA FLOPs (F32): >-
       The total number of 32-bit floating point MFMA operations executed per
       second. Note: this does not include any 32-bit floating point operations from
       VALU instructions. This is also presented as a percent of the peak theoretical
       F32 MFMA operations achievable on the specific accelerator.
-    MFMA FLOPs (F64): |-
+    MFMA FLOPs (F64): >-
       The total number of 64-bit floating point MFMA operations executed per
       second. Note: this does not include any 64-bit floating point operations from
       VALU instructions. This is also presented as a percent of the peak theoretical
       F64 MFMA operations achievable on the specific accelerator.
-    MFMA IOPs (Int8): |-
+    MFMA IOPs (Int8): >-
       The total number of 8-bit integer MFMA operations executed per second.
       Note: this does not include any 8-bit integer operations from VALU instructions.
       This is also presented as a percent of the peak theoretical INT8 MFMA operations
@@ -295,7 +295,7 @@ Panel Config:
     IPC: The ratio of the total number of instructions executed on the CU over the
       total active CU cycles. This is also presented as a percent of the peak theoretical
       bandwidth achievable on the specific accelerator.
-    Wavefront Occupancy: |-
+    Wavefront Occupancy: >-
       The time-averaged number of wavefronts resident on the accelerator over
       the lifetime of the kernel. Note: this metric may be inaccurate for short-running
       kernels (less than 1ms). This is also presented as a percent of the peak theoretical
@@ -328,7 +328,7 @@ Panel Config:
       if only a single value is requested in a cache line, the data movement will
       still be counted as a full cache line. This is also presented as a percent of
       the peak theoretical bandwidth achievable on the specific accelerator.
-    L2-Fabric Read BW: |-
+    L2-Fabric Read BW: >-
       The number of bytes read by the L2 over the Infinity Fabric\u2122 interface
       per unit time. This is also presented as a percent of the peak theoretical
       bandwidth achievable on the specific accelerator.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0300_memory_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0300_memory_chart.yaml
index b13053c1f7..81ce3c2e68 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0300_memory_chart.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0300_memory_chart.yaml
@@ -162,15 +162,15 @@ Panel Config:
     Active CUs: Total number of active compute units (CUs) on the accelerator during
       the kernel execution.
     Num CUs: Total number of compute units (CUs) on the accelerator.
-    VGPR: |-
+    VGPR: >-
       The number of architected vector general-purpose registers allocated
       for the kernel, see VALU. Note: this may not exactly match the number of VGPRs
       requested by the compiler due to allocation granularity.
-    SGPR: |-
+    SGPR: >-
       The number of scalar general-purpose registers allocated for the kernel,
       see SALU. Note: this may not exactly match the number of SGPRs requested by
       the compiler due to allocation granularity.
-    LDS Allocation: |-
+    LDS Allocation: >-
       The number of bytes of LDS memory (or, shared memory) allocated for
       this kernel. Note: This may also be larger than what was requested at compile
       time due to both allocation granularity and dynamic per-dispatch LDS allocations.
@@ -252,7 +252,7 @@ Panel Config:
       or data (atomic with return value) was returned to the L2.
     HBM Rd: The total number of L2 requests to Infinity Fabric to read 32B or 64B
       of data from the accelerator's local HBM, per normalization unit.
-    HBM Wr: |-
+    HBM Wr: >-
       The total number of L2 requests to Infinity Fabric to write or atomically
       update 32B or 64B of data in the accelerator's local HBM, per normalization
       unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline.yaml
index 6731ebfceb..f32f4fa7d8 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline.yaml
@@ -140,17 +140,17 @@ Panel Config:
             * 512) ) / (SUM(End_Timestamp - Start_Timestamp) / 1e9) ) / 1e9
           unit: GFLOP/s
   metrics_description:
-    VALU FLOPs (F16): |-
+    VALU FLOPs (F16): >-
       The total 16-bit floating-point operations executed per second on the VALU.
       This is presented with the value of the peak empirical F16 FLOPs achievable
       on the specific accelerator. Note: this does not include any F16 operations
       from MFMA instructions.
-    VALU FLOPs (F32): |-
+    VALU FLOPs (F32): >-
       The total 32-bit floating-point operations executed per second on the VALU.
       This is presented with the value of the peak empirical F32 FLOPs achievable
       on the specific accelerator. Note: this does not include any F32 operations
       from MFMA instructions.
-    VALU FLOPs (F64): |-
+    VALU FLOPs (F64): >-
       The total 64-bit floating-point operations executed per second on the VALU.
       This is presented with the value of the peak empirical F64 FLOPs achievable
       on the specific accelerator. Note: this does not include any F64 operations
@@ -160,33 +160,33 @@ Panel Config:
       from VALU instructions. The peak empirically measured F8 MFMA operations achievable
       on the specific accelerator is displayed alongside for comparison. It is supported
       on AMD Instinct MI300 series and later only.
-    MFMA FLOPs (BF16): |-
+    MFMA FLOPs (BF16): >-
       The total number of 16-bit brain floating point MFMA operations executed
       per second. Note: this does not include any 16-bit brain floating point
       operations from VALU instructions. The peak empirically measured BF16 MFMA
       operations achievable on the specific accelerator is displayed alongside
       for comparison.
-    MFMA FLOPs (F16): |-
+    MFMA FLOPs (F16): >-
       The total number of 16-bit floating point MFMA operations executed per
       second. Note: this does not include any 16-bit floating point operations from
       VALU instructions. The peak empirically measured F16 MFMA operations
       achievable on the specific accelerator is displayed alongside for comparison.
-    MFMA FLOPs (F32): |-
+    MFMA FLOPs (F32): >-
       The total number of 32-bit floating point MFMA operations executed per
       second. Note: this does not include any 32-bit floating point operations from
       VALU instructions. The peak empirically measured F32 MFMA operations
       achievable on the specific accelerator is displayed alongside for comparison.
-    MFMA FLOPs (F64): |-
+    MFMA FLOPs (F64): >-
       The total number of 64-bit floating point MFMA operations executed per
       second. Note: this does not include any 64-bit floating point operations from
       VALU instructions. The peak empirically measured F64 MFMA operations
       achievable on the specific accelerator is displayed alongside for comparison.
-    MFMA IOPs (Int8): |-
+    MFMA IOPs (Int8): >-
       The total number of 8-bit integer MFMA operations executed per second.
       Note: this does not include any 8-bit integer operations from VALU instructions.
       The peak empirically measured INT8 MFMA operations achievable on the specific
       accelerator is displayed alongside for comparison.
-    HBM Bandwidth: |-
+    HBM Bandwidth: >-
       The total number of bytes read from and written to High-Bandwidth
       Memory (HBM) per second. The peak empirically measured bandwidth achievable
       on the specific accelerator is displayed alongside for comparison.
@@ -207,22 +207,22 @@ Panel Config:
       from, stored to, or atomically updated in the LDS per unit time (see LDS Bandwidth
       example for more detail). The peak empirically measured LDS bandwidth achievable
       on the specific accelerator is displayed alongside for comparison.
-    AI L1: |-
+    AI L1: >-
       The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
       of total floating-point operations (FLOPs) to total bytes transferred between
       the L1 cache and the processing units. This value is used as the x-coordinate
       for the L1 roofline.
-    AI L2: |-
+    AI L2: >-
       The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
       of total floating-point operations (FLOPs) to total bytes transferred between
       the L2 cache and the L1 cache. This value is used as the x-coordinate for
       the L2 roofline.
-    AI HBM: |-
+    AI HBM: >-
       The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
       It is the ratio of total floating-point operations (FLOPs) to total bytes
       transferred between HBM and the L2 cache. This value is used as the x-coordinate
       for the HBM roofline.
-    Performance (GFLOPs): |-
+    Performance (GFLOPs): >-
       The overall achieved performance, measured in GigaFLOPs
       per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
       operations divided by the total execution time. This value is used as the y-coordinate
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0500_command_processor_cpc_cpf.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0500_command_processor_cpc_cpf.yaml
index 118ce18331..411c4c803b 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0500_command_processor_cpc_cpf.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0500_command_processor_cpc_cpf.yaml
@@ -141,6 +141,6 @@ Panel Config:
       the CPC-L2 interface was active doing any work.
     CPC-UTCL1 Stall: Percent of CPC busy cycles where the CPC was stalled by address
       translation
-    CPC-UTCL2 Utilization: |-
+    CPC-UTCL2 Utilization: >-
       Percent of total cycles counted by the CPC's L2 address translation
       interface where the CPC was busy doing address translation work.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0600_workgroup_manager_spi.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0600_workgroup_manager_spi.yaml
index eb9845aa82..ca033cbdd7 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0600_workgroup_manager_spi.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0600_workgroup_manager_spi.yaml
@@ -168,7 +168,7 @@ Panel Config:
       in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck
       within the workgroup manager rather than a lack of a CU or SIMD with sufficient
       resources.
-    Not-scheduled Rate (Scheduler-Pipe): |-
+    Not-scheduled Rate (Scheduler-Pipe): >-
       The percent of total scheduler-pipe cycles in the kernel where a workgroup
       could not be scheduled to a CU due to a bottleneck within the scheduler-pipes
       rather than a lack of a CU or SIMD with sufficient resources.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0700_wavefront.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0700_wavefront.yaml
index e9e9407cfc..dfc968df65 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0700_wavefront.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0700_wavefront.yaml
@@ -121,26 +121,26 @@ Panel Config:
     Workgroup Size: The total number of work-items (or, threads) in each workgroup
       (or, block) launched as part of the kernel dispatch. In HIP, this is equivalent
       to the total block size.
-    Total Wavefronts: |-
+    Total Wavefronts: >-
       The total number of wavefronts launched as part of the kernel dispatch.
       On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront
       size is always 64 work-items. Thus, the total number of wavefronts should
       be equivalent to the ceiling of grid size divided by 64.
     Saved Wavefronts: The total number of wavefronts saved at a context-save.
     Restored Wavefronts: The total number of wavefronts restored from a context-save.
-    VGPRs: |-
+    VGPRs: >-
       The number of architected vector general-purpose registers allocated
       for the kernel, see VALU. Note: this may not exactly match the number of VGPRs
       requested by the compiler due to allocation granularity.
-    AGPRs: |-
+    AGPRs: >-
       The number of accumulation vector general-purpose registers allocated
       for the kernel, see AGPRs. Note: this may not exactly match the number of
       AGPRs requested by the compiler due to allocation granularity.
-    SGPRs: |-
+    SGPRs: >-
       The number of scalar general-purpose registers allocated for the kernel,
       see SALU. Note: this may not exactly match the number of SGPRs requested by
       the compiler due to allocation granularity.
-    LDS Allocation: |-
+    LDS Allocation: >-
       The number of bytes of LDS memory (or, shared memory) allocated for
       this kernel. Note: This may also be larger than what was requested at compile
       time due to both allocation granularity and dynamic per-dispatch LDS allocations.
@@ -173,7 +173,7 @@ Panel Config:
       rather than identification of a precise limiter. The sum of this metric, Issue
       Wait Cycles and Active Wait Cycles should be equal to the total Wave Cycles
       metric.
-    Wavefront Occupancy: |-
+    Wavefront Occupancy: >-
       The time-averaged number of wavefronts resident on the accelerator over
       the lifetime of the kernel. Note: this metric may be inaccurate for short-running
       kernels (less than 1ms).
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1000_compute_units_instruction_mix.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1000_compute_units_instruction_mix.yaml
index 768fe6548b..823f543182 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1000_compute_units_instruction_mix.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1000_compute_units_instruction_mix.yaml
@@ -273,7 +273,7 @@ Panel Config:
       floating-point operands issued to the VALU per normalization unit.
     F64-Trans: The total number of transcendental instructions (such as sqrt) operating
       on 64-bit floating-point operands issued to the VALU per normalization unit.
-    Conversion: |-
+    Conversion: >-
       The total number of type conversion instructions (such as converting
       data to or from F32\u2194F64) issued to the VALU per normalization unit.
     Global/Generic Instr: The total number of global & generic memory instructions
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1100_compute_units_compute_pipeline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1100_compute_units_compute_pipeline.yaml
index 5e6ceb654f..d9a4abced9 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1100_compute_units_compute_pipeline.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1100_compute_units_compute_pipeline.yaml
@@ -251,37 +251,37 @@ Panel Config:
           max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
           unit: (OPs + $normUnit)
   metrics_description:
-    VALU FLOPs: |-
+    VALU FLOPs: >-
       The total floating-point operations executed per second on the VALU.
       This is also presented as a percent of the peak theoretical FLOPs achievable
       on the specific accelerator. Note: this does not include any floating-point
       operations from MFMA instructions.
-    VALU IOPs: |-
+    VALU IOPs: >-
       The total integer operations executed per second on the VALU. This is
       also presented as a percent of the peak theoretical IOPs achievable on the
       specific accelerator. Note: this does not include any integer operations from
       MFMA instructions.
-    MFMA FLOPs (BF16): |-
+    MFMA FLOPs (BF16): >-
       The total number of 16-bit brain floating point MFMA operations executed
       per second. Note: this does not include any 16-bit brain floating point operations
       from VALU instructions. This is also presented as a percent of the peak theoretical
       BF16 MFMA operations achievable on the specific accelerator.
-    MFMA FLOPs (F16): |-
+    MFMA FLOPs (F16): >-
       The total number of 16-bit floating point MFMA operations executed per
       second. Note: this does not include any 16-bit floating point operations from
       VALU instructions. This is also presented as a percent of the peak theoretical
       F16 MFMA operations achievable on the specific accelerator.
-    MFMA FLOPs (F32): |-
+    MFMA FLOPs (F32): >-
       The total number of 32-bit floating point MFMA operations executed per
       second. Note: this does not include any 32-bit floating point operations from
       VALU instructions. This is also presented as a percent of the peak theoretical
       F32 MFMA operations achievable on the specific accelerator.
-    MFMA FLOPs (F64): |-
+    MFMA FLOPs (F64): >-
       The total number of 64-bit floating point MFMA operations executed per
       second. Note: this does not include any 64-bit floating point operations from
       VALU instructions. This is also presented as a percent of the peak theoretical
       F64 MFMA operations achievable on the specific accelerator.
-    MFMA IOPs (INT8): |-
+    MFMA IOPs (INT8): >-
       The total number of 8-bit integer MFMA operations executed per second.
       Note: this does not include any 8-bit integer operations from VALU instructions.
       This is also presented as a percent of the peak theoretical INT8 MFMA operations
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1200_local_data_share_lds.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1200_local_data_share_lds.yaml
index b7767fea16..8e602e9f83 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1200_local_data_share_lds.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1200_local_data_share_lds.yaml
@@ -140,7 +140,7 @@ Panel Config:
       unit.
     Unaligned Stall: The total number of cycles spent in the LDS scheduler due to
       stalls from non-dword aligned addresses per normalization unit.
-    Mem Violations: |-
+    Mem Violations: >-
       The total number of out-of-bounds accesses made to the LDS, per normalization
       unit. This is unused and expected to be zero in most configurations for
       modern CDNA\u2122 accelerators.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1300_instruction_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1300_instruction_cache.yaml
index 35808d9d96..ffc6e890b9 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1300_instruction_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1300_instruction_cache.yaml
@@ -92,7 +92,7 @@ Panel Config:
     Cache Hit Rate: The percent of L1I requests that hit [#l1i-cache]_ on a previously
-      loaded line the cache. Calculated as the ratio of the number of L1I requests
+      loaded line in the cache. Calculated as the ratio of the number of L1I requests
       that hit over the number of all L1I requests.
-    L1I-L2 Bandwidth Utilization: |-
+    L1I-L2 Bandwidth Utilization: >-
       The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth
       achieved. Calculated as the ratio of the total number of requests from the
       L1I to the L2 cache over the total L1I-L2 interface cycles.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1400_scalar_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1400_scalar_l1_data_cache.yaml
index 6b73164848..be2ce0db79 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1400_scalar_l1_data_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1400_scalar_l1_data_cache.yaml
@@ -154,7 +154,7 @@ Panel Config:
     sL1D-L2 BW Utilization: The percentage of the peak theoretical sL1D - L2 interface
-      bandwidth acheived. Calculated as total number of bytes read from, written to,
+      bandwidth achieved. Calculated as total number of bytes read from, written to,
       or atomically updated across the sL1D - L2 interface.
-    sL1D-L2 BW: |-
+    sL1D-L2 BW: >-
       The total number of bytes read from, written to, or atomically updated
       across the sL1D\u2194L2 interface, divided by total duration. Note that sL1D
       writes and atomics are typically unused on current CDNA accelerators, so
@@ -164,7 +164,7 @@ Panel Config:
       unit.
     Hits: The total number of sL1D requests that hit on a previously loaded cache
       line, per normalization unit.
-    Misses - Non Duplicated: |-
+    Misses - Non Duplicated: >-
       The total number of sL1D requests that missed on a cache line that was
       not already pending due to another request, per normalization unit.
     Misses- Duplicated: The total number of sL1D requests that missed on a cache line
@@ -187,6 +187,6 @@ Panel Config:
       unit.
     Write Req: The total number of write requests from sL1D to the L2, per normalization
       unit. Typically unused on current CDNA accelerators.
-    Stall Cycles: |-
+    Stall Cycles: >-
       The total number of cycles the sL1D\u2194L2 interface was stalled, per
       normalization unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1600_vector_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1600_vector_l1_data_cache.yaml
index 3125397a30..9111e85921 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1600_vector_l1_data_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1600_vector_l1_data_cache.yaml
@@ -398,7 +398,7 @@ Panel Config:
       per normalization unit.
     Translation Misses: The total number of translation requests that missed in the
       UTCL1 due to translation not being present in the cache, per normalization unit.
-    Permission Misses: |-
+    Permission Misses: >-
       The total number of translation requests that missed in the UTCL1 due
       to a permission error, per normalization unit. This is unused and expected
       to be zero in most configurations for modern CDNA\u2122 accelerators.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0200_system_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0200_system_speed_of_light.yaml
index 7943f891b1..d0efd3e600 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0200_system_speed_of_light.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0200_system_speed_of_light.yaml
@@ -227,12 +227,12 @@ Panel Config:
           pop: None
           coll_level: SQ_IFETCH_LEVEL
   metrics_description:
-    VALU FLOPs: |-
+    VALU FLOPs: >-
       The total floating-point operations executed per second on the VALU.
       This is also presented as a percent of the peak theoretical FLOPs achievable
       on the specific accelerator. Note: this does not include any floating-point
       operations from MFMA instructions.
-    VALU IOPs: |-
+    VALU IOPs: >-
       The total integer operations executed per second on the VALU. This is
       also presented as a percent of the peak theoretical IOPs achievable on the
       specific accelerator. Note: this does not include any integer operations from
@@ -242,27 +242,27 @@ Panel Config:
       from VALU instructions. This is also presented as a percent of the peak theoretical
       F8 MFMA operations achievable on the specific accelerator. It is supported on
       AMD Instinct MI300 series and later only.
-    MFMA FLOPs (BF16): |-
+    MFMA FLOPs (BF16): >-
       The total number of 16-bit brain floating point MFMA operations executed
       per second. Note: this does not include any 16-bit brain floating point operations
       from VALU instructions. This is also presented as a percent of the peak theoretical
       BF16 MFMA operations achievable on the specific accelerator.
-    MFMA FLOPs (F16): |-
+    MFMA FLOPs (F16): >-
       The total number of 16-bit floating point MFMA operations executed per
       second. Note: this does not include any 16-bit floating point operations from
       VALU instructions. This is also presented as a percent of the peak theoretical
       F16 MFMA operations achievable on the specific accelerator.
-    MFMA FLOPs (F32): |-
+    MFMA FLOPs (F32): >-
       The total number of 32-bit floating point MFMA operations executed per
       second. Note: this does not include any 32-bit floating point operations from
       VALU instructions. This is also presented as a percent of the peak theoretical
       F32 MFMA operations achievable on the specific accelerator.
-    MFMA FLOPs (F64): |-
+    MFMA FLOPs (F64): >-
       The total number of 64-bit floating point MFMA operations executed per
       second. Note: this does not include any 64-bit floating point operations from
       VALU instructions. This is also presented as a percent of the peak theoretical
       F64 MFMA operations achievable on the specific accelerator.
-    MFMA IOPs (Int8): |-
+    MFMA IOPs (Int8): >-
       The total number of 8-bit integer MFMA operations executed per second.
       Note: this does not include any 8-bit integer operations from VALU instructions.
       This is also presented as a percent of the peak theoretical INT8 MFMA operations
@@ -295,7 +295,7 @@ Panel Config:
     IPC: The ratio of the total number of instructions executed on the CU over the
       total active CU cycles. This is also presented as a percent of the peak theoretical
       bandwidth achievable on the specific accelerator.
-    Wavefront Occupancy: |-
+    Wavefront Occupancy: >-
       The time-averaged number of wavefronts resident on the accelerator over
       the lifetime of the kernel. Note: this metric may be inaccurate for short-running
       kernels (less than 1ms). This is also presented as a percent of the peak theoretical
@@ -328,7 +328,7 @@ Panel Config:
       if only a single value is requested in a cache line, the data movement will
       still be counted as a full cache line. This is also presented as a percent of
       the peak theoretical bandwidth achievable on the specific accelerator.
-    L2-Fabric Read BW: |-
+    L2-Fabric Read BW: >-
       The number of bytes read by the L2 over the Infinity Fabric\u2122 interface
       per unit time. This is also presented as a percent of the peak theoretical
       bandwidth achievable on the specific accelerator.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0300_memory_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0300_memory_chart.yaml
index b13053c1f7..81ce3c2e68 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0300_memory_chart.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0300_memory_chart.yaml
@@ -162,15 +162,15 @@ Panel Config:
     Active CUs: Total number of active compute units (CUs) on the accelerator during
       the kernel execution.
     Num CUs: Total number of compute units (CUs) on the accelerator.
-    VGPR: |-
+    VGPR: >-
       The number of architected vector general-purpose registers allocated
       for the kernel, see VALU. Note: this may not exactly match the number of VGPRs
       requested by the compiler due to allocation granularity.
-    SGPR: |-
+    SGPR: >-
       The number of scalar general-purpose registers allocated for the kernel,
       see SALU. Note: this may not exactly match the number of SGPRs requested by
       the compiler due to allocation granularity.
-    LDS Allocation: |-
+    LDS Allocation: >-
       The number of bytes of LDS memory (or, shared memory) allocated for
       this kernel. Note: This may also be larger than what was requested at compile
       time due to both allocation granularity and dynamic per-dispatch LDS allocations.
@@ -252,7 +252,7 @@ Panel Config:
       or data (atomic with return value) was returned to the L2.
     HBM Rd: The total number of L2 requests to Infinity Fabric to read 32B or 64B
       of data from the accelerator's local HBM, per normalization unit.
-    HBM Wr: |-
+    HBM Wr: >-
       The total number of L2 requests to Infinity Fabric to write or atomically
       update 32B or 64B of data in the accelerator's local HBM, per normalization
       unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline.yaml
index 536938f700..d24eaedeae 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline.yaml
@@ -140,17 +140,17 @@ Panel Config:
             * 512) ) / (SUM(End_Timestamp - Start_Timestamp) / 1e9) ) / 1e9
           unit: GFLOP/s
   metrics_description:
-    VALU FLOPs (F16): |-
+    VALU FLOPs (F16): >-
       The total 16-bit floating-point operations executed per second on the VALU.
       This is presented with the value of the peak empirical F16 FLOPs achievable
       on the specific accelerator. Note: this does not include any F16 operations
       from MFMA instructions.
-    VALU FLOPs (F32): |-
+    VALU FLOPs (F32): >-
       The total 32-bit floating-point operations executed per second on the VALU.
       This is presented with the value of the peak empirical F32 FLOPs achievable
       on the specific accelerator. Note: this does not include any F32 operations
       from MFMA instructions.
-    VALU FLOPs (F64): |-
+    VALU FLOPs (F64): >-
       The total 64-bit floating-point operations executed per second on the VALU.
       This is presented with the value of the peak empirical F64 FLOPs achievable
       on the specific accelerator. Note: this does not include any F64 operations
@@ -160,33 +160,33 @@ Panel Config:
       from VALU instructions. The peak empirically measured F8 MFMA operations achievable
       on the specific accelerator is displayed alongside for comparison. It is supported
       on AMD Instinct MI300 series and later only.
-    MFMA FLOPs (BF16): |-
+    MFMA FLOPs (BF16): >-
       The total number of 16-bit brain floating point MFMA operations executed
       per second. Note: this does not include any 16-bit brain floating point
       operations from VALU instructions. The peak empirically measured BF16 MFMA
       operations achievable on the specific accelerator is displayed alongside
       for comparison.
-    MFMA FLOPs (F16): |-
+    MFMA FLOPs (F16): >-
       The total number of 16-bit floating point MFMA operations executed per
       second. Note: this does not include any 16-bit floating point operations from
       VALU instructions. The peak empirically measured F16 MFMA operations
       achievable on the specific accelerator is displayed alongside for comparison.
-    MFMA FLOPs (F32): |-
+    MFMA FLOPs (F32): >-
       The total number of 32-bit floating point MFMA operations executed per
       second. Note: this does not include any 32-bit floating point operations from
       VALU instructions. The peak empirically measured F32 MFMA operations
       achievable on the specific accelerator is displayed alongside for comparison.
-    MFMA FLOPs (F64): |-
+    MFMA FLOPs (F64): >-
       The total number of 64-bit floating point MFMA operations executed per
       second. Note: this does not include any 64-bit floating point operations from
       VALU instructions. The peak empirically measured F64 MFMA operations
       achievable on the specific accelerator is displayed alongside for comparison.
-    MFMA IOPs (Int8): |-
+    MFMA IOPs (Int8): >-
       The total number of 8-bit integer MFMA operations executed per second.
       Note: this does not include any 8-bit integer operations from VALU instructions.
       The peak empirically measured INT8 MFMA operations achievable on the specific
       accelerator is displayed alongside for comparison.
-    HBM Bandwidth: |-
+    HBM Bandwidth: >-
       The total number of bytes read from and written to High-Bandwidth
       Memory (HBM) per second. The peak empirically measured bandwidth achievable
       on the specific accelerator is displayed alongside for comparison.
@@ -207,22 +207,22 @@ Panel Config:
       from, stored to, or atomically updated in the LDS per unit time (see LDS Bandwidth
       example for more detail). The peak empirically measured LDS bandwidth achievable
       on the specific accelerator is displayed alongside for comparison.
-    AI L1: |-
+    AI L1: >-
       The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
       of total floating-point operations (FLOPs) to total bytes transferred between
       the L1 cache and the processing units. This value is used as the x-coordinate
       for the L1 roofline.
-    AI L2: |-
+    AI L2: >-
       The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
       of total floating-point operations (FLOPs) to total bytes transferred between
       the L2 cache and the L1 cache. This value is used as the x-coordinate for
       the L2 roofline.
-    AI HBM: |-
+    AI HBM: >-
       The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
       It is the ratio of total floating-point operations (FLOPs) to total bytes
       transferred between HBM and the L2 cache. This value is used as the x-coordinate
       for the HBM roofline.
-    Performance (GFLOPs): |-
+    Performance (GFLOPs): >-
       The overall achieved performance, measured in GigaFLOPs
       per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
       operations divided by the total execution time. This value is used as the y-coordinate
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0500_command_processor_cpc_cpf.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0500_command_processor_cpc_cpf.yaml
index 118ce18331..411c4c803b 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0500_command_processor_cpc_cpf.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0500_command_processor_cpc_cpf.yaml
@@ -141,6 +141,6 @@ Panel Config:
       the CPC-L2 interface was active doing any work.
     CPC-UTCL1 Stall: Percent of CPC busy cycles where the CPC was stalled by address
       translation
-    CPC-UTCL2 Utilization: |-
+    CPC-UTCL2 Utilization: >-
       Percent of total cycles counted by the CPC's L2 address translation
       interface where the CPC was busy doing address translation work.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0600_workgroup_manager_spi.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0600_workgroup_manager_spi.yaml
index eb9845aa82..ca033cbdd7 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0600_workgroup_manager_spi.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0600_workgroup_manager_spi.yaml
@@ -168,7 +168,7 @@ Panel Config:
       in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck
       within the workgroup manager rather than a lack of a CU or SIMD with sufficient
       resources.
-    Not-scheduled Rate (Scheduler-Pipe): |-
+    Not-scheduled Rate (Scheduler-Pipe): >-
       The percent of total scheduler-pipe cycles in the kernel where a workgroup
       could not be scheduled to a CU due to a bottleneck within the scheduler-pipes
       rather than a lack of a CU or SIMD with sufficient resources.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0700_wavefront.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0700_wavefront.yaml
index e9e9407cfc..dfc968df65 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0700_wavefront.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0700_wavefront.yaml
@@ -121,26 +121,26 @@ Panel Config:
     Workgroup Size: The total number of work-items (or, threads) in each workgroup
       (or, block) launched as part of the kernel dispatch. In HIP, this is equivalent
       to the total block size.
-    Total Wavefronts: |-
+    Total Wavefronts: >-
       The total number of wavefronts launched as part of the kernel dispatch.
       On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront
       size is always 64 work-items. Thus, the total number of wavefronts should
       be equivalent to the ceiling of grid size divided by 64.
     Saved Wavefronts: The total number of wavefronts saved at a context-save.
     Restored Wavefronts: The total number of wavefronts restored from a context-save.
-    VGPRs: |-
+    VGPRs: >-
       The number of architected vector general-purpose registers allocated
       for the kernel, see VALU. Note: this may not exactly match the number of VGPRs
       requested by the compiler due to allocation granularity.
-    AGPRs: |-
+    AGPRs: >-
       The number of accumulation vector general-purpose registers allocated
       for the kernel, see AGPRs. Note: this may not exactly match the number of
       AGPRs requested by the compiler due to allocation granularity.
-    SGPRs: |-
+    SGPRs: >-
       The number of scalar general-purpose registers allocated for the kernel,
       see SALU. Note: this may not exactly match the number of SGPRs requested by
       the compiler due to allocation granularity.
-    LDS Allocation: |-
+    LDS Allocation: >-
       The number of bytes of LDS memory (or, shared memory) allocated for
       this kernel. Note: This may also be larger than what was requested at compile
       time due to both allocation granularity and dynamic per-dispatch LDS allocations.
@@ -173,7 +173,7 @@ Panel Config:
       rather than identification of a precise limiter. The sum of this metric, Issue
       Wait Cycles and Active Wait Cycles should be equal to the total Wave Cycles
       metric.
-    Wavefront Occupancy: |-
+    Wavefront Occupancy: >-
       The time-averaged number of wavefronts resident on the accelerator over
       the lifetime of the kernel. Note: this metric may be inaccurate for short-running
       kernels (less than 1ms).
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1000_compute_units_instruction_mix.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1000_compute_units_instruction_mix.yaml
index 768fe6548b..823f543182 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1000_compute_units_instruction_mix.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1000_compute_units_instruction_mix.yaml
@@ -273,7 +273,7 @@ Panel Config:
       floating-point operands issued to the VALU per normalization unit.
     F64-Trans: The total number of transcendental instructions (such as sqrt) operating
       on 64-bit floating-point operands issued to the VALU per normalization unit.
-    Conversion: |-
+    Conversion: >-
       The total number of type conversion instructions (such as converting
       data to or from F32\u2194F64) issued to the VALU per normalization unit.
     Global/Generic Instr: The total number of global & generic memory instructions
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1100_compute_units_compute_pipeline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1100_compute_units_compute_pipeline.yaml
index 5e6ceb654f..d9a4abced9 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1100_compute_units_compute_pipeline.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1100_compute_units_compute_pipeline.yaml
@@ -251,37 +251,37 @@ Panel Config:
           max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
           unit: (OPs + $normUnit)
   metrics_description:
-    VALU FLOPs: |-
+    VALU FLOPs: >-
       The total floating-point operations executed per second on the VALU.
       This is also presented as a percent of the peak theoretical FLOPs achievable
       on the specific accelerator. Note: this does not include any floating-point
       operations from MFMA instructions.
-    VALU IOPs: |-
+    VALU IOPs: >-
       The total integer operations executed per second on the VALU. This is
       also presented as a percent of the peak theoretical IOPs achievable on the
       specific accelerator. Note: this does not include any integer operations from
       MFMA instructions.
-    MFMA FLOPs (BF16): |-
+    MFMA FLOPs (BF16): >-
       The total number of 16-bit brain floating point MFMA operations executed
       per second. Note: this does not include any 16-bit brain floating point operations
       from VALU instructions. This is also presented as a percent of the peak theoretical
       BF16 MFMA operations achievable on the specific accelerator.
-    MFMA FLOPs (F16): |-
+    MFMA FLOPs (F16): >-
       The total number of 16-bit floating point MFMA operations executed per
       second. Note: this does not include any 16-bit floating point operations from
       VALU instructions. This is also presented as a percent of the peak theoretical
       F16 MFMA operations achievable on the specific accelerator.
-    MFMA FLOPs (F32): |-
+    MFMA FLOPs (F32): >-
       The total number of 32-bit floating point MFMA operations executed per
       second. Note: this does not include any 32-bit floating point operations from
       VALU instructions. This is also presented as a percent of the peak theoretical
       F32 MFMA operations achievable on the specific accelerator.
-    MFMA FLOPs (F64): |-
+    MFMA FLOPs (F64): >-
       The total number of 64-bit floating point MFMA operations executed per
       second. Note: this does not include any 64-bit floating point operations from
       VALU instructions. This is also presented as a percent of the peak theoretical
       F64 MFMA operations achievable on the specific accelerator.
-    MFMA IOPs (INT8): |-
+    MFMA IOPs (INT8): >-
       The total number of 8-bit integer MFMA operations executed per second.
       Note: this does not include any 8-bit integer operations from VALU instructions.
       This is also presented as a percent of the peak theoretical INT8 MFMA operations
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1200_local_data_share_lds.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1200_local_data_share_lds.yaml
index b7767fea16..8e602e9f83 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1200_local_data_share_lds.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1200_local_data_share_lds.yaml
@@ -140,7 +140,7 @@ Panel Config:
       unit.
     Unaligned Stall: The total number of cycles spent in the LDS scheduler due to
       stalls from non-dword aligned addresses per normalization unit.
-    Mem Violations: |-
+    Mem Violations: >-
       The total number of out-of-bounds accesses made to the LDS, per normalization
       unit. This is unused and expected to be zero in most configurations for
       modern CDNA\u2122 accelerators.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1300_instruction_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1300_instruction_cache.yaml
index 35808d9d96..ffc6e890b9 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1300_instruction_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1300_instruction_cache.yaml
@@ -92,7 +92,7 @@ Panel Config:
     Cache Hit Rate: The percent of L1I requests that hit [#l1i-cache]_ on a previously
-      loaded line the cache. Calculated as the ratio of the number of L1I requests
+      loaded line in the cache. Calculated as the ratio of the number of L1I requests
       that hit over the number of all L1I requests.
-    L1I-L2 Bandwidth Utilization: |-
+    L1I-L2 Bandwidth Utilization: >-
       The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth
       achieved. Calculated as the ratio of the total number of requests from the
       L1I to the L2 cache over the total L1I-L2 interface cycles.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1400_scalar_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1400_scalar_l1_data_cache.yaml
index 6b73164848..be2ce0db79 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1400_scalar_l1_data_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1400_scalar_l1_data_cache.yaml
@@ -154,7 +154,7 @@ Panel Config:
     sL1D-L2 BW Utilization: The percentage of the peak theoretical sL1D - L2 interface
-      bandwidth acheived. Calculated as total number of bytes read from, written to,
+      bandwidth achieved. Calculated as total number of bytes read from, written to,
       or atomically updated across the sL1D - L2 interface.
-    sL1D-L2 BW: |-
+    sL1D-L2 BW: >-
       The total number of bytes read from, written to, or atomically updated
       across the sL1D\u2194L2 interface, divided by total duration. Note that sL1D
       writes and atomics are typically unused on current CDNA accelerators, so
@@ -164,7 +164,7 @@ Panel Config:
       unit.
     Hits: The total number of sL1D requests that hit on a previously loaded cache
       line, per normalization unit.
-    Misses - Non Duplicated: |-
+    Misses - Non Duplicated: >-
       The total number of sL1D requests that missed on a cache line that was
       not already pending due to another request, per normalization unit.
     Misses- Duplicated: The total number of sL1D requests that missed on a cache line
@@ -187,6 +187,6 @@ Panel Config:
       unit.
     Write Req: The total number of write requests from sL1D to the L2, per normalization
       unit. Typically unused on current CDNA accelerators.
-    Stall Cycles: |-
+    Stall Cycles: >-
       The total number of cycles the sL1D\u2194L2 interface was stalled, per
       normalization unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1600_vector_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1600_vector_l1_data_cache.yaml
index 3125397a30..9111e85921 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1600_vector_l1_data_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1600_vector_l1_data_cache.yaml
@@ -398,7 +398,7 @@ Panel Config:
       per normalization unit.
     Translation Misses: The total number of translation requests that missed in the
       UTCL1 due to translation not being present in the cache, per normalization unit.
-    Permission Misses: |-
+    Permission Misses: >-
       The total number of translation requests that missed in the UTCL1 due
       to a permission error, per normalization unit. This is unused and expected
       to be zero in most configurations for modern CDNA\u2122 accelerators.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0200_system_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0200_system_speed_of_light.yaml
index 8aa72cb25d..5f76eb8937 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0200_system_speed_of_light.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0200_system_speed_of_light.yaml
@@ -227,12 +227,12 @@ Panel Config:
           pop: None
           coll_level: SQ_IFETCH_LEVEL
   metrics_description:
-    VALU FLOPs: |-
+    VALU FLOPs: >-
       The total floating-point operations executed per second on the VALU.
       This is also presented as a percent of the peak theoretical FLOPs achievable
       on the specific accelerator. Note: this does not include any floating-point
       operations from MFMA instructions.
-    VALU IOPs: |-
+    VALU IOPs: >-
       The total integer operations executed per second on the VALU. This is
       also presented as a percent of the peak theoretical IOPs achievable on the
       specific accelerator. Note: this does not include any integer operations from
@@ -242,27 +242,27 @@ Panel Config:
       from VALU instructions. This is also presented as a percent of the peak theoretical
       F8 MFMA operations achievable on the specific accelerator. It is supported on
       AMD Instinct MI300 series and later only.
-    MFMA FLOPs (BF16): |-
+    MFMA FLOPs (BF16): >-
       The total number of 16-bit brain floating point MFMA operations executed
       per second. Note: this does not include any 16-bit brain floating point operations
       from VALU instructions. This is also presented as a percent of the peak theoretical
       BF16 MFMA operations achievable on the specific accelerator.
-    MFMA FLOPs (F16): |-
+    MFMA FLOPs (F16): >-
       The total number of 16-bit floating point MFMA operations executed per
       second. Note: this does not include any 16-bit floating point operations from
       VALU instructions. This is also presented as a percent of the peak theoretical
       F16 MFMA operations achievable on the specific accelerator.
-    MFMA FLOPs (F32): |-
+    MFMA FLOPs (F32): >-
       The total number of 32-bit floating point MFMA operations executed per
       second. Note: this does not include any 32-bit floating point operations from
       VALU instructions. This is also presented as a percent of the peak theoretical
       F32 MFMA operations achievable on the specific accelerator.
-    MFMA FLOPs (F64): |-
+    MFMA FLOPs (F64): >-
       The total number of 64-bit floating point MFMA operations executed per
       second. Note: this does not include any 64-bit floating point operations from
       VALU instructions. This is also presented as a percent of the peak theoretical
       F64 MFMA operations achievable on the specific accelerator.
-    MFMA IOPs (Int8): |-
+    MFMA IOPs (Int8): >-
       The total number of 8-bit integer MFMA operations executed per second.
       Note: this does not include any 8-bit integer operations from VALU instructions.
       This is also presented as a percent of the peak theoretical INT8 MFMA operations
@@ -295,7 +295,7 @@ Panel Config:
     IPC: The ratio of the total number of instructions executed on the CU over the
       total active CU cycles. This is also presented as a percent of the peak theoretical
       bandwidth achievable on the specific accelerator.
-    Wavefront Occupancy: |-
+    Wavefront Occupancy: >-
       The time-averaged number of wavefronts resident on the accelerator over
       the lifetime of the kernel. Note: this metric may be inaccurate for short-running
       kernels (less than 1ms). This is also presented as a percent of the peak theoretical
@@ -328,7 +328,7 @@ Panel Config:
       if only a single value is requested in a cache line, the data movement will
       still be counted as a full cache line. This is also presented as a percent of
       the peak theoretical bandwidth achievable on the specific accelerator.
-    L2-Fabric Read BW: |-
+    L2-Fabric Read BW: >-
       The number of bytes read by the L2 over the Infinity Fabric\u2122 interface
       per unit time. This is also presented as a percent of the peak theoretical
       bandwidth achievable on the specific accelerator.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0300_memory_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0300_memory_chart.yaml
index b13053c1f7..81ce3c2e68 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0300_memory_chart.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0300_memory_chart.yaml
@@ -162,15 +162,15 @@ Panel Config:
     Active CUs: Total number of active compute units (CUs) on the accelerator during
       the kernel execution.
     Num CUs: Total number of compute units (CUs) on the accelerator.
-    VGPR: |-
+    VGPR: >-
       The number of architected vector general-purpose registers allocated
       for the kernel, see VALU. Note: this may not exactly match the number of VGPRs
       requested by the compiler due to allocation granularity.
-    SGPR: |-
+    SGPR: >-
       The number of scalar general-purpose registers allocated for the kernel,
       see SALU. Note: this may not exactly match the number of SGPRs requested by
       the compiler due to allocation granularity.
-    LDS Allocation: |-
+    LDS Allocation: >-
       The number of bytes of LDS memory (or, shared memory) allocated for
       this kernel. Note: This may also be larger than what was requested at compile
       time due to both allocation granularity and dynamic per-dispatch LDS allocations.
@@ -252,7 +252,7 @@ Panel Config:
       or data (atomic with return value) was returned to the L2.
     HBM Rd: The total number of L2 requests to Infinity Fabric to read 32B or 64B
       of data from the accelerator's local HBM, per normalization unit.
-    HBM Wr: |-
+    HBM Wr: >-
       The total number of L2 requests to Infinity Fabric to write or atomically
       update 32B or 64B of data in the accelerator's local HBM, per normalization
       unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline.yaml
index fe6389ef3b..6d3980f223 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline.yaml
@@ -140,17 +140,17 @@ Panel Config:
             * 512) ) / (SUM(End_Timestamp - Start_Timestamp) / 1e9) ) / 1e9
           unit: GFLOP/s
   metrics_description:
-    VALU FLOPs (F16): |-
+    VALU FLOPs (F16): >-
       The total 16-bit floating-point operations executed per second on the VALU.
       This is presented with the value of the peak empirical F16 FLOPs achievable
       on the specific accelerator. Note: this does not include any F16 operations
       from MFMA instructions.
-    VALU FLOPs (F32): |-
+    VALU FLOPs (F32): >-
       The total 32-bit floating-point operations executed per second on the VALU.
       This is presented with the value of the peak empirical F32 FLOPs achievable
       on the specific accelerator. Note: this does not include any F32 operations
       from MFMA instructions.
-    VALU FLOPs (F64): |-
+    VALU FLOPs (F64): >-
       The total 64-bit floating-point operations executed per second on the VALU.
       This is presented with the value of the peak empirical F64 FLOPs achievable
       on the specific accelerator. Note: this does not include any F64 operations
@@ -160,33 +160,33 @@ Panel Config:
       from VALU instructions. The peak empirically measured F8 MFMA operations achievable
       on the specific accelerator is displayed alongside for comparison. It is supported
       on AMD Instinct MI300 series and later only.
-    MFMA FLOPs (BF16): |-
+    MFMA FLOPs (BF16): >-
       The total number of 16-bit brain floating point MFMA operations executed
       per second. Note: this does not include any 16-bit brain floating point
       operations from VALU instructions. The peak empirically measured BF16 MFMA
       operations achievable on the specific accelerator is displayed alongside
       for comparison.
-    MFMA FLOPs (F16): |-
+    MFMA FLOPs (F16): >-
       The total number of 16-bit floating point MFMA operations executed per
       second. Note: this does not include any 16-bit floating point operations from
       VALU instructions. The peak empirically measured F16 MFMA operations
       achievable on the specific accelerator is displayed alongside for comparison.
-    MFMA FLOPs (F32): |-
+    MFMA FLOPs (F32): >-
       The total number of 32-bit floating point MFMA operations executed per
       second. Note: this does not include any 32-bit floating point operations from
       VALU instructions. The peak empirically measured F32 MFMA operations
       achievable on the specific accelerator is displayed alongside for comparison.
-    MFMA FLOPs (F64): |-
+    MFMA FLOPs (F64): >-
       The total number of 64-bit floating point MFMA operations executed per
       second. Note: this does not include any 64-bit floating point operations from
       VALU instructions. The peak empirically measured F64 MFMA operations
       achievable on the specific accelerator is displayed alongside for comparison.
-    MFMA IOPs (Int8): |-
+    MFMA IOPs (Int8): >-
       The total number of 8-bit integer MFMA operations executed per second.
       Note: this does not include any 8-bit integer operations from VALU instructions.
       The peak empirically measured INT8 MFMA operations achievable on the specific
       accelerator is displayed alongside for comparison.
-    HBM Bandwidth: |-
+    HBM Bandwidth: >-
       The total number of bytes read from and written to High-Bandwidth
       Memory (HBM) per second. The peak empirically measured bandwidth achievable
       on the specific accelerator is displayed alongside for comparison.
@@ -207,22 +207,22 @@ Panel Config:
       from, stored to, or atomically updated in the LDS per unit time (see LDS Bandwidth
       example for more detail). The peak empirically measured LDS bandwidth achievable
       on the specific accelerator is displayed alongside for comparison.
-    AI L1: |-
+    AI L1: >-
       The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
       of total floating-point operations (FLOPs) to total bytes transferred between
       the L1 cache and the processing units. This value is used as the x-coordinate
       for the L1 roofline.
-    AI L2: |-
+    AI L2: >-
       The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
       of total floating-point operations (FLOPs) to total bytes transferred between
       the L2 cache and the L1 cache. This value is used as the x-coordinate for
       the L2 roofline.
-    AI HBM: |-
+    AI HBM: >-
       The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
       It is the ratio of total floating-point operations (FLOPs) to total bytes
       transferred between HBM and the L2 cache. This value is used as the x-coordinate
       for the HBM roofline.
-    Performance (GFLOPs): |-
+    Performance (GFLOPs): >-
       The overall achieved performance, measured in GigaFLOPs
       per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
       operations divided by the total execution time. This value is used as the y-coordinate
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0500_command_processor_cpc_cpf.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0500_command_processor_cpc_cpf.yaml
index 118ce18331..411c4c803b 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0500_command_processor_cpc_cpf.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0500_command_processor_cpc_cpf.yaml
@@ -141,6 +141,6 @@ Panel Config:
       the CPC-L2 interface was active doing any work.
     CPC-UTCL1 Stall: Percent of CPC busy cycles where the CPC was stalled by address
       translation
-    CPC-UTCL2 Utilization: |-
+    CPC-UTCL2 Utilization: >-
       Percent of total cycles counted by the CPC's L2 address translation
       interface where the CPC was busy doing address translation work.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0600_workgroup_manager_spi.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0600_workgroup_manager_spi.yaml
index eb9845aa82..ca033cbdd7 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0600_workgroup_manager_spi.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0600_workgroup_manager_spi.yaml
@@ -168,7 +168,7 @@ Panel Config:
       in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck
       within the workgroup manager rather than a lack of a CU or SIMD with sufficient
       resources.
-    Not-scheduled Rate (Scheduler-Pipe): |-
+    Not-scheduled Rate (Scheduler-Pipe): >-
       The percent of total scheduler-pipe cycles in the kernel where a workgroup
       could not be scheduled to a CU due to a bottleneck within the scheduler-pipes
       rather than a lack of a CU or SIMD with sufficient resources.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0700_wavefront.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0700_wavefront.yaml
index e9e9407cfc..dfc968df65 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0700_wavefront.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0700_wavefront.yaml
@@ -121,26 +121,26 @@ Panel Config:
     Workgroup Size: The total number of work-items (or, threads) in each workgroup
       (or, block) launched as part of the kernel dispatch. In HIP, this is equivalent
       to the total block size.
-    Total Wavefronts: |-
+    Total Wavefronts: >-
       The total number of wavefronts launched as part of the kernel dispatch.
       On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront
       size is always 64 work-items. Thus, the total number of wavefronts should
       be equivalent to the ceiling of grid size divided by 64.
     Saved Wavefronts: The total number of wavefronts saved at a context-save.
     Restored Wavefronts: The total number of wavefronts restored from a context-save.
-    VGPRs: |-
+    VGPRs: >-
       The number of architected vector general-purpose registers allocated
       for the kernel, see VALU. Note: this may not exactly match the number of VGPRs
       requested by the compiler due to allocation granularity.
-    AGPRs: |-
+    AGPRs: >-
       The number of accumulation vector general-purpose registers allocated
       for the kernel, see AGPRs. Note: this may not exactly match the number of
       AGPRs requested by the compiler due to allocation granularity.
-    SGPRs: |-
+    SGPRs: >-
       The number of scalar general-purpose registers allocated for the kernel,
       see SALU. Note: this may not exactly match the number of SGPRs requested by
       the compiler due to allocation granularity.
-    LDS Allocation: |-
+    LDS Allocation: >-
       The number of bytes of LDS memory (or, shared memory) allocated for
       this kernel. Note: This may also be larger than what was requested at compile
       time due to both allocation granularity and dynamic per-dispatch LDS allocations.
@@ -173,7 +173,7 @@ Panel Config:
       rather than identification of a precise limiter. The sum of this metric, Issue
       Wait Cycles and Active Wait Cycles should be equal to the total Wave Cycles
       metric.
-    Wavefront Occupancy: |-
+    Wavefront Occupancy: >-
       The time-averaged number of wavefronts resident on the accelerator over
       the lifetime of the kernel. Note: this metric may be inaccurate for short-running
       kernels (less than 1ms).
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1000_compute_units_instruction_mix.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1000_compute_units_instruction_mix.yaml
index 768fe6548b..823f543182 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1000_compute_units_instruction_mix.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1000_compute_units_instruction_mix.yaml
@@ -273,7 +273,7 @@ Panel Config:
       floating-point operands issued to the VALU per normalization unit.
     F64-Trans: The total number of transcendental instructions (such as sqrt) operating
       on 64-bit floating-point operands issued to the VALU per normalization unit.
-    Conversion: |-
+    Conversion: >-
       The total number of type conversion instructions (such as converting
       data to or from F32\u2194F64) issued to the VALU per normalization unit.
     Global/Generic Instr: The total number of global & generic memory instructions
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1100_compute_units_compute_pipeline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1100_compute_units_compute_pipeline.yaml
index 5e6ceb654f..d9a4abced9 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1100_compute_units_compute_pipeline.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1100_compute_units_compute_pipeline.yaml
@@ -251,37 +251,37 @@ Panel Config:
           max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
           unit: (OPs + $normUnit)
   metrics_description:
-    VALU FLOPs: |-
+    VALU FLOPs: >-
       The total floating-point operations executed per second on the VALU.
       This is also presented as a percent of the peak theoretical FLOPs achievable
       on the specific accelerator. Note: this does not include any floating-point
       operations from MFMA instructions.
-    VALU IOPs: |-
+    VALU IOPs: >-
       The total integer operations executed per second on the VALU. This is
       also presented as a percent of the peak theoretical IOPs achievable on the
       specific accelerator. Note: this does not include any integer operations from
       MFMA instructions.
-    MFMA FLOPs (BF16): |-
+    MFMA FLOPs (BF16): >-
       The total number of 16-bit brain floating point MFMA operations executed
       per second. Note: this does not include any 16-bit brain floating point operations
       from VALU instructions. This is also presented as a percent of the peak theoretical
       BF16 MFMA operations achievable on the specific accelerator.
-    MFMA FLOPs (F16): |-
+    MFMA FLOPs (F16): >-
       The total number of 16-bit floating point MFMA operations executed per
       second. Note: this does not include any 16-bit floating point operations from
       VALU instructions. This is also presented as a percent of the peak theoretical
       F16 MFMA operations achievable on the specific accelerator.
-    MFMA FLOPs (F32): |-
+    MFMA FLOPs (F32): >-
       The total number of 32-bit floating point MFMA operations executed per
       second. Note: this does not include any 32-bit floating point operations from
       VALU instructions. This is also presented as a percent of the peak theoretical
       F32 MFMA operations achievable on the specific accelerator.
-    MFMA FLOPs (F64): |-
+    MFMA FLOPs (F64): >-
       The total number of 64-bit floating point MFMA operations executed per
       second. Note: this does not include any 64-bit floating point operations from
       VALU instructions. This is also presented as a percent of the peak theoretical
       F64 MFMA operations achievable on the specific accelerator.
-    MFMA IOPs (INT8): |-
+    MFMA IOPs (INT8): >-
       The total number of 8-bit integer MFMA operations executed per second.
       Note: this does not include any 8-bit integer operations from VALU instructions.
       This is also presented as a percent of the peak theoretical INT8 MFMA operations
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1200_local_data_share_lds.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1200_local_data_share_lds.yaml
index b7767fea16..8e602e9f83 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1200_local_data_share_lds.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1200_local_data_share_lds.yaml
@@ -140,7 +140,7 @@ Panel Config:
       unit.
     Unaligned Stall: The total number of cycles spent in the LDS scheduler due to
       stalls from non-dword aligned addresses per normalization unit.
-    Mem Violations: |-
+    Mem Violations: >-
       The total number of out-of-bounds accesses made to the LDS, per normalization
       unit. This is unused and expected to be zero in most configurations for
       modern CDNA\u2122 accelerators.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1300_instruction_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1300_instruction_cache.yaml
index 35808d9d96..ffc6e890b9 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1300_instruction_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1300_instruction_cache.yaml
@@ -92,7 +92,7 @@ Panel Config:
     Cache Hit Rate: The percent of L1I requests that hit [#l1i-cache]_ on a previously
-      loaded line the cache. Calculated as the ratio of the number of L1I requests
+      loaded line in the cache. Calculated as the ratio of the number of L1I requests
       that hit over the number of all L1I requests.
-    L1I-L2 Bandwidth Utilization: |-
+    L1I-L2 Bandwidth Utilization: >-
       The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth
       achieved. Calculated as the ratio of the total number of requests from the
       L1I to the L2 cache over the total L1I-L2 interface cycles.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1400_scalar_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1400_scalar_l1_data_cache.yaml
index 6b73164848..be2ce0db79 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1400_scalar_l1_data_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1400_scalar_l1_data_cache.yaml
@@ -154,7 +154,7 @@ Panel Config:
     sL1D-L2 BW Utilization: The percentage of the peak theoretical sL1D - L2 interface
-      bandwidth acheived. Calculated as total number of bytes read from, written to,
+      bandwidth achieved. Calculated as total number of bytes read from, written to,
       or atomically updated across the sL1D - L2 interface.
-    sL1D-L2 BW: |-
+    sL1D-L2 BW: >-
       The total number of bytes read from, written to, or atomically updated
       across the sL1D\u2194L2 interface, divided by total duration. Note that sL1D
       writes and atomics are typically unused on current CDNA accelerators, so
@@ -164,7 +164,7 @@ Panel Config:
       unit.
     Hits: The total number of sL1D requests that hit on a previously loaded cache
       line, per normalization unit.
-    Misses - Non Duplicated: |-
+    Misses - Non Duplicated: >-
       The total number of sL1D requests that missed on a cache line that was
       not already pending due to another request, per normalization unit.
     Misses- Duplicated: The total number of sL1D requests that missed on a cache line
@@ -187,6 +187,6 @@ Panel Config:
       unit.
     Write Req: The total number of write requests from sL1D to the L2, per normalization
       unit. Typically unused on current CDNA accelerators.
-    Stall Cycles: |-
+    Stall Cycles: >-
       The total number of cycles the sL1D\u2194L2 interface was stalled, per
       normalization unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1600_vector_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1600_vector_l1_data_cache.yaml
index 3125397a30..9111e85921 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1600_vector_l1_data_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1600_vector_l1_data_cache.yaml
@@ -398,7 +398,7 @@ Panel Config:
       per normalization unit.
     Translation Misses: The total number of translation requests that missed in the
       UTCL1 due to translation not being present in the cache, per normalization unit.
-    Permission Misses: |-
+    Permission Misses: >-
       The total number of translation requests that missed in the UTCL1 due
       to a permission error, per normalization unit. This is unused and expected
       to be zero in most configurations for modern CDNA\u2122 accelerators.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0200_system_speed_of_light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0200_system_speed_of_light.yaml
index bdbd62f755..225c5cf149 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0200_system_speed_of_light.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0200_system_speed_of_light.yaml
@@ -233,12 +233,12 @@ Panel Config:
           pop: None
           coll_level: SQ_IFETCH_LEVEL
   metrics_description:
-    VALU FLOPs: |-
+    VALU FLOPs: >-
       The total floating-point operations executed per second on the VALU.
       This is also presented as a percent of the peak theoretical FLOPs achievable
       on the specific accelerator. Note: this does not include any floating-point
       operations from MFMA instructions.
-    VALU IOPs: |-
+    VALU IOPs: >-
       The total integer operations executed per second on the VALU. This is
       also presented as a percent of the peak theoretical IOPs achievable on the
       specific accelerator. Note: this does not include any integer operations from
@@ -248,27 +248,27 @@ Panel Config:
       from VALU instructions. This is also presented as a percent of the peak theoretical
       F8 MFMA operations achievable on the specific accelerator. It is supported on
       AMD Instinct MI300 series and later only.
-    MFMA FLOPs (BF16): |-
+    MFMA FLOPs (BF16): >-
       The total number of 16-bit brain floating point MFMA operations executed
       per second. Note: this does not include any 16-bit brain floating point operations
       from VALU instructions. This is also presented as a percent of the peak theoretical
       BF16 MFMA operations achievable on the specific accelerator.
-    MFMA FLOPs (F16): |-
+    MFMA FLOPs (F16): >-
       The total number of 16-bit floating point MFMA operations executed per
       second. Note: this does not include any 16-bit floating point operations from
       VALU instructions. This is also presented as a percent of the peak theoretical
       F16 MFMA operations achievable on the specific accelerator.
-    MFMA FLOPs (F32): |-
+    MFMA FLOPs (F32): >-
       The total number of 32-bit floating point MFMA operations executed per
       second. Note: this does not include any 32-bit floating point operations from
       VALU instructions. This is also presented as a percent of the peak theoretical
       F32 MFMA operations achievable on the specific accelerator.
-    MFMA FLOPs (F64): |-
+    MFMA FLOPs (F64): >-
       The total number of 64-bit floating point MFMA operations executed per
       second. Note: this does not include any 64-bit floating point operations from
       VALU instructions. This is also presented as a percent of the peak theoretical
       F64 MFMA operations achievable on the specific accelerator.
-    MFMA IOPs (Int8): |-
+    MFMA IOPs (Int8): >-
       The total number of 8-bit integer MFMA operations executed per second.
       Note: this does not include any 8-bit integer operations from VALU instructions.
       This is also presented as a percent of the peak theoretical INT8 MFMA operations
@@ -301,7 +301,7 @@ Panel Config:
     IPC: The ratio of the total number of instructions executed on the CU over the
       total active CU cycles. This is also presented as a percent of the peak theoretical
       bandwidth achievable on the specific accelerator.
-    Wavefront Occupancy: |-
+    Wavefront Occupancy: >-
       The time-averaged number of wavefronts resident on the accelerator over
       the lifetime of the kernel. Note: this metric may be inaccurate for short-running
       kernels (less than 1ms). This is also presented as a percent of the peak theoretical
@@ -334,7 +334,7 @@ Panel Config:
       if only a single value is requested in a cache line, the data movement will
       still be counted as a full cache line. This is also presented as a percent of
       the peak theoretical bandwidth achievable on the specific accelerator.
-    L2-Fabric Read BW: |-
+    L2-Fabric Read BW: >-
       The number of bytes read by the L2 over the Infinity Fabric\u2122 interface
       per unit time. This is also presented as a percent of the peak theoretical
       bandwidth achievable on the specific accelerator.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0300_memory_chart.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0300_memory_chart.yaml
index 081d5654df..9d3e26ebb2 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0300_memory_chart.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0300_memory_chart.yaml
@@ -172,15 +172,15 @@ Panel Config:
     Active CUs: Total number of active compute units (CUs) on the accelerator during
       the kernel execution.
     Num CUs: Total number of compute units (CUs) on the accelerator.
-    VGPR: |-
+    VGPR: >-
       The number of architected vector general-purpose registers allocated
       for the kernel, see VALU. Note: this may not exactly match the number of VGPRs
       requested by the compiler due to allocation granularity.
-    SGPR: |-
+    SGPR: >-
       The number of scalar general-purpose registers allocated for the kernel,
       see SALU. Note: this may not exactly match the number of SGPRs requested by
       the compiler due to allocation granularity.
-    LDS Allocation: |-
+    LDS Allocation: >-
       The number of bytes of LDS memory (or, shared memory) allocated for
       this kernel. Note: This may also be larger than what was requested at compile
       time due to both allocation granularity and dynamic per-dispatch LDS allocations.
@@ -268,7 +268,7 @@ Panel Config:
       or data (atomic with return value) was returned to the L2.
     HBM Rd: The total number of L2 requests to Infinity Fabric to read 32B or 64B
       of data from the accelerator's local HBM, per normalization unit.
-    HBM Wr: |-
+    HBM Wr: >-
       The total number of L2 requests to Infinity Fabric to write or atomically
       update 32B or 64B of data in the accelerator's local HBM, per normalization
       unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml
index 0a4b3be693..83240870e8 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml
@@ -148,17 +148,17 @@ Panel Config:
             Start_Timestamp) / 1e9) ) / 1e9
           unit: GFLOP/s
   metrics_description:
-    VALU FLOPs (F16): |-
+    VALU FLOPs (F16): >-
       The total 16-bit floating-point operations executed per second on the VALU.
       This is presented with the value of the peak empirical F16 FLOPs achievable
       on the specific accelerator. Note: this does not include any F16 operations
       from MFMA instructions.
-    VALU FLOPs (F32): |-
+    VALU FLOPs (F32): >-
       The total 32-bit floating-point operations executed per second on the VALU.
       This is presented with the value of the peak empirical F32 FLOPs achievable
       on the specific accelerator. Note: this does not include any F32 operations
       from MFMA instructions.
-    VALU FLOPs (F64): |-
+    VALU FLOPs (F64): >-
       The total 64-bit floating-point operations executed per second on the VALU.
       This is presented with the value of the peak empirical F64 FLOPs achievable
       on the specific accelerator. Note: this does not include any F64 operations
@@ -168,39 +168,39 @@ Panel Config:
       from VALU instructions. The peak empirically measured F8 MFMA operations achievable
       on the specific accelerator is displayed alongside for comparison. It is supported
       on AMD Instinct MI300 series and later only.
-    MFMA FLOPs (BF16): |-
+    MFMA FLOPs (BF16): >-
       The total number of 16-bit brain floating point MFMA operations executed
       per second. Note: this does not include any 16-bit brain floating point
       operations from VALU instructions. The peak empirically measured BF16 MFMA
       operations achievable on the specific accelerator is displayed alongside
       for comparison.
-    MFMA FLOPs (F16): |-
+    MFMA FLOPs (F16): >-
       The total number of 16-bit floating point MFMA operations executed per
       second. Note: this does not include any 16-bit floating point operations from
       VALU instructions. The peak empirically measured F16 MFMA operations
       achievable on the specific accelerator is displayed alongside for comparison.
-    MFMA FLOPs (F32): |-
+    MFMA FLOPs (F32): >-
       The total number of 32-bit floating point MFMA operations executed per
       second. Note: this does not include any 32-bit floating point operations from
       VALU instructions. The peak empirically measured F32 MFMA operations
       achievable on the specific accelerator is displayed alongside for comparison.
-    MFMA FLOPs (F64): |-
+    MFMA FLOPs (F64): >-
       The total number of 64-bit floating point MFMA operations executed per
       second. Note: this does not include any 64-bit floating point operations from
       VALU instructions. The peak empirically measured F64 MFMA operations
       achievable on the specific accelerator is displayed alongside for comparison.
-    MFMA FLOPs (F6F4): |-
+    MFMA FLOPs (F6F4): >-
       The total number of 4-bit and 6-bit floating point MFMA operations executed
       per second. Note: this does not include any floating point operations from
       VALU instructions. The peak empirically measured F6F4 MFMA operations
       achievable on the specific accelerator is displayed alongside for comparison.
       It is supported on AMD Instinct MI350 series (gfx950) and later only.
-    MFMA IOPs (Int8): |-
+    MFMA IOPs (Int8): >-
       The total number of 8-bit integer MFMA operations executed per second.
       Note: this does not include any 8-bit integer operations from VALU instructions.
       The peak empirically measured INT8 MFMA operations achievable on the specific
       accelerator is displayed alongside for comparison.
-    HBM Bandwidth: |-
+    HBM Bandwidth: >-
       The total number of bytes read from and written to High-Bandwidth
       Memory (HBM) per second. The peak empirically measured bandwidth achievable
       on the specific accelerator is displayed alongside for comparison.
@@ -221,22 +221,22 @@ Panel Config:
       from, stored to, or atomically updated in the LDS per unit time (see LDS Bandwidth
       example for more detail). The peak empirically measured LDS bandwidth achievable
       on the specific accelerator is displayed alongside for comparison.
-    AI L1: |-
+    AI L1: >-
       The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
       of total floating-point operations (FLOPs) to total bytes transferred between
       the L1 cache and the processing units. This value is used as the x-coordinate
       for the L1 roofline.
-    AI L2: |-
+    AI L2: >-
       The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
       of total floating-point operations (FLOPs) to total bytes transferred between
       the L2 cache and the L1 cache. This value is used as the x-coordinate for
       the L2 roofline.
-    AI HBM: |-
+    AI HBM: >-
       The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
       It is the ratio of total floating-point operations (FLOPs) to total bytes
       transferred between HBM and the L2 cache. This value is used as the x-coordinate
       for the HBM roofline.
-    Performance (GFLOPs): |-
+    Performance (GFLOPs): >-
       The overall achieved performance, measured in GigaFLOPs
       per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
       operations divided by the total execution time. This value is used as the y-coordinate
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0500_command_processor_cpc_cpf.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0500_command_processor_cpc_cpf.yaml
index 58699ebb18..c4b9f8ad90 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0500_command_processor_cpc_cpf.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0500_command_processor_cpc_cpf.yaml
@@ -162,6 +162,6 @@ Panel Config:
       the CPC-L2 interface was active doing any work.
     CPC-UTCL1 Stall: Percent of CPC busy cycles where the CPC was stalled by address
       translation
-    CPC-UTCL2 Utilization: |-
+    CPC-UTCL2 Utilization: >-
       Percent of total cycles counted by the CPC's L2 address translation
       interface where the CPC was busy doing address translation work.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0600_workgroup_manager_spi.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0600_workgroup_manager_spi.yaml
index 02ed4b3d9c..7b2b017674 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0600_workgroup_manager_spi.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0600_workgroup_manager_spi.yaml
@@ -204,7 +204,7 @@ Panel Config:
       in the kernel where a workgroup could not be scheduled to a CU due to a bottleneck
       within the workgroup manager rather than a lack of a CU or SIMD with sufficient
       resources.
-    Not-scheduled Rate (Scheduler-Pipe): |-
+    Not-scheduled Rate (Scheduler-Pipe): >-
       The percent of total scheduler-pipe cycles in the kernel where a workgroup
       could not be scheduled to a CU due to a bottleneck within the scheduler-pipes
       rather than a lack of a CU or SIMD with sufficient resources.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0700_wavefront.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0700_wavefront.yaml
index bd6ca38642..3ac9d30c79 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0700_wavefront.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0700_wavefront.yaml
@@ -121,26 +121,26 @@ Panel Config:
     Workgroup Size: The total number of work-items (or, threads) in each workgroup
       (or, block) launched as part of the kernel dispatch. In HIP, this is equivalent
       to the total block size.
-    Total Wavefronts: |-
+    Total Wavefronts: >-
       The total number of wavefronts launched as part of the kernel dispatch.
       On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront
       size is always 64 work-items. Thus, the total number of wavefronts should
       be equivalent to the ceiling of grid size divided by 64.
     Saved Wavefronts: The total number of wavefronts saved at a context-save.
     Restored Wavefronts: The total number of wavefronts restored from a context-save.
-    VGPRs: |-
+    VGPRs: >-
       The number of architected vector general-purpose registers allocated
       for the kernel, see VALU. Note: this may not exactly match the number of VGPRs
       requested by the compiler due to allocation granularity.
-    AGPRs: |-
+    AGPRs: >-
       The number of accumulation vector general-purpose registers allocated
       for the kernel, see AGPRs. Note: this may not exactly match the number of
       AGPRs requested by the compiler due to allocation granularity.
-    SGPRs: |-
+    SGPRs: >-
       The number of scalar general-purpose registers allocated for the kernel,
       see SALU. Note: this may not exactly match the number of SGPRs requested by
       the compiler due to allocation granularity.
-    LDS Allocation: |-
+    LDS Allocation: >-
       The number of bytes of LDS memory (or, shared memory) allocated for
       this kernel. Note: This may also be larger than what was requested at compile
       time due to both allocation granularity and dynamic per-dispatch LDS allocations.
@@ -173,7 +173,7 @@ Panel Config:
       rather than identification of a precise limiter. The sum of this metric, Issue
       Wait Cycles and Active Wait Cycles should be equal to the total Wave Cycles
       metric.
-    Wavefront Occupancy: |-
+    Wavefront Occupancy: >-
       The time-averaged number of wavefronts resident on the accelerator over
       the lifetime of the kernel. Note: this metric may be inaccurate for short-running
       kernels (less than 1ms).
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1000_compute_units_instruction_mix.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1000_compute_units_instruction_mix.yaml
index 551dad2bc2..fc1efd2224 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1000_compute_units_instruction_mix.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1000_compute_units_instruction_mix.yaml
@@ -283,7 +283,7 @@ Panel Config:
       floating-point operands issued to the VALU per normalization unit.
     F64-Trans: The total number of transcendental instructions (such as sqrt) operating
       on 64-bit floating-point operands issued to the VALU per normalization unit.
-    Conversion: |-
+    Conversion: >-
       The total number of type conversion instructions (such as converting
       data to or from F32\u2194F64) issued to the VALU per normalization unit.
     Global/Generic Instr: The total number of global & generic memory instructions
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1100_compute_units_compute_pipeline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1100_compute_units_compute_pipeline.yaml
index dc36164667..3fd0dfd568 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1100_compute_units_compute_pipeline.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1100_compute_units_compute_pipeline.yaml
@@ -267,37 +267,37 @@ Panel Config:
           max: MAX(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / $denom))
           unit: (OPs + $normUnit)
   metrics_description:
-    VALU FLOPs: |-
+    VALU FLOPs: >-
       The total floating-point operations executed per second on the VALU.
       This is also presented as a percent of the peak theoretical FLOPs achievable
       on the specific accelerator. Note: this does not include any floating-point
       operations from MFMA instructions.
-    VALU IOPs: |-
+    VALU IOPs: >-
       The total integer operations executed per second on the VALU. This is
       also presented as a percent of the peak theoretical IOPs achievable on the
       specific accelerator. Note: this does not include any integer operations from
       MFMA instructions.
-    MFMA FLOPs (BF16): |-
+    MFMA FLOPs (BF16): >-
       The total number of 16-bit brain floating point MFMA operations executed
       per second. Note: this does not include any 16-bit brain floating point operations
       from VALU instructions. This is also presented as a percent of the peak theoretical
       BF16 MFMA operations achievable on the specific accelerator.
-    MFMA FLOPs (F16): |-
+    MFMA FLOPs (F16): >-
       The total number of 16-bit floating point MFMA operations executed per
       second. Note: this does not include any 16-bit floating point operations from
       VALU instructions. This is also presented as a percent of the peak theoretical
       F16 MFMA operations achievable on the specific accelerator.
-    MFMA FLOPs (F32): |-
+    MFMA FLOPs (F32): >-
       The total number of 32-bit floating point MFMA operations executed per
       second. Note: this does not include any 32-bit floating point operations from
       VALU instructions. This is also presented as a percent of the peak theoretical
       F32 MFMA operations achievable on the specific accelerator.
-    MFMA FLOPs (F64): |-
+    MFMA FLOPs (F64): >-
       The total number of 64-bit floating point MFMA operations executed per
       second. Note: this does not include any 64-bit floating point operations from
       VALU instructions. This is also presented as a percent of the peak theoretical
       F64 MFMA operations achievable on the specific accelerator.
-    MFMA IOPs (INT8): |-
+    MFMA IOPs (INT8): >-
       The total number of 8-bit integer MFMA operations executed per second.
       Note: this does not include any 8-bit integer operations from VALU instructions.
       This is also presented as a percent of the peak theoretical INT8 MFMA operations
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1200_local_data_share_lds.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1200_local_data_share_lds.yaml
index 7b839fc1f7..f186a85d0c 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1200_local_data_share_lds.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1200_local_data_share_lds.yaml
@@ -180,7 +180,7 @@ Panel Config:
       unit.
     Unaligned Stall: The total number of cycles spent in the LDS scheduler due to
       stalls from non-dword aligned addresses per normalization unit.
-    Mem Violations: |-
+    Mem Violations: >-
       The total number of out-of-bounds accesses made to the LDS, per normalization
       unit. This is unused and expected to be zero in most configurations for
       modern CDNA\u2122 accelerators.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1300_instruction_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1300_instruction_cache.yaml
index 35808d9d96..ffc6e890b9 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1300_instruction_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1300_instruction_cache.yaml
@@ -92,7 +92,7 @@ Panel Config:
     Cache Hit Rate: The percent of L1I requests that hit [#l1i-cache]_ on a previously
       loaded line in the cache. Calculated as the ratio of the number of L1I requests
       that hit over the number of all L1I requests.
-    L1I-L2 Bandwidth Utilization: |-
+    L1I-L2 Bandwidth Utilization: >-
       The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth
       achieved. Calculated as the ratio of the total number of requests from the
       L1I to the L2 cache over the total L1I-L2 interface cycles.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1400_scalar_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1400_scalar_l1_data_cache.yaml
index 6b73164848..be2ce0db79 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1400_scalar_l1_data_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1400_scalar_l1_data_cache.yaml
@@ -154,7 +154,7 @@ Panel Config:
     sL1D-L2 BW Utilization: The percentage of the peak theoretical sL1D - L2 interface
       bandwidth achieved. Calculated as total number of bytes read from, written to,
       or atomically updated across the sL1D - L2 interface.
-    sL1D-L2 BW: |-
+    sL1D-L2 BW: >-
       The total number of bytes read from, written to, or atomically updated
       across the sL1D\u2194L2 interface, divided by total duration. Note that sL1D
       writes and atomics are typically unused on current CDNA accelerators, so
@@ -164,7 +164,7 @@ Panel Config:
       unit.
     Hits: The total number of sL1D requests that hit on a previously loaded cache
       line, per normalization unit.
-    Misses - Non Duplicated: |-
+    Misses - Non Duplicated: >-
       The total number of sL1D requests that missed on a cache line that was
       not already pending due to another request, per normalization unit.
     Misses- Duplicated: The total number of sL1D requests that missed on a cache line
@@ -187,6 +187,6 @@ Panel Config:
       unit.
     Write Req: The total number of write requests from sL1D to the L2, per normalization
       unit. Typically unused on current CDNA accelerators.
-    Stall Cycles: |-
+    Stall Cycles: >-
       The total number of cycles the sL1D\u2194L2 interface was stalled, per
       normalization unit.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1600_vector_l1_data_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1600_vector_l1_data_cache.yaml
index 48408d16d7..0589588fbd 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1600_vector_l1_data_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1600_vector_l1_data_cache.yaml
@@ -501,7 +501,7 @@ Panel Config:
       per normalization unit.
     Translation Misses: The total number of translation requests that missed in the
       UTCL1 due to translation not being present in the cache, per normalization unit.
-    Permission Misses: |-
+    Permission Misses: >-
       The total number of translation requests that missed in the UTCL1 due
       to a permission error, per normalization unit. This is unused and expected
       to be zero in most configurations for modern CDNA\u2122 accelerators.
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1700_l2_cache.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1700_l2_cache.yaml
index 40cbd3856f..d4fde58501 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1700_l2_cache.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/1700_l2_cache.yaml
@@ -706,7 +706,7 @@ Panel Config:
       requests are only considered atomic by Infinity Fabric if they are targeted
       at non-write-cacheable memory, such as fine-grained memory allocations or uncached
       memory allocations on the MI2XX.
-    Read Stall: |-
+    Read Stall: >-
       The ratio of the total number of cycles the L2-Fabric interface was
       stalled on a read request to any destination (local HBM, remote PCIe\xAE
       connected accelerator or CPU, or remote Infinity Fabric connected accelerator
diff --git a/projects/rocprofiler-compute/tools/autogen_hash.yaml b/projects/rocprofiler-compute/tools/autogen_hash.yaml
index 0b3955ff1a..e25b0bb4f9 100644
--- a/projects/rocprofiler-compute/tools/autogen_hash.yaml
+++ b/projects/rocprofiler-compute/tools/autogen_hash.yaml
@@ -1,116 +1,2 @@
-# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py
-src/rocprof_compute_soc/analysis_configs/gfx908/0000_top_stats.yaml: ad7818c680acb0d4e3cb624e0f14f79d44fa7efe14531c5643f47ac96266c91d
-src/rocprof_compute_soc/analysis_configs/gfx90a/0000_top_stats.yaml: ad7818c680acb0d4e3cb624e0f14f79d44fa7efe14531c5643f47ac96266c91d
-src/rocprof_compute_soc/analysis_configs/gfx940/0000_top_stats.yaml: ad7818c680acb0d4e3cb624e0f14f79d44fa7efe14531c5643f47ac96266c91d
-src/rocprof_compute_soc/analysis_configs/gfx941/0000_top_stats.yaml: ad7818c680acb0d4e3cb624e0f14f79d44fa7efe14531c5643f47ac96266c91d
-src/rocprof_compute_soc/analysis_configs/gfx942/0000_top_stats.yaml: ad7818c680acb0d4e3cb624e0f14f79d44fa7efe14531c5643f47ac96266c91d
-src/rocprof_compute_soc/analysis_configs/gfx950/0000_top_stats.yaml: ad7818c680acb0d4e3cb624e0f14f79d44fa7efe14531c5643f47ac96266c91d
-src/rocprof_compute_soc/analysis_configs/gfx908/0100_system_info.yaml: d95eea137c439cc2aa4ac5273f06ac6a05037a74550bc23a095162ee366d39cb
-src/rocprof_compute_soc/analysis_configs/gfx90a/0100_system_info.yaml: d95eea137c439cc2aa4ac5273f06ac6a05037a74550bc23a095162ee366d39cb
-src/rocprof_compute_soc/analysis_configs/gfx940/0100_system_info.yaml: d95eea137c439cc2aa4ac5273f06ac6a05037a74550bc23a095162ee366d39cb
-src/rocprof_compute_soc/analysis_configs/gfx941/0100_system_info.yaml: d95eea137c439cc2aa4ac5273f06ac6a05037a74550bc23a095162ee366d39cb
-src/rocprof_compute_soc/analysis_configs/gfx942/0100_system_info.yaml: d95eea137c439cc2aa4ac5273f06ac6a05037a74550bc23a095162ee366d39cb
-src/rocprof_compute_soc/analysis_configs/gfx950/0100_system_info.yaml: d95eea137c439cc2aa4ac5273f06ac6a05037a74550bc23a095162ee366d39cb
-src/rocprof_compute_soc/analysis_configs/gfx908/0200_system_speed_of_light.yaml: aa60b7a75e46196195675a1c8d6aa65211483ace8dfe346ed0228056586bc8a5
-src/rocprof_compute_soc/analysis_configs/gfx90a/0200_system_speed_of_light.yaml: 54d0ef58f8222463516984d3b9153806f5185de9e719d1903537af4c8344a4f4
-src/rocprof_compute_soc/analysis_configs/gfx940/0200_system_speed_of_light.yaml: a6a5d78d76eb39471249c4c55ccea2e8084a5136c01d29aaeb87d308cce05d2e
-src/rocprof_compute_soc/analysis_configs/gfx941/0200_system_speed_of_light.yaml: 352d4702fbebd8550883b777b875893a8404a7909d83c74cdd50c1b713452c81
-src/rocprof_compute_soc/analysis_configs/gfx942/0200_system_speed_of_light.yaml: a6a5d78d76eb39471249c4c55ccea2e8084a5136c01d29aaeb87d308cce05d2e
-src/rocprof_compute_soc/analysis_configs/gfx950/0200_system_speed_of_light.yaml: 1a164dfbb551e4b0a8a55a843d776738d90406cdbe2930e0f474b77a075a7353
-src/rocprof_compute_soc/analysis_configs/gfx908/0300_memory_chart.yaml: ff5fd164694f454a95ccd52c8c0bfa20aebfa476908cab2ac03215fb33e48598
-src/rocprof_compute_soc/analysis_configs/gfx90a/0300_memory_chart.yaml: 332c1965f462e75a479ddf3270294e1cf723701eb08b60c6cea550eb3bc192e7
-src/rocprof_compute_soc/analysis_configs/gfx940/0300_memory_chart.yaml: 92dc15222a707fff79ce2084172ae2068465bfe064b89538ca7e83359422dfc8
-src/rocprof_compute_soc/analysis_configs/gfx941/0300_memory_chart.yaml: 92dc15222a707fff79ce2084172ae2068465bfe064b89538ca7e83359422dfc8
-src/rocprof_compute_soc/analysis_configs/gfx942/0300_memory_chart.yaml: 92dc15222a707fff79ce2084172ae2068465bfe064b89538ca7e83359422dfc8
-src/rocprof_compute_soc/analysis_configs/gfx950/0300_memory_chart.yaml: d3a2e085061068ff8cff0b80f6944dc866ec3e748cf1e4c0cfcd76e1e14d21f8
-src/rocprof_compute_soc/analysis_configs/gfx908/0400_roofline.yaml: e91988af6d99a03e2a19593155447f79abe64dc128a83a170a5037ab466b238c
-src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline.yaml: 0807c87d20faed19f2ef9470e9277715f2287e687aa831a328dcab4915a38812
-src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline.yaml: f5f35d1ae9a35fe83bcdf572aa788401c14cc6718761c4cf8e4dddcf249c3548
-src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline.yaml: 760ecef9947fa31d3a0fb5c45d653060d06213d8d9f216c19cbb1b1ce29942b6
-src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline.yaml: e037ce1a2cf8ba08e2317e322b56954caace6ec2427a966acbabf2135cd89855
-src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml: f53b2a92b3ac051290eff9b1f63343c30e6cd223b9cbf9d30a93ef4a5ff158b3
-src/rocprof_compute_soc/analysis_configs/gfx908/0500_command_processor_cpc_cpf.yaml: 649bec27b9ccee34c96520c1f6bc0977779a8c4f8a58ee21ff59d61207962966
-src/rocprof_compute_soc/analysis_configs/gfx90a/0500_command_processor_cpc_cpf.yaml: 649bec27b9ccee34c96520c1f6bc0977779a8c4f8a58ee21ff59d61207962966
-src/rocprof_compute_soc/analysis_configs/gfx940/0500_command_processor_cpc_cpf.yaml: 649bec27b9ccee34c96520c1f6bc0977779a8c4f8a58ee21ff59d61207962966
-src/rocprof_compute_soc/analysis_configs/gfx941/0500_command_processor_cpc_cpf.yaml: 649bec27b9ccee34c96520c1f6bc0977779a8c4f8a58ee21ff59d61207962966
-src/rocprof_compute_soc/analysis_configs/gfx942/0500_command_processor_cpc_cpf.yaml: 649bec27b9ccee34c96520c1f6bc0977779a8c4f8a58ee21ff59d61207962966
-src/rocprof_compute_soc/analysis_configs/gfx950/0500_command_processor_cpc_cpf.yaml: 1e4c1bc1158398df8966d24e56b7d434458ce10ade9e13f168887d9a0d9abaef
-src/rocprof_compute_soc/analysis_configs/gfx908/0600_workgroup_manager_spi.yaml: 2bcb7045609e8ff023c9bfa384e63f6a2cc926ff3261f3eab6737f89a899809e
-src/rocprof_compute_soc/analysis_configs/gfx90a/0600_workgroup_manager_spi.yaml: 2bcb7045609e8ff023c9bfa384e63f6a2cc926ff3261f3eab6737f89a899809e
-src/rocprof_compute_soc/analysis_configs/gfx940/0600_workgroup_manager_spi.yaml: 2bcb7045609e8ff023c9bfa384e63f6a2cc926ff3261f3eab6737f89a899809e
-src/rocprof_compute_soc/analysis_configs/gfx941/0600_workgroup_manager_spi.yaml: 2bcb7045609e8ff023c9bfa384e63f6a2cc926ff3261f3eab6737f89a899809e
-src/rocprof_compute_soc/analysis_configs/gfx942/0600_workgroup_manager_spi.yaml: 2bcb7045609e8ff023c9bfa384e63f6a2cc926ff3261f3eab6737f89a899809e
-src/rocprof_compute_soc/analysis_configs/gfx950/0600_workgroup_manager_spi.yaml: 6d97f3ebf3bef1d164255d4c4979e43d7f313f1eda067324aad9be06be98f090
-src/rocprof_compute_soc/analysis_configs/gfx908/0700_wavefront.yaml: da9fb740f9dfafa43c8d0401d22082915d1c04021e07fb8003ac1f31005e282b
-src/rocprof_compute_soc/analysis_configs/gfx90a/0700_wavefront.yaml: da9fb740f9dfafa43c8d0401d22082915d1c04021e07fb8003ac1f31005e282b
-src/rocprof_compute_soc/analysis_configs/gfx940/0700_wavefront.yaml: da9fb740f9dfafa43c8d0401d22082915d1c04021e07fb8003ac1f31005e282b
-src/rocprof_compute_soc/analysis_configs/gfx941/0700_wavefront.yaml: da9fb740f9dfafa43c8d0401d22082915d1c04021e07fb8003ac1f31005e282b
-src/rocprof_compute_soc/analysis_configs/gfx942/0700_wavefront.yaml: da9fb740f9dfafa43c8d0401d22082915d1c04021e07fb8003ac1f31005e282b
-src/rocprof_compute_soc/analysis_configs/gfx950/0700_wavefront.yaml: a6012921ec2e5984861d34ebfca416703b00f3b2cd4cb07541378a285a58b778
-src/rocprof_compute_soc/analysis_configs/gfx908/1000_compute_units_instruction_mix.yaml: 82ef2f27395f2887d1385a33b1d4bcb7cb646ece11146fe1238af2a2fc49108f
-src/rocprof_compute_soc/analysis_configs/gfx90a/1000_compute_units_instruction_mix.yaml: e58c1dff540e06ec3021ae4e852cec5a116e978f00f3e0902b74b5d86f1b88ac
-src/rocprof_compute_soc/analysis_configs/gfx940/1000_compute_units_instruction_mix.yaml: c74ada0b2cd9eda1e1115679267343e7afad9c9638b3a54b3f98193ae9637e09
-src/rocprof_compute_soc/analysis_configs/gfx941/1000_compute_units_instruction_mix.yaml: c74ada0b2cd9eda1e1115679267343e7afad9c9638b3a54b3f98193ae9637e09
-src/rocprof_compute_soc/analysis_configs/gfx942/1000_compute_units_instruction_mix.yaml: c74ada0b2cd9eda1e1115679267343e7afad9c9638b3a54b3f98193ae9637e09
-src/rocprof_compute_soc/analysis_configs/gfx950/1000_compute_units_instruction_mix.yaml: a0fe88305b0972c0702e542558c0d491eac26438577660e58817e988b7b1f0d4
-src/rocprof_compute_soc/analysis_configs/gfx908/1100_compute_units_compute_pipeline.yaml: e815205890d9c815f7f53cdaa64eeef6219bce83054b92fa2be25e240093bdb0
-src/rocprof_compute_soc/analysis_configs/gfx90a/1100_compute_units_compute_pipeline.yaml: b44f500ee07856ec8c59afa1ebb0a204d8b5f3247a43725ba16782484fef6ad1
-src/rocprof_compute_soc/analysis_configs/gfx940/1100_compute_units_compute_pipeline.yaml: e493741974eae65d88afd4fa98b6b3089fb483900b17af2630be18160964d80c
-src/rocprof_compute_soc/analysis_configs/gfx941/1100_compute_units_compute_pipeline.yaml: e493741974eae65d88afd4fa98b6b3089fb483900b17af2630be18160964d80c
-src/rocprof_compute_soc/analysis_configs/gfx942/1100_compute_units_compute_pipeline.yaml: e493741974eae65d88afd4fa98b6b3089fb483900b17af2630be18160964d80c
-src/rocprof_compute_soc/analysis_configs/gfx950/1100_compute_units_compute_pipeline.yaml: 4797cd3052fdb37278aa9a28572287c1a9a7228f05a77ce22c0eb4786cbbd404
-src/rocprof_compute_soc/analysis_configs/gfx908/1200_local_data_share_lds.yaml: 307733f9fee02c620558e2ee4ca3978954f62c3fab26cc98766511b93e96d54c
-src/rocprof_compute_soc/analysis_configs/gfx90a/1200_local_data_share_lds.yaml: 307733f9fee02c620558e2ee4ca3978954f62c3fab26cc98766511b93e96d54c
-src/rocprof_compute_soc/analysis_configs/gfx940/1200_local_data_share_lds.yaml: 307733f9fee02c620558e2ee4ca3978954f62c3fab26cc98766511b93e96d54c
-src/rocprof_compute_soc/analysis_configs/gfx941/1200_local_data_share_lds.yaml: 307733f9fee02c620558e2ee4ca3978954f62c3fab26cc98766511b93e96d54c
-src/rocprof_compute_soc/analysis_configs/gfx942/1200_local_data_share_lds.yaml: 307733f9fee02c620558e2ee4ca3978954f62c3fab26cc98766511b93e96d54c
-src/rocprof_compute_soc/analysis_configs/gfx950/1200_local_data_share_lds.yaml: 35c98741e9b5afd2f7638d2675b22138f5854168e15bc4633112857ed94edbc1
-src/rocprof_compute_soc/analysis_configs/gfx908/1300_instruction_cache.yaml: d2b0a455e9f28d66e6cef701d598072285c58eeebea2d08e1864a8602cdd797a
-src/rocprof_compute_soc/analysis_configs/gfx90a/1300_instruction_cache.yaml: d2b0a455e9f28d66e6cef701d598072285c58eeebea2d08e1864a8602cdd797a
-src/rocprof_compute_soc/analysis_configs/gfx940/1300_instruction_cache.yaml: d2b0a455e9f28d66e6cef701d598072285c58eeebea2d08e1864a8602cdd797a
-src/rocprof_compute_soc/analysis_configs/gfx941/1300_instruction_cache.yaml: d2b0a455e9f28d66e6cef701d598072285c58eeebea2d08e1864a8602cdd797a
-src/rocprof_compute_soc/analysis_configs/gfx942/1300_instruction_cache.yaml: d2b0a455e9f28d66e6cef701d598072285c58eeebea2d08e1864a8602cdd797a
-src/rocprof_compute_soc/analysis_configs/gfx950/1300_instruction_cache.yaml: d2b0a455e9f28d66e6cef701d598072285c58eeebea2d08e1864a8602cdd797a
-src/rocprof_compute_soc/analysis_configs/gfx908/1400_scalar_l1_data_cache.yaml: ccbe9a1309177db8760727f256cd14a7612708833828068dd2bede73ad319d75
-src/rocprof_compute_soc/analysis_configs/gfx90a/1400_scalar_l1_data_cache.yaml: ccbe9a1309177db8760727f256cd14a7612708833828068dd2bede73ad319d75
-src/rocprof_compute_soc/analysis_configs/gfx940/1400_scalar_l1_data_cache.yaml: ccbe9a1309177db8760727f256cd14a7612708833828068dd2bede73ad319d75
-src/rocprof_compute_soc/analysis_configs/gfx941/1400_scalar_l1_data_cache.yaml: ccbe9a1309177db8760727f256cd14a7612708833828068dd2bede73ad319d75
-src/rocprof_compute_soc/analysis_configs/gfx942/1400_scalar_l1_data_cache.yaml: ccbe9a1309177db8760727f256cd14a7612708833828068dd2bede73ad319d75
-src/rocprof_compute_soc/analysis_configs/gfx950/1400_scalar_l1_data_cache.yaml: ccbe9a1309177db8760727f256cd14a7612708833828068dd2bede73ad319d75
-src/rocprof_compute_soc/analysis_configs/gfx908/1500_address_processing_unit_and_data_return_path_ta_td.yaml: b98a800c31da0275704e076e561468dccdaf0b8bff1cc8d74a4e6bf9c7be2973
-src/rocprof_compute_soc/analysis_configs/gfx90a/1500_address_processing_unit_and_data_return_path_ta_td.yaml: 58834a04fc4fb6f9eb648a6b8944f737ce4a8c9d4a6c5f75104d9fd528f520a6
-src/rocprof_compute_soc/analysis_configs/gfx940/1500_address_processing_unit_and_data_return_path_ta_td.yaml: 7f37bcd01557a45aa5ed9009962a9f2499ad924a6a07d7d25a3af97138f360f8
-src/rocprof_compute_soc/analysis_configs/gfx941/1500_address_processing_unit_and_data_return_path_ta_td.yaml: 7f37bcd01557a45aa5ed9009962a9f2499ad924a6a07d7d25a3af97138f360f8
-src/rocprof_compute_soc/analysis_configs/gfx942/1500_address_processing_unit_and_data_return_path_ta_td.yaml: 7f37bcd01557a45aa5ed9009962a9f2499ad924a6a07d7d25a3af97138f360f8
-src/rocprof_compute_soc/analysis_configs/gfx950/1500_address_processing_unit_and_data_return_path_ta_td.yaml: 5c6555a93b01c057f01e0b0cef3169eeb324ca8c256c42f5f9fc0d1ea131486b
-src/rocprof_compute_soc/analysis_configs/gfx908/1600_vector_l1_data_cache.yaml: 4fcb618450366a29c09e428368e1a9afd29a0b80ec3f03a5b3d55a2111bd5704
-src/rocprof_compute_soc/analysis_configs/gfx90a/1600_vector_l1_data_cache.yaml: 4fcb618450366a29c09e428368e1a9afd29a0b80ec3f03a5b3d55a2111bd5704
-src/rocprof_compute_soc/analysis_configs/gfx940/1600_vector_l1_data_cache.yaml: 1c25e20d701aff1ab9276a29cfd5f219b24c621b534aa5b86d1b78d2ae2f300a
-src/rocprof_compute_soc/analysis_configs/gfx941/1600_vector_l1_data_cache.yaml: 1c25e20d701aff1ab9276a29cfd5f219b24c621b534aa5b86d1b78d2ae2f300a
-src/rocprof_compute_soc/analysis_configs/gfx942/1600_vector_l1_data_cache.yaml: 1c25e20d701aff1ab9276a29cfd5f219b24c621b534aa5b86d1b78d2ae2f300a
-src/rocprof_compute_soc/analysis_configs/gfx950/1600_vector_l1_data_cache.yaml: 3cec51c5a848c4f513c4c0a74aa35a5657289148a67179f8db4ea3e55bdb6ac3
-src/rocprof_compute_soc/analysis_configs/gfx908/1700_l2_cache.yaml: e37693ef03caf3d77ae7b91c3c166d033fa0732880cc50a21b8c06a4e79b1f38
-src/rocprof_compute_soc/analysis_configs/gfx90a/1700_l2_cache.yaml: 3314a1e473b1cfc95b742b1a8cfbc47d4602061ca89d7a4ac89ea7cc15908962
-src/rocprof_compute_soc/analysis_configs/gfx940/1700_l2_cache.yaml: cb8922a41dd2088e8e2b0c1e82c7b95fa55304cf90435b217da128234805d77a
-src/rocprof_compute_soc/analysis_configs/gfx941/1700_l2_cache.yaml: 2187f141480a2c57b271ded46255735510de5197441de830cf1efa9345e5566a
-src/rocprof_compute_soc/analysis_configs/gfx942/1700_l2_cache.yaml: 7ce34989a66b8f8750cf1bf76f5cdaf59bf662a7205355f6fe12cace796d4ceb
-src/rocprof_compute_soc/analysis_configs/gfx950/1700_l2_cache.yaml: a3a8db0f555cd1069a61dfc3b89df83e9423d4a0200f1401c7612942ff75152e
-src/rocprof_compute_soc/analysis_configs/gfx908/1800_l2_cache_per_channel.yaml: fd32454bf9f0d3027c77a85ea6be308e92f6815d0ea732c6bafacc8e0f32a25f
-src/rocprof_compute_soc/analysis_configs/gfx90a/1800_l2_cache_per_channel.yaml: 23e9a258ab541d24d29cde2237f9445db695e7a4d17d5974cb4fd5ff9a9869c0
-src/rocprof_compute_soc/analysis_configs/gfx940/1800_l2_cache_per_channel.yaml: eb0823823506bfe0d40931fd69c435baab4979d2dfee158dc33c3651721f9f33
-src/rocprof_compute_soc/analysis_configs/gfx941/1800_l2_cache_per_channel.yaml: eb0823823506bfe0d40931fd69c435baab4979d2dfee158dc33c3651721f9f33
-src/rocprof_compute_soc/analysis_configs/gfx942/1800_l2_cache_per_channel.yaml: eb0823823506bfe0d40931fd69c435baab4979d2dfee158dc33c3651721f9f33
-src/rocprof_compute_soc/analysis_configs/gfx950/1800_l2_cache_per_channel.yaml: b6336ab78a97fb9750e2f925893a5acc4e66e43ac60472c20225e56c440983d7
-src/rocprof_compute_soc/analysis_configs/gfx908/2100_pc_sampling.yaml: efa16d3aadc3363bcc067895a978b82c1e06fa90882dfe33e03315e2c0425d36
-src/rocprof_compute_soc/analysis_configs/gfx90a/2100_pc_sampling.yaml: efa16d3aadc3363bcc067895a978b82c1e06fa90882dfe33e03315e2c0425d36
-src/rocprof_compute_soc/analysis_configs/gfx940/2100_pc_sampling.yaml: efa16d3aadc3363bcc067895a978b82c1e06fa90882dfe33e03315e2c0425d36
-src/rocprof_compute_soc/analysis_configs/gfx941/2100_pc_sampling.yaml: efa16d3aadc3363bcc067895a978b82c1e06fa90882dfe33e03315e2c0425d36
-src/rocprof_compute_soc/analysis_configs/gfx942/2100_pc_sampling.yaml: efa16d3aadc3363bcc067895a978b82c1e06fa90882dfe33e03315e2c0425d36
-src/rocprof_compute_soc/analysis_configs/gfx950/2100_pc_sampling.yaml: efa16d3aadc3363bcc067895a978b82c1e06fa90882dfe33e03315e2c0425d36
-src/rocprof_compute_soc/profile_configs/sets/gfx908_sets.yaml: ee28989e70d0537db8b0f0a4bc5499444b44ff0e73d3e7f2926943be11d0aeda
-src/rocprof_compute_soc/profile_configs/sets/gfx90a_sets.yaml: 9c9533174a3f7bd5c8e09ec998743c7bb2642c4ce3f818b546673be9cafc40a8
-src/rocprof_compute_soc/profile_configs/sets/gfx940_sets.yaml: 44cd2b32b050cafa73d0ead5703b82836edf25a057c21699046b6b8b8918b242
-src/rocprof_compute_soc/profile_configs/sets/gfx941_sets.yaml: 44cd2b32b050cafa73d0ead5703b82836edf25a057c21699046b6b8b8918b242
-src/rocprof_compute_soc/profile_configs/sets/gfx942_sets.yaml: 44cd2b32b050cafa73d0ead5703b82836edf25a057c21699046b6b8b8918b242
-src/rocprof_compute_soc/profile_configs/sets/gfx950_sets.yaml: 238d9dc8a98cfead3fc904885bfe413e5bcb4f1af31e9820cd640388bcd1e1c2
-docs/data/metrics_description.yaml: 12164b43dab4a1088f90763a80ffc8feb38aa82fd7b767edf8f65bd304f22162
+# AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from tools/unified_config.yaml. Generated by tools/split_config.py
+{}
diff --git a/projects/rocprofiler-compute/tools/config_management/.config_hashes.json b/projects/rocprofiler-compute/tools/config_management/.config_hashes.json
index 5989df1edf..64e2376348 100644
--- a/projects/rocprofiler-compute/tools/config_management/.config_hashes.json
+++ b/projects/rocprofiler-compute/tools/config_management/.config_hashes.json
@@ -5,19 +5,19 @@
       "files": {
         "0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f",
         "0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8",
-        "0200_system_speed_of_light.yaml": "c54676a8a385c02be50fcf09a721bef6",
-        "0300_memory_chart.yaml": "f952fe7de6d86cb22f6f8ce34867905f",
-        "0400_roofline.yaml": "02ca6cf3583f2718ab371bbbfdd8cfef",
-        "0500_command_processor_cpc_cpf.yaml": "93174ba73bf014c143e179719c110db6",
-        "0600_workgroup_manager_spi.yaml": "7364c8431929891d587e6f9b96ddce10",
-        "0700_wavefront.yaml": "5cc88d7743cba8c638491d97725f6778",
+        "0200_system_speed_of_light.yaml": "c4878ac57b7b7b4b5711672cb2f6dffc",
+        "0300_memory_chart.yaml": "221c6d2bb50a4f4177585b9988f88c7b",
+        "0400_roofline.yaml": "bad8d851694ff9a140e29a148a35fa50",
+        "0500_command_processor_cpc_cpf.yaml": "d8f424ec3fcfa4b2fcee2ad5e6456531",
+        "0600_workgroup_manager_spi.yaml": "8b6a89de516bed5821a9849627ad634a",
+        "0700_wavefront.yaml": "84ecb62efa87d87288bb63f7e3871028",
         "1000_compute_units_instruction_mix.yaml": "e96eccdcb0e5d28b292107c0f68ec845",
         "1100_compute_units_compute_pipeline.yaml": "8f61973d0d08bf49895b5dfe32d05c09",
-        "1200_local_data_share_lds.yaml": "97be647681c51e762e774eb91e8283fc",
-        "1300_instruction_cache.yaml": "dcab979ce17e30f2d48fd2734bf08e74",
-        "1400_scalar_l1_data_cache.yaml": "e90a514e7bb597ec1d22e238650c81d9",
+        "1200_local_data_share_lds.yaml": "14da923095596b4f0eeb82fd24bfd67c",
+        "1300_instruction_cache.yaml": "4b7696d75c93e55f7877e07770beda2d",
+        "1400_scalar_l1_data_cache.yaml": "ea6d0cdb6c34f574248f09554e976599",
         "1500_address_processing_unit_and_data_return_path_ta_td.yaml": "645eb10a440eed62c6250a0f5a2407f3",
-        "1600_vector_l1_data_cache.yaml": "e3b8d1787003094ab7b8372da818ff1e",
+        "1600_vector_l1_data_cache.yaml": "1daa7d96605e8cdf4116bf3b10fb9969",
         "1700_l2_cache.yaml": "38e7db4c404007c471864251dff30570",
         "1800_l2_cache_per_channel.yaml": "7193043cd8eee47501cd8c0ae02b51e9",
         "2100_pc_sampling.yaml": "8049866f25214544f1e53a9e2f08399b"
@@ -28,19 +28,19 @@
       "files": {
         "0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f",
         "0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8",
-        "0200_system_speed_of_light.yaml": "747b14ab50dd4d7689af7c268569b32a",
-        "0300_memory_chart.yaml": "0d6d094ad24cebf6e583e643beaae06e",
-        "0400_roofline.yaml": "632b16e1d251e57de0cf7237d3a89766",
-        "0500_command_processor_cpc_cpf.yaml": "93174ba73bf014c143e179719c110db6",
-        "0600_workgroup_manager_spi.yaml": "7364c8431929891d587e6f9b96ddce10",
-        "0700_wavefront.yaml": "5cc88d7743cba8c638491d97725f6778",
-        "1000_compute_units_instruction_mix.yaml": "af6304cce1fe38c119b1d17fa635265c",
-        "1100_compute_units_compute_pipeline.yaml": "c38ece6032d757f394c83ad9f93e0dce",
-        "1200_local_data_share_lds.yaml": "97be647681c51e762e774eb91e8283fc",
-        "1300_instruction_cache.yaml": "dcab979ce17e30f2d48fd2734bf08e74",
-        "1400_scalar_l1_data_cache.yaml": "e90a514e7bb597ec1d22e238650c81d9",
+        "0200_system_speed_of_light.yaml": "dc6a6e1a8513e2d32aecc055a958c639",
+        "0300_memory_chart.yaml": "a61f219fe063c4c4b0b9cbaf96389a8b",
+        "0400_roofline.yaml": "da1d514ed19ca2466c167e983bdb4f13",
+        "0500_command_processor_cpc_cpf.yaml": "d8f424ec3fcfa4b2fcee2ad5e6456531",
+        "0600_workgroup_manager_spi.yaml": "8b6a89de516bed5821a9849627ad634a",
+        "0700_wavefront.yaml": "84ecb62efa87d87288bb63f7e3871028",
+        "1000_compute_units_instruction_mix.yaml": "84bd6a22a29335a4851bba675614e103",
+        "1100_compute_units_compute_pipeline.yaml": "39429cd6af68f91f1b20630c1bab8cc7",
+        "1200_local_data_share_lds.yaml": "14da923095596b4f0eeb82fd24bfd67c",
+        "1300_instruction_cache.yaml": "4b7696d75c93e55f7877e07770beda2d",
+        "1400_scalar_l1_data_cache.yaml": "ea6d0cdb6c34f574248f09554e976599",
         "1500_address_processing_unit_and_data_return_path_ta_td.yaml": "8005b28532601a759ace2f653d10da56",
-        "1600_vector_l1_data_cache.yaml": "e3b8d1787003094ab7b8372da818ff1e",
+        "1600_vector_l1_data_cache.yaml": "1daa7d96605e8cdf4116bf3b10fb9969",
         "1700_l2_cache.yaml": "1630ae8fc504ea056e91bb19909d5629",
         "1800_l2_cache_per_channel.yaml": "5ee4fd9c849670c301c4afee257acddd",
         "2100_pc_sampling.yaml": "8049866f25214544f1e53a9e2f08399b"
@@ -51,19 +51,19 @@
       "files": {
         "0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f",
         "0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8",
-        "0200_system_speed_of_light.yaml": "74482aebb54b6d7b429c9ca605cb9951",
-        "0300_memory_chart.yaml": "e28ebc1340d2db1948c68225a6e008ff",
-        "0400_roofline.yaml": "1f3888778245e7eb05e769bda605588a",
-        "0500_command_processor_cpc_cpf.yaml": "93174ba73bf014c143e179719c110db6",
-        "0600_workgroup_manager_spi.yaml": "7364c8431929891d587e6f9b96ddce10",
-        "0700_wavefront.yaml": "5cc88d7743cba8c638491d97725f6778",
-        "1000_compute_units_instruction_mix.yaml": "ac290954de96988004b2a4be345a3a25",
-        "1100_compute_units_compute_pipeline.yaml": "470e3093ce9d53211923d3400e7e7bd7",
-        "1200_local_data_share_lds.yaml": "97be647681c51e762e774eb91e8283fc",
-        "1300_instruction_cache.yaml": "dcab979ce17e30f2d48fd2734bf08e74",
-        "1400_scalar_l1_data_cache.yaml": "e90a514e7bb597ec1d22e238650c81d9",
+        "0200_system_speed_of_light.yaml": "8b413c47f06f2e94b3faa723daac8edd",
+        "0300_memory_chart.yaml": "3d6c88ab2704dc4bd72a63e99fd68cf7",
+        "0400_roofline.yaml": "d4650e008f2e3a7d28871e8518153575",
+        "0500_command_processor_cpc_cpf.yaml": "d8f424ec3fcfa4b2fcee2ad5e6456531",
+        "0600_workgroup_manager_spi.yaml": "8b6a89de516bed5821a9849627ad634a",
+        "0700_wavefront.yaml": "84ecb62efa87d87288bb63f7e3871028",
+        "1000_compute_units_instruction_mix.yaml": "89c03d53fd7563da965ff3e4a1698b02",
+        "1100_compute_units_compute_pipeline.yaml": "0c3c36f6c2fed1f14476966295338e74",
+        "1200_local_data_share_lds.yaml": "14da923095596b4f0eeb82fd24bfd67c",
+        "1300_instruction_cache.yaml": "4b7696d75c93e55f7877e07770beda2d",
+        "1400_scalar_l1_data_cache.yaml": "ea6d0cdb6c34f574248f09554e976599",
         "1500_address_processing_unit_and_data_return_path_ta_td.yaml": "12fe315acb3e06d4c16e4538f418f0ca",
-        "1600_vector_l1_data_cache.yaml": "006854a23925320b94727261f30680b7",
+        "1600_vector_l1_data_cache.yaml": "ebff7d80c601d03027476ae9fb16ecae",
         "1700_l2_cache.yaml": "0987e21ac2547134fea87499dee01847",
         "1800_l2_cache_per_channel.yaml": "ba5eeabcd749ecbb107c42de5ce69317",
         "2100_pc_sampling.yaml": "8049866f25214544f1e53a9e2f08399b"
@@ -74,19 +74,19 @@
       "files": {
         "0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f",
         "0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8",
-        "0200_system_speed_of_light.yaml": "7ed2ceba47e232b4e39431228a254f7f",
-        "0300_memory_chart.yaml": "e28ebc1340d2db1948c68225a6e008ff",
-        "0400_roofline.yaml": "a80de496435c2c76eb4cfdc38d62155f",
-        "0500_command_processor_cpc_cpf.yaml": "93174ba73bf014c143e179719c110db6",
-        "0600_workgroup_manager_spi.yaml": "7364c8431929891d587e6f9b96ddce10",
-        "0700_wavefront.yaml": "5cc88d7743cba8c638491d97725f6778",
-        "1000_compute_units_instruction_mix.yaml": "ac290954de96988004b2a4be345a3a25",
-        "1100_compute_units_compute_pipeline.yaml": "470e3093ce9d53211923d3400e7e7bd7",
-        "1200_local_data_share_lds.yaml": "97be647681c51e762e774eb91e8283fc",
-        "1300_instruction_cache.yaml": "dcab979ce17e30f2d48fd2734bf08e74",
-        "1400_scalar_l1_data_cache.yaml": "e90a514e7bb597ec1d22e238650c81d9",
+        "0200_system_speed_of_light.yaml": "0ddeaefd245291c7f88674431efd74f6",
+        "0300_memory_chart.yaml": "3d6c88ab2704dc4bd72a63e99fd68cf7",
+        "0400_roofline.yaml": "c066a19bc0e00e692c34998e44c62387",
+        "0500_command_processor_cpc_cpf.yaml": "d8f424ec3fcfa4b2fcee2ad5e6456531",
+        "0600_workgroup_manager_spi.yaml": "8b6a89de516bed5821a9849627ad634a",
+        "0700_wavefront.yaml": "84ecb62efa87d87288bb63f7e3871028",
+        "1000_compute_units_instruction_mix.yaml": "89c03d53fd7563da965ff3e4a1698b02",
+        "1100_compute_units_compute_pipeline.yaml": "0c3c36f6c2fed1f14476966295338e74",
+        "1200_local_data_share_lds.yaml": "14da923095596b4f0eeb82fd24bfd67c",
+        "1300_instruction_cache.yaml": "4b7696d75c93e55f7877e07770beda2d",
+        "1400_scalar_l1_data_cache.yaml": "ea6d0cdb6c34f574248f09554e976599",
         "1500_address_processing_unit_and_data_return_path_ta_td.yaml": "12fe315acb3e06d4c16e4538f418f0ca",
-        "1600_vector_l1_data_cache.yaml": "006854a23925320b94727261f30680b7",
+        "1600_vector_l1_data_cache.yaml": "ebff7d80c601d03027476ae9fb16ecae",
         "1700_l2_cache.yaml": "05a86637744ad66f6491620c4ad659d2",
         "1800_l2_cache_per_channel.yaml": "ba5eeabcd749ecbb107c42de5ce69317",
         "2100_pc_sampling.yaml": "8049866f25214544f1e53a9e2f08399b"
@@ -97,19 +97,19 @@
       "files": {
         "0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f",
         "0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8",
-        "0200_system_speed_of_light.yaml": "74482aebb54b6d7b429c9ca605cb9951",
-        "0300_memory_chart.yaml": "e28ebc1340d2db1948c68225a6e008ff",
-        "0400_roofline.yaml": "f94c87dad18f87e5582566276a5c0cfc",
-        "0500_command_processor_cpc_cpf.yaml": "93174ba73bf014c143e179719c110db6",
-        "0600_workgroup_manager_spi.yaml": "7364c8431929891d587e6f9b96ddce10",
-        "0700_wavefront.yaml": "5cc88d7743cba8c638491d97725f6778",
-        "1000_compute_units_instruction_mix.yaml": "ac290954de96988004b2a4be345a3a25",
-        "1100_compute_units_compute_pipeline.yaml": "470e3093ce9d53211923d3400e7e7bd7",
-        "1200_local_data_share_lds.yaml": "97be647681c51e762e774eb91e8283fc",
-        "1300_instruction_cache.yaml": "dcab979ce17e30f2d48fd2734bf08e74",
-        "1400_scalar_l1_data_cache.yaml": "e90a514e7bb597ec1d22e238650c81d9",
+        "0200_system_speed_of_light.yaml": "8b413c47f06f2e94b3faa723daac8edd",
+        "0300_memory_chart.yaml": "3d6c88ab2704dc4bd72a63e99fd68cf7",
+        "0400_roofline.yaml": "318c3e774d41a639628a7f72c2462375",
+        "0500_command_processor_cpc_cpf.yaml": "d8f424ec3fcfa4b2fcee2ad5e6456531",
+        "0600_workgroup_manager_spi.yaml": "8b6a89de516bed5821a9849627ad634a",
+        "0700_wavefront.yaml": "84ecb62efa87d87288bb63f7e3871028",
+        "1000_compute_units_instruction_mix.yaml": "89c03d53fd7563da965ff3e4a1698b02",
+        "1100_compute_units_compute_pipeline.yaml": "0c3c36f6c2fed1f14476966295338e74",
+        "1200_local_data_share_lds.yaml": "14da923095596b4f0eeb82fd24bfd67c",
+        "1300_instruction_cache.yaml": "4b7696d75c93e55f7877e07770beda2d",
+        "1400_scalar_l1_data_cache.yaml": "ea6d0cdb6c34f574248f09554e976599",
         "1500_address_processing_unit_and_data_return_path_ta_td.yaml": "12fe315acb3e06d4c16e4538f418f0ca",
-        "1600_vector_l1_data_cache.yaml": "006854a23925320b94727261f30680b7",
+        "1600_vector_l1_data_cache.yaml": "ebff7d80c601d03027476ae9fb16ecae",
         "1700_l2_cache.yaml": "96e49399b26d00d88ad534a35c95304b",
         "1800_l2_cache_per_channel.yaml": "ba5eeabcd749ecbb107c42de5ce69317",
         "2100_pc_sampling.yaml": "8049866f25214544f1e53a9e2f08399b"
@@ -120,20 +120,20 @@
       "files": {
         "0000_top_stats.yaml": "2819d96f5b1c3704f2ac50868a246a7f",
         "0100_system_info.yaml": "cefae2b10db8cf4b0d3a971cff5e82c8",
-        "0200_system_speed_of_light.yaml": "4a215bccc9378583a6e7e7733b601537",
-        "0300_memory_chart.yaml": "f19548711a687779df0c0b87a1df7a27",
-        "0400_roofline.yaml": "156c1a1d7a6c1e55aea25552334a84d5",
-        "0500_command_processor_cpc_cpf.yaml": "5b67ff80efbc2e1dffb7e3922499ca88",
-        "0600_workgroup_manager_spi.yaml": "63a7b6f7a4487fb87d67549214e08aac",
-        "0700_wavefront.yaml": "1ecfc3a91ec0cce6ed9eb94afae17aa9",
-        "1000_compute_units_instruction_mix.yaml": "7088fafcaa66a8ec48a9d3939cd7339a",
-        "1100_compute_units_compute_pipeline.yaml": "fce707e3f419ee2708676c8f7c325df5",
-        "1200_local_data_share_lds.yaml": "06bee89ddab210dbd122eaaedef0b29a",
-        "1300_instruction_cache.yaml": "dcab979ce17e30f2d48fd2734bf08e74",
-        "1400_scalar_l1_data_cache.yaml": "e90a514e7bb597ec1d22e238650c81d9",
+        "0200_system_speed_of_light.yaml": "a5ee49ce96bfab87128c856c827db870",
+        "0300_memory_chart.yaml": "e2401641a8f280fda308f87e5ad243df",
+        "0400_roofline.yaml": "2bd3b630b72d6d165c0d30cf481136a9",
+        "0500_command_processor_cpc_cpf.yaml": "3f7dab1663ad7a6fae3801aec2b1e8d0",
+        "0600_workgroup_manager_spi.yaml": "e6546a92d283fed5a5dc6df203efb670",
+        "0700_wavefront.yaml": "330468fd711057b422de9b952c5cfe69",
+        "1000_compute_units_instruction_mix.yaml": "c8bbdde1f29c9548a8e0ed7fcdd9ae04",
+        "1100_compute_units_compute_pipeline.yaml": "30e64960bbac4cc5626615a60240bd5f",
+        "1200_local_data_share_lds.yaml": "0e57c559dbcd5526e2e8006a47a69f4b",
+        "1300_instruction_cache.yaml": "4b7696d75c93e55f7877e07770beda2d",
+        "1400_scalar_l1_data_cache.yaml": "ea6d0cdb6c34f574248f09554e976599",
         "1500_address_processing_unit_and_data_return_path_ta_td.yaml": "355a0c6b9b113fcfb686a300b78be21a",
-        "1600_vector_l1_data_cache.yaml": "68382e45c7a3c578df861d6285024803",
-        "1700_l2_cache.yaml": "f70f23b93e97b99327b5db3907eb133e",
+        "1600_vector_l1_data_cache.yaml": "689aba850739a9cbd64ce1e816e95dff",
+        "1700_l2_cache.yaml": "067f8c8a7264762fdc58a41728b4382b",
         "1800_l2_cache_per_channel.yaml": "7e2a1809a9b7f70a088068d6689c8aa4",
         "2100_pc_sampling.yaml": "8049866f25214544f1e53a9e2f08399b"
       }
diff --git a/projects/rocprofiler-compute/tools/config_management/README.md b/projects/rocprofiler-compute/tools/config_management/README.md
index 3677bdf2c6..d19ecf5e95 100644
--- a/projects/rocprofiler-compute/tools/config_management/README.md
+++ b/projects/rocprofiler-compute/tools/config_management/README.md
@@ -97,7 +97,7 @@ Addition:
     metric_descriptions:
       New Metric:
         plain: Description text
-        rst: |- # Optional
+        rst: >- # Optional
           Description with :ref:`RST markup <link>`
 
 Deletion:
@@ -231,7 +231,7 @@ Modification:
     metric_descriptions:
       Existing Metric:
         plain: Updated description
-        rst: |-
+        rst: >-
           Updated description with **RST**"
 ```
 
diff --git a/projects/rocprofiler-compute/tools/config_management/metric_description_manager.py b/projects/rocprofiler-compute/tools/config_management/metric_description_manager.py
index 6c197c89d5..3fa75d5122 100644
--- a/projects/rocprofiler-compute/tools/config_management/metric_description_manager.py
+++ b/projects/rocprofiler-compute/tools/config_management/metric_description_manager.py
@@ -114,11 +114,9 @@ def merge_docs_rst_as_default(descs: dict, docs_file: Path) -> dict:
     for section, metrics in descs.items():
         docs_section = docs.get(section) or {}
         for metric_name, d in metrics.items():
-            # If panel didn't explicitly provide rst, inherit from docs
-            if not d.get("rst"):
-                doc_entry = docs_section.get(metric_name) or {}
-                if doc_entry.get("rst"):
-                    d["rst"] = doc_entry["rst"]
+            doc_entry = docs_section.get(metric_name) or {}
+            if doc_entry.get("rst"):
+                d["rst"] = doc_entry["rst"]
     return descs
 
 
@@ -129,10 +127,6 @@ def merge_units_as_default(descs: dict, docs_file: Path, per_arch_file: Path) ->
       2) else from docs file,
       3) else leave as-is (missing).
     """
-    per_arch: dict = {}
-    if per_arch_file.exists():
-        with open(per_arch_file, "r", encoding="utf-8") as f:
-            per_arch = yaml.safe_load(f) or {}
 
     docs: dict = {}
     if docs_file.exists():
@@ -140,18 +134,11 @@ def merge_units_as_default(descs: dict, docs_file: Path, per_arch_file: Path) ->
             docs = yaml.safe_load(f) or {}
 
     for section, metrics in descs.items():
-        psec = per_arch.get(section) or {}
         dsec = docs.get(section) or {}
         for metric, data in metrics.items():
-            # Only fill if panel did NOT explicitly set unit
-            if "unit" not in data or data["unit"] is None:
-                unit = None
-                if metric in psec and isinstance(psec[metric], dict):
-                    unit = psec[metric].get("unit")
-                if unit is None and metric in dsec and isinstance(dsec[metric], dict):
-                    unit = dsec[metric].get("unit")
-                if unit is not None:
-                    data["unit"] = unit
+            doc_entry = dsec.get(metric)  # docs-file entry for this metric, if any
+            if isinstance(doc_entry, dict) and "unit" in doc_entry:
+                data["unit"] = doc_entry["unit"]  # docs unit wins whenever present
     return descs
 
 
@@ -403,7 +390,7 @@ def sync_arch(
     update_per_arch_metrics_file(arch_name, descriptions, per_arch_metrics_dir)
 
     # 5) Only when latest: update docs, but overwrite 'rst' only for overrides
-    if is_latest:
+    if is_latest and (panel_rst_overrides or panel_unit_overrides):
         if not update_docs_metrics_file(
             descriptions,
             docs_metrics_file,
diff --git a/projects/rocprofiler-compute/tools/config_management/utils.py b/projects/rocprofiler-compute/tools/config_management/utils.py
index 0af6e5aceb..d79d65a257 100644
--- a/projects/rocprofiler-compute/tools/config_management/utils.py
+++ b/projects/rocprofiler-compute/tools/config_management/utils.py
@@ -31,7 +31,7 @@ import yaml
 
 def str_representer(dumper, data):
     if "\n" in data:
-        return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
+        return dumper.represent_scalar("tag:yaml.org,2002:str", data, style=">")
     return dumper.represent_scalar("tag:yaml.org,2002:str", data)
 
 
diff --git a/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx908_metrics_description.yaml b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx908_metrics_description.yaml
index 0fd8a4b262..10e5304b9c 100644
--- a/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx908_metrics_description.yaml
+++ b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx908_metrics_description.yaml
@@ -1,20 +1,20 @@
 System Speed-of-Light:
   VALU FLOPs:
-    rst: |-
+    rst: >-
       The total floating-point operations executed per second on the :ref:`VALU
       <desc-valu>`. This is also presented as a percent of the peak theoretical
       FLOPs achievable on the specific accelerator. Note: this does not include
       any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.
     unit: GFLOPs
   VALU IOPs:
-    rst: |-
+    rst: >-
       The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
       This is also presented as a percent of the peak theoretical IOPs achievable
       on the specific accelerator. Note: this does not include any integer operations
       from :ref:`MFMA <desc-mfma>` instructions.
     unit: GOIPs
   MFMA FLOPs (BF16):
-    rst: |-
+    rst: >-
       The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
       operations executed per second. Note: this does not include any 16-bit brain
       floating point operations from :ref:`VALU <desc-valu>` instructions. This
@@ -22,7 +22,7 @@ System Speed-of-Light:
       achievable on the specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F16):
-    rst: |-
+    rst: >-
       The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 16-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -30,7 +30,7 @@ System Speed-of-Light:
       specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F32):
-    rst: |-
+    rst: >-
       The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 32-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -38,7 +38,7 @@ System Speed-of-Light:
       specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F64):
-    rst: |-
+    rst: >-
       The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 64-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -46,7 +46,7 @@ System Speed-of-Light:
       specific accelerator.
     unit: GFLOPs
   MFMA IOPs (Int8):
-    rst: |-
+    rst: >-
       The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
       per second. Note: this does not include any 8-bit integer operations from
       :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
@@ -99,7 +99,7 @@ System Speed-of-Light:
       over the :ref:`total active CU cycles <total-active-cu-cycles>`.
     unit: Instructions per-cycle
   Wavefront Occupancy:
-    rst: |-
+    rst: >-
       The time-averaged number of wavefronts resident on the accelerator over
       the lifetime of the kernel. Note: this metric may be inaccurate for short-running
       kernels (less than 1ms). This is also presented as a percent of the peak theoretical
@@ -144,7 +144,7 @@ System Speed-of-Light:
       peak theoretical bandwidth achievable on the specific accelerator.
     unit: GB/s
   L2-Fabric Read BW:
-    rst: |-
+    rst: >-
       The number of bytes read by the L2 over the :ref:`Infinity Fabric\u2122
       interface <l2-fabric>` per unit time. This is also presented as a percent
       of the peak theoretical bandwidth achievable on the specific accelerator.
@@ -229,19 +229,19 @@ Memory Chart:
     rst: Total number of compute units (CUs) on the accelerator.
     unit: CUs
   VGPR:
-    rst: |-
+    rst: >-
       The number of architected vector general-purpose registers allocated for the
       kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the
       number of VGPRs requested by the compiler due to allocation granularity.
     unit: VGPRs
   SGPR:
-    rst: |-
+    rst: >-
       The number of scalar general-purpose registers allocated for the kernel, see
       :ref:`SALU <desc-salu>`. Note: this may not exactly match the number of
       SGPRs requested by the compiler due to allocation granularity.
     unit: SGPRs
   LDS Allocation:
-    rst: |-
+    rst: >-
       The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory)
       allocated for this kernel. Note: This may also be larger than what was requested
       at compile time due to both allocation granularity and dynamic per-dispatch
@@ -420,28 +420,28 @@ Memory Chart:
     unit: Requests per normalization unit
 Roofline Performance Rates:
   VALU FLOPs (F16):
-    rst: |-
+    rst: >-
       The total 16-bit floating-point operations executed per second on the :ref:`VALU
       <desc-valu>`. This is presented with the value of the peak empirical F16 FLOPs achievable
       on the specific accelerator. Note: this does not include any F16 operations
       from :ref:`MFMA <desc-mfma>` instructions.
     unit: GFLOPs
   VALU FLOPs (F32):
-    rst: |-
+    rst: >-
       The total 32-bit floating-point operations executed per second on the :ref:`VALU
       <desc-valu>`. This is presented with the value of the peak empirical F32 FLOPs achievable
       on the specific accelerator. Note: this does not include any F32 operations
       from :ref:`MFMA <desc-mfma>` instructions.
     unit: GFLOPs
   VALU FLOPs (F64):
-    rst: |-
+    rst: >-
       The total 64-bit floating-point operations executed per second on the :ref:`VALU
       <desc-valu>`. This is presented with the value of the peak empirical F64 FLOPs achievable
       on the specific accelerator. Note: this does not include any F64 operations
       from :ref:`MFMA <desc-mfma>` instructions.
     unit: GFLOPs
   MFMA FLOPs (F64):
-    rst: |-
+    rst: >-
       The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 64-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
@@ -449,7 +449,7 @@ Roofline Performance Rates:
       displayed alongside for comparison.
     unit: GFLOPs
   MFMA FLOPs (F32):
-    rst: |-
+    rst: >-
       The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 32-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
@@ -457,7 +457,7 @@ Roofline Performance Rates:
       displayed alongside for comparison.
     unit: GFLOPs
   MFMA FLOPs (F16):
-    rst: |-
+    rst: >-
       The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 16-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
@@ -465,7 +465,7 @@ Roofline Performance Rates:
       displayed alongside for comparison.
     unit: GFLOPs
   MFMA FLOPs (BF16):
-    rst: |-
+    rst: >-
       The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
       operations executed per second. Note: this does not include any 16-bit brain
       floating point operations from :ref:`VALU <desc-valu>` instructions. The
@@ -473,7 +473,7 @@ Roofline Performance Rates:
       accelerator is displayed alongside for comparison.
     unit: GFLOPs
   MFMA IOPs (Int8):
-    rst: |-
+    rst: >-
       The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
       per second. Note: this does not include any 8-bit integer operations from
       :ref:`VALU <desc-valu>` instructions. The peak empirically measured INT8 MFMA
@@ -481,7 +481,7 @@ Roofline Performance Rates:
       for comparison.
     unit: GIOPs
   HBM Bandwidth:
-    rst: |-
+    rst: >-
       The total number of bytes read from and written to High-Bandwidth
         Memory (HBM) per second. The peak empirically measured bandwidth achievable
         on the specific accelerator is displayed alongside for comparison.
@@ -512,28 +512,28 @@ Roofline Performance Rates:
     unit: GB/s
 Roofline Plot Points:
   AI HBM:
-    rst: |-
+    rst: >-
       The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
       It is the ratio of total floating-point operations (FLOPs) to total bytes
       transferred between HBM and the L2 cache. This value is used as the x-coordinate
       for the HBM roofline.
     unit: FLOPs/Byte
   AI L2:
-    rst: |-
+    rst: >-
       The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
       of total floating-point operations (FLOPs) to total bytes transferred between
       the L2 cache and the L1 cache. This value is used as the x-coordinate for
       the L2 roofline.
     unit: FLOPs/Byte
   AI L1:
-    rst: |-
+    rst: >-
       The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
       of total floating-point operations (FLOPs) to total bytes transferred between
       the L1 cache and the processing units. This value is used as the x-coordinate
       for the L1 roofline.
     unit: FLOPs/Byte
   Performance (GFLOPs):
-    rst: |-
+    rst: >-
       The overall achieved performance, measured in GigaFLOPs
       per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
       operations divided by the total execution time. This value is used as the y-coordinate
@@ -591,7 +591,7 @@ Workgroup manager utilizations:
       any work.
     unit: Percent
   Scheduler-Pipe Utilization:
-    rst: |-
+    rst: >-
       The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
       in the kernel where the scheduler-pipes were actively doing any work. Note: this
       value is expected to range between 0% and 25%. See :ref:`desc-spi`.
@@ -629,7 +629,7 @@ Workgroup manager utilizations:
     unit: Cycles/wave
 Workgroup Manager - Resource Allocation:
   Not-scheduled Rate (Workgroup Manager):
-    rst: |-
+    rst: >-
       The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
       in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
       due to a bottleneck within the workgroup manager rather than a lack of a
@@ -638,7 +638,7 @@ Workgroup Manager - Resource Allocation:
       description.
     unit: Percent
   Not-scheduled Rate (Scheduler-Pipe):
-    rst: |-
+    rst: >-
       The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
       in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
       due to a bottleneck within the scheduler-pipes rather than a lack of a CU
@@ -647,7 +647,7 @@ Workgroup Manager - Resource Allocation:
       description.
     unit: Percent
   Scheduler-Pipe Stall Rate:
-    rst: |-
+    rst: >-
       The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
       in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
       due to occupancy limitations (like a lack of a CU or :ref:`SIMD <desc-valu>`
@@ -711,7 +711,7 @@ Wavefront Launch Stats:
       block size.
     unit: Work-Items
   Total Wavefronts:
-    rst: |-
+    rst: >-
       The total number of wavefronts launched as part of the kernel dispatch.
       On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront
       size is always 64 work-items. Thus, the total number of wavefronts should
@@ -726,25 +726,25 @@ Wavefront Launch Stats:
       <https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr>`_.
     unit: Wavefronts
   VGPRs:
-    rst: |-
+    rst: >-
       The number of architected vector general-purpose registers allocated for the
       kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the
       number of VGPRs requested by the compiler due to allocation granularity.
     unit: VGPRs
   AGPRs:
-    rst: |-
+    rst: >-
       The number of accumulation vector general-purpose registers allocated
       for the kernel, see :ref:`AGPRs <desc-agprs>`. Note: this may not exactly match
       the number of AGPRs requested by the compiler due to allocation granularity.
     unit: AGPRs
   SGPRs:
-    rst: |-
+    rst: >-
       The number of scalar general-purpose registers allocated for the kernel, see
       :ref:`SALU <desc-salu>`. Note: this may not exactly match the number of
       SGPRs requested by the compiler due to allocation granularity.
     unit: SGPRs
   LDS Allocation:
-    rst: |-
+    rst: >-
       The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory)
       allocated for this kernel. Note: This may also be larger than what was requested
       at compile time due to both allocation granularity and dynamic per-dispatch
@@ -767,7 +767,7 @@ Wavefront Runtime Stats:
       This is averaged over all wavefronts in a kernel dispatch.
     unit: Instructions per wavefront
   Wave Cycles:
-    rst: |-
+    rst: >-
       The number of cycles a wavefront in the kernel dispatch spent resident
       on a compute unit per :ref:`normalization unit <normalization-units>`. This is
       averaged over all wavefronts in a kernel dispatch. Note: this should not
@@ -805,7 +805,7 @@ Wavefront Runtime Stats:
       the total Wave Cycles metric.
     unit: Cycles per normalization unit
   Wavefront Occupancy:
-    rst: |-
+    rst: >-
       The time-averaged number of wavefronts resident on the accelerator over the
       lifetime of the kernel. Note: this metric may be inaccurate for short-running
       kernels (less than 1ms).
@@ -976,7 +976,7 @@ LDS Statistics:
       to stalls from non-dword aligned addresses per :ref:`normalization unit <normalization-units>`.
     unit: Cycles per normalization unit
   Mem Violations:
-    rst: |-
+    rst: >-
       The total number of out-of-bounds accesses made to the LDS, per :ref:`normalization
       unit <normalization-units>`. This is unused and expected to be zero in
       most configurations for modern CDNA\u2122 accelerators.
@@ -993,7 +993,7 @@ L1I Speed-of-Light:
       over the number of all L1I requests.
     unit: Percent
   L1I-L2 Bandwidth Utilization:
-    rst: |-
+    rst: >-
       The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth
       achieved. Calculated as the ratio of the total number of requests from
       the L1I to the L2 cache over the :ref:`total L1I-L2 interface cycles <total-l1i-cycles>`.
@@ -1100,7 +1100,7 @@ Scalar L1D cache accesses:
     unit: Requests per normalization unit
 Scalar L1D Cache - L2 Interface:
   sL1D-L2 BW:
-    rst: |-
+    rst: >-
       The total number of bytes read from, written to, or atomically updated
       across the sL1D\u2194:doc:`L2 <l2-cache>` interface, divided by total duration.
       Note that sL1D writes and atomics are typically
@@ -1122,7 +1122,7 @@ Scalar L1D Cache - L2 Interface:
       CDNA accelerators.
     unit: Requests per normalization unit
   Stall Cycles:
-    rst: |-
+    rst: >-
       The total number of cycles the sL1D\u2194 :doc:`L2 <l2-cache>` interface
       was stalled, per :ref:`normalization unit <normalization-units>`.
     unit: Cycles per normalization unit
@@ -1432,7 +1432,7 @@ L1 Unified Translation Cache (UTCL1):
       translation not being present in the cache, per :ref:`normalization unit <normalization-units>`.
     unit: unit
   Permission Misses:
-    rst: |-
+    rst: >-
       The total number of translation requests that missed in the UTCL1 due
       to a permission error, per :ref:`normalization unit <normalization-units>`.
       This is unused and expected to be zero in most configurations for modern
diff --git a/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx90a_metrics_description.yaml b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx90a_metrics_description.yaml
index 39e2a52664..828289ee98 100644
--- a/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx90a_metrics_description.yaml
+++ b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx90a_metrics_description.yaml
@@ -1,20 +1,20 @@
 System Speed-of-Light:
   VALU FLOPs:
-    rst: |-
+    rst: >-
       The total floating-point operations executed per second on the :ref:`VALU
       <desc-valu>`. This is also presented as a percent of the peak theoretical
       FLOPs achievable on the specific accelerator. Note: this does not include
       any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.
     unit: GFLOPs
   VALU IOPs:
-    rst: |-
+    rst: >-
       The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
       This is also presented as a percent of the peak theoretical IOPs achievable
       on the specific accelerator. Note: this does not include any integer operations
       from :ref:`MFMA <desc-mfma>` instructions.
     unit: GOIPs
   MFMA FLOPs (BF16):
-    rst: |-
+    rst: >-
       The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
       operations executed per second. Note: this does not include any 16-bit brain
       floating point operations from :ref:`VALU <desc-valu>` instructions. This
@@ -22,7 +22,7 @@ System Speed-of-Light:
       achievable on the specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F16):
-    rst: |-
+    rst: >-
       The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 16-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -30,7 +30,7 @@ System Speed-of-Light:
       specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F32):
-    rst: |-
+    rst: >-
       The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 32-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -38,7 +38,7 @@ System Speed-of-Light:
       specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F64):
-    rst: |-
+    rst: >-
       The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 64-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -46,7 +46,7 @@ System Speed-of-Light:
       specific accelerator.
     unit: GFLOPs
   MFMA IOPs (Int8):
-    rst: |-
+    rst: >-
       The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
       per second. Note: this does not include any 8-bit integer operations from
       :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
@@ -99,7 +99,7 @@ System Speed-of-Light:
       over the :ref:`total active CU cycles <total-active-cu-cycles>`.
     unit: Instructions per-cycle
   Wavefront Occupancy:
-    rst: |-
+    rst: >-
       The time-averaged number of wavefronts resident on the accelerator over
       the lifetime of the kernel. Note: this metric may be inaccurate for short-running
       kernels (less than 1ms). This is also presented as a percent of the peak theoretical
@@ -144,7 +144,7 @@ System Speed-of-Light:
       peak theoretical bandwidth achievable on the specific accelerator.
     unit: GB/s
   L2-Fabric Read BW:
-    rst: |-
+    rst: >-
       The number of bytes read by the L2 over the :ref:`Infinity Fabric\u2122
       interface <l2-fabric>` per unit time. This is also presented as a percent
       of the peak theoretical bandwidth achievable on the specific accelerator.
@@ -229,19 +229,19 @@ Memory Chart:
     rst: Total number of compute units (CUs) on the accelerator.
     unit: CUs
   VGPR:
-    rst: |-
+    rst: >-
       The number of architected vector general-purpose registers allocated for the
       kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the
       number of VGPRs requested by the compiler due to allocation granularity.
     unit: VGPRs
   SGPR:
-    rst: |-
+    rst: >-
       The number of scalar general-purpose registers allocated for the kernel, see
       :ref:`SALU <desc-salu>`. Note: this may not exactly match the number of
       SGPRs requested by the compiler due to allocation granularity.
     unit: SGPRs
   LDS Allocation:
-    rst: |-
+    rst: >-
       The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory)
       allocated for this kernel. Note: This may also be larger than what was requested
       at compile time due to both allocation granularity and dynamic per-dispatch
@@ -420,28 +420,28 @@ Memory Chart:
     unit: Requests per normalization unit
 Roofline Performance Rates:
   VALU FLOPs (F16):
-    rst: |-
+    rst: >-
       The total 16-bit floating-point operations executed per second on the :ref:`VALU
       <desc-valu>`. This is presented with the value of the peak empirical F16 FLOPs achievable
       on the specific accelerator. Note: this does not include any F16 operations
       from :ref:`MFMA <desc-mfma>` instructions.
     unit: GFLOPs
   VALU FLOPs (F32):
-    rst: |-
+    rst: >-
       The total 32-bit floating-point operations executed per second on the :ref:`VALU
       <desc-valu>`. This is presented with the value of the peak empirical F32 FLOPs achievable
       on the specific accelerator. Note: this does not include any F32 operations
       from :ref:`MFMA <desc-mfma>` instructions.
     unit: GFLOPs
   VALU FLOPs (F64):
-    rst: |-
+    rst: >-
       The total 64-bit floating-point operations executed per second on the :ref:`VALU
       <desc-valu>`. This is presented with the value of the peak empirical F64 FLOPs achievable
       on the specific accelerator. Note: this does not include any F64 operations
       from :ref:`MFMA <desc-mfma>` instructions.
     unit: GFLOPs
   MFMA FLOPs (F64):
-    rst: |-
+    rst: >-
       The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 64-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
@@ -449,7 +449,7 @@ Roofline Performance Rates:
       displayed alongside for comparison.
     unit: GFLOPs
   MFMA FLOPs (F32):
-    rst: |-
+    rst: >-
       The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 32-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
@@ -457,7 +457,7 @@ Roofline Performance Rates:
       displayed alongside for comparison.
     unit: GFLOPs
   MFMA FLOPs (F16):
-    rst: |-
+    rst: >-
       The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 16-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
@@ -465,7 +465,7 @@ Roofline Performance Rates:
       displayed alongside for comparison.
     unit: GFLOPs
   MFMA FLOPs (BF16):
-    rst: |-
+    rst: >-
       The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
       operations executed per second. Note: this does not include any 16-bit brain
       floating point operations from :ref:`VALU <desc-valu>` instructions. The
@@ -473,7 +473,7 @@ Roofline Performance Rates:
       accelerator is displayed alongside for comparison.
     unit: GFLOPs
   MFMA IOPs (Int8):
-    rst: |-
+    rst: >-
       The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
       per second. Note: this does not include any 8-bit integer operations from
       :ref:`VALU <desc-valu>` instructions. The peak empirically measured INT8 MFMA
@@ -481,7 +481,7 @@ Roofline Performance Rates:
       for comparison.
     unit: GIOPs
   HBM Bandwidth:
-    rst: |-
+    rst: >-
       The total number of bytes read from and written to High-Bandwidth
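-        Memory (HBM) per second. The peak empirically measured bandwidth achievable
-        on the specific accelerator is displayed alongside for comparison.
+      Memory (HBM) per second. The peak empirically measured bandwidth achievable
+      on the specific accelerator is displayed alongside for comparison.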
@@ -512,28 +512,28 @@ Roofline Performance Rates:
     unit: GB/s
 Roofline Plot Points:
   AI HBM:
-    rst: |-
+    rst: >-
       The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
       It is the ratio of total floating-point operations (FLOPs) to total bytes
       transferred between HBM and the L2 cache. This value is used as the x-coordinate
       for the HBM roofline.
     unit: FLOPs/Byte
   AI L2:
-    rst: |-
+    rst: >-
       The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
       of total floating-point operations (FLOPs) to total bytes transferred between
       the L2 cache and the L1 cache. This value is used as the x-coordinate for
       the L2 roofline.
     unit: FLOPs/Byte
   AI L1:
-    rst: |-
+    rst: >-
       The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
       of total floating-point operations (FLOPs) to total bytes transferred between
       the L1 cache and the processing units. This value is used as the x-coordinate
       for the L1 roofline.
     unit: FLOPs/Byte
   Performance (GFLOPs):
-    rst: |-
+    rst: >-
       The overall achieved performance, measured in GigaFLOPs
       per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
       operations divided by the total execution time. This value is used as the y-coordinate
@@ -591,7 +591,7 @@ Workgroup manager utilizations:
       any work.
     unit: Percent
   Scheduler-Pipe Utilization:
-    rst: |-
+    rst: >-
       The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
       in the kernel where the scheduler-pipes were actively doing any work. Note: this
       value is expected to range between 0% and 25%. See :ref:`desc-spi`.
@@ -629,7 +629,7 @@ Workgroup manager utilizations:
     unit: Cycles/wave
 Workgroup Manager - Resource Allocation:
   Not-scheduled Rate (Workgroup Manager):
-    rst: |-
+    rst: >-
       The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
       in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
       due to a bottleneck within the workgroup manager rather than a lack of a
@@ -638,7 +638,7 @@ Workgroup Manager - Resource Allocation:
       description.
     unit: Percent
   Not-scheduled Rate (Scheduler-Pipe):
-    rst: |-
+    rst: >-
       The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
       in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
       due to a bottleneck within the scheduler-pipes rather than a lack of a CU
@@ -647,7 +647,7 @@ Workgroup Manager - Resource Allocation:
       description.
     unit: Percent
   Scheduler-Pipe Stall Rate:
-    rst: |-
+    rst: >-
       The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
       in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
       due to occupancy limitations (like a lack of a CU or :ref:`SIMD <desc-valu>`
@@ -711,7 +711,7 @@ Wavefront Launch Stats:
       block size.
     unit: Work-Items
   Total Wavefronts:
-    rst: |-
+    rst: >-
       The total number of wavefronts launched as part of the kernel dispatch.
       On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront
       size is always 64 work-items. Thus, the total number of wavefronts should
@@ -726,25 +726,25 @@ Wavefront Launch Stats:
       <https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr>`_.
     unit: Wavefronts
   VGPRs:
-    rst: |-
+    rst: >-
       The number of architected vector general-purpose registers allocated for the
       kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the
       number of VGPRs requested by the compiler due to allocation granularity.
     unit: VGPRs
   AGPRs:
-    rst: |-
+    rst: >-
       The number of accumulation vector general-purpose registers allocated
       for the kernel, see :ref:`AGPRs <desc-agprs>`. Note: this may not exactly match
       the number of AGPRs requested by the compiler due to allocation granularity.
     unit: AGPRs
   SGPRs:
-    rst: |-
+    rst: >-
       The number of scalar general-purpose registers allocated for the kernel, see
       :ref:`SALU <desc-salu>`. Note: this may not exactly match the number of
       SGPRs requested by the compiler due to allocation granularity.
     unit: SGPRs
   LDS Allocation:
-    rst: |-
+    rst: >-
       The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory)
       allocated for this kernel. Note: This may also be larger than what was requested
       at compile time due to both allocation granularity and dynamic per-dispatch
@@ -767,7 +767,7 @@ Wavefront Runtime Stats:
       This is averaged over all wavefronts in a kernel dispatch.
     unit: Instructions per wavefront
   Wave Cycles:
-    rst: |-
+    rst: >-
       The number of cycles a wavefront in the kernel dispatch spent resident
       on a compute unit per :ref:`normalization unit <normalization-units>`. This is
       averaged over all wavefronts in a kernel dispatch. Note: this should not
@@ -805,7 +805,7 @@ Wavefront Runtime Stats:
       the total Wave Cycles metric.
     unit: Cycles per normalization unit
   Wavefront Occupancy:
-    rst: |-
+    rst: >-
       The time-averaged number of wavefronts resident on the accelerator over the
       lifetime of the kernel. Note: this metric may be inaccurate for short-running
       kernels (less than 1ms).
@@ -907,7 +907,7 @@ VALU Arithmetic Instruction Mix:
       unit <normalization-units>`.
     unit: Instructions per normalization unit
   Conversion:
-    rst: |-
+    rst: >-
       The total number of type conversion instructions (such as converting data
       to or from F32\u2194F64) issued to the VALU per :ref:`normalization unit
       <normalization-units>`.
@@ -976,21 +976,21 @@ MFMA Arithmetic Instruction Mix:
     unit: Instructions per normalization unit
 Compute Speed-of-Light:
   VALU FLOPs:
-    rst: |-
+    rst: >-
       The total floating-point operations executed per second on the :ref:`VALU
       <desc-valu>`. This is also presented as a percent of the peak theoretical
       FLOPs achievable on the specific accelerator. Note: this does not include
       any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.
     unit: GFLOPs
   VALU IOPs:
-    rst: |-
+    rst: >-
       The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
       This is also presented as a percent of the peak theoretical IOPs achievable
       on the specific accelerator. Note: this does not include any integer operations
       from :ref:`MFMA <desc-mfma>` instructions.
     unit: GIOPs
   MFMA FLOPs (BF16):
-    rst: |-
+    rst: >-
       The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 16-bit brain floating
       point operations from :ref:`VALU <desc-valu>` instructions. This is also
@@ -998,7 +998,7 @@ Compute Speed-of-Light:
       on the specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F16):
-    rst: |-
+    rst: >-
       The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 16-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -1006,7 +1006,7 @@ Compute Speed-of-Light:
       specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F32):
-    rst: |-
+    rst: >-
       The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 32-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -1014,7 +1014,7 @@ Compute Speed-of-Light:
       specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F64):
-    rst: |-
+    rst: >-
       The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 64-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -1026,7 +1026,7 @@ Compute Speed-of-Light:
       achievable on the specific accelerator.
     unit: GFLOPs
   MFMA IOPs (INT8):
-    rst: |-
+    rst: >-
       The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
       per second. Note: this does not include any 8-bit integer operations from
       :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
@@ -1112,7 +1112,7 @@ Arithmetic Operations:
       unit <normalization-units>`.
     unit: FLOP per normalization unit
   BF16 OPs:
-    rst: |-
+    rst: >-
       The total number of 16-bit brain floating-point operations executed on
       either the :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
       unit <normalization-units>`. Note: on current CDNA accelerators, the VALU
@@ -1129,7 +1129,7 @@ Arithmetic Operations:
       unit <normalization-units>`.
     unit: FLOP per normalization unit
   INT8 OPs:
-    rst: |-
+    rst: >-
       The total number of 8-bit integer operations executed on either the :ref:`VALU
       <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
       <normalization-units>`. Note: on current CDNA accelerators, the VALU has
@@ -1206,7 +1206,7 @@ LDS Statistics:
       to stalls from non-dword aligned addresses per :ref:`normalization unit <normalization-units>`.
     unit: Cycles per normalization unit
   Mem Violations:
-    rst: |-
+    rst: >-
       The total number of out-of-bounds accesses made to the LDS, per :ref:`normalization
       unit <normalization-units>`. This is unused and expected to be zero in
       most configurations for modern CDNA\u2122 accelerators.
@@ -1223,7 +1223,7 @@ L1I Speed-of-Light:
       over the number of all L1I requests.
     unit: Percent
   L1I-L2 Bandwidth Utilization:
-    rst: |-
+    rst: >-
       The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth
       achieved. Calculated as the ratio of the total number of requests from
       the L1I to the L2 cache over the :ref:`total L1I-L2 interface cycles <total-l1i-cycles>`.
@@ -1330,7 +1330,7 @@ Scalar L1D cache accesses:
     unit: Requests per normalization unit
 Scalar L1D Cache - L2 Interface:
   sL1D-L2 BW:
-    rst: |-
+    rst: >-
       The total number of bytes read from, written to, or atomically updated
       across the sL1D\u2194:doc:`L2 <l2-cache>` interface, divided by total duration.
       Note that sL1D writes and atomics are typically
@@ -1352,7 +1352,7 @@ Scalar L1D Cache - L2 Interface:
       CDNA accelerators.
     unit: Requests per normalization unit
   Stall Cycles:
-    rst: |-
+    rst: >-
       The total number of cycles the sL1D\u2194 :doc:`L2 <l2-cache>` interface
       was stalled, per :ref:`normalization unit <normalization-units>`.
     unit: Cycles per normalization unit
@@ -1676,7 +1676,7 @@ L1 Unified Translation Cache (UTCL1):
       translation not being present in the cache, per :ref:`normalization unit <normalization-units>`.
     unit: unit
   Permission Misses:
-    rst: |-
+    rst: >-
       The total number of translation requests that missed in the UTCL1 due
       to a permission error, per :ref:`normalization unit <normalization-units>`.
       This is unused and expected to be zero in most configurations for modern
diff --git a/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx940_metrics_description.yaml b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx940_metrics_description.yaml
index 5a978ccee5..f208484a66 100644
--- a/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx940_metrics_description.yaml
+++ b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx940_metrics_description.yaml
@@ -1,20 +1,20 @@
 System Speed-of-Light:
   VALU FLOPs:
-    rst: |-
+    rst: >-
       The total floating-point operations executed per second on the :ref:`VALU
       <desc-valu>`. This is also presented as a percent of the peak theoretical
       FLOPs achievable on the specific accelerator. Note: this does not include
       any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.
     unit: GFLOPs
   VALU IOPs:
-    rst: |-
+    rst: >-
       The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
       This is also presented as a percent of the peak theoretical IOPs achievable
       on the specific accelerator. Note: this does not include any integer operations
       from :ref:`MFMA <desc-mfma>` instructions.
-    unit: GOIPs
+    unit: GIOPs
   MFMA FLOPs (F8):
-    rst: |-
+    rst: >-
       The total number of 8-bit brain floating point :ref:`MFMA <desc-mfma>`
-      operations executed per second. Note: this does not include any 16-bit brain
+      operations executed per second. Note: this does not include any 8-bit brain
       floating point operations from :ref:`VALU <desc-valu>` instructions. This
@@ -23,7 +23,7 @@ System Speed-of-Light:
       series and later only.
     unit: GFLOPs
   MFMA FLOPs (BF16):
-    rst: |-
+    rst: >-
       The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
       operations executed per second. Note: this does not include any 16-bit brain
       floating point operations from :ref:`VALU <desc-valu>` instructions. This
@@ -31,7 +31,7 @@ System Speed-of-Light:
       achievable on the specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F16):
-    rst: |-
+    rst: >-
       The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 16-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -39,7 +39,7 @@ System Speed-of-Light:
       specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F32):
-    rst: |-
+    rst: >-
       The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 32-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -47,7 +47,7 @@ System Speed-of-Light:
       specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F64):
-    rst: |-
+    rst: >-
       The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 64-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -55,7 +55,7 @@ System Speed-of-Light:
       specific accelerator.
     unit: GFLOPs
   MFMA IOPs (Int8):
-    rst: |-
+    rst: >-
       The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
       per second. Note: this does not include any 8-bit integer operations from
       :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
@@ -108,7 +108,7 @@ System Speed-of-Light:
       over the :ref:`total active CU cycles <total-active-cu-cycles>`.
     unit: Instructions per-cycle
   Wavefront Occupancy:
-    rst: |-
+    rst: >-
       The time-averaged number of wavefronts resident on the accelerator over
       the lifetime of the kernel. Note: this metric may be inaccurate for short-running
       kernels (less than 1ms). This is also presented as a percent of the peak theoretical
@@ -153,7 +153,7 @@ System Speed-of-Light:
       peak theoretical bandwidth achievable on the specific accelerator.
     unit: GB/s
   L2-Fabric Read BW:
-    rst: |-
+    rst: >-
       The number of bytes read by the L2 over the :ref:`Infinity Fabric\u2122
       interface <l2-fabric>` per unit time. This is also presented as a percent
       of the peak theoretical bandwidth achievable on the specific accelerator.
@@ -238,19 +238,19 @@ Memory Chart:
     rst: Total number of compute units (CUs) on the accelerator.
     unit: CUs
   VGPR:
-    rst: |-
+    rst: >-
       The number of architected vector general-purpose registers allocated for the
       kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the
       number of VGPRs requested by the compiler due to allocation granularity.
     unit: VGPRs
   SGPR:
-    rst: |-
+    rst: >-
       The number of scalar general-purpose registers allocated for the kernel, see
       :ref:`SALU <desc-salu>`. Note: this may not exactly match the number of
       SGPRs requested by the compiler due to allocation granularity.
     unit: SGPRs
   LDS Allocation:
-    rst: |-
+    rst: >-
       The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory)
       allocated for this kernel. Note: This may also be larger than what was requested
       at compile time due to both allocation granularity and dynamic per-dispatch
@@ -419,28 +419,28 @@ Memory Chart:
     unit: Requests per normalization unit
 Roofline Performance Rates:
   VALU FLOPs (F16):
-    rst: |-
+    rst: >-
       The total 16-bit floating-point operations executed per second on the :ref:`VALU
       <desc-valu>`. This is presented with the value of the peak empirical F16 FLOPs achievable
       on the specific accelerator. Note: this does not include any F16 operations
       from :ref:`MFMA <desc-mfma>` instructions.
     unit: GFLOPs
   VALU FLOPs (F32):
-    rst: |-
+    rst: >-
       The total 32-bit floating-point operations executed per second on the :ref:`VALU
       <desc-valu>`. This is presented with the value of the peak empirical F32 FLOPs achievable
       on the specific accelerator. Note: this does not include any F32 operations
       from :ref:`MFMA <desc-mfma>` instructions.
     unit: GFLOPs
   VALU FLOPs (F64):
-    rst: |-
+    rst: >-
       The total 64-bit floating-point operations executed per second on the :ref:`VALU
       <desc-valu>`. This is presented with the value of the peak empirical F64 FLOPs achievable
       on the specific accelerator. Note: this does not include any F64 operations
       from :ref:`MFMA <desc-mfma>` instructions.
     unit: GFLOPs
   MFMA FLOPs (F64):
-    rst: |-
+    rst: >-
       The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 64-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
@@ -448,7 +448,7 @@ Roofline Performance Rates:
       displayed alongside for comparison.
     unit: GFLOPs
   MFMA FLOPs (F32):
-    rst: |-
+    rst: >-
       The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 32-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
@@ -456,7 +456,7 @@ Roofline Performance Rates:
       displayed alongside for comparison.
     unit: GFLOPs
   MFMA FLOPs (F16):
-    rst: |-
+    rst: >-
       The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 16-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
@@ -464,7 +464,7 @@ Roofline Performance Rates:
       displayed alongside for comparison.
     unit: GFLOPs
   MFMA FLOPs (BF16):
-    rst: |-
+    rst: >-
       The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
       operations executed per second. Note: this does not include any 16-bit brain
       floating point operations from :ref:`VALU <desc-valu>` instructions. The
@@ -472,7 +472,7 @@ Roofline Performance Rates:
       accelerator is displayed alongside for comparison.
     unit: GFLOPs
   MFMA FLOPs (F8):
-    rst: |-
+    rst: >-
       The total number of 8-bit brain floating point :ref:`MFMA <desc-mfma>`
-      operations executed per second. Note: this does not include any 16-bit brain
+      operations executed per second. Note: this does not include any 8-bit brain
       floating point operations from :ref:`VALU <desc-valu>` instructions. The
@@ -481,7 +481,7 @@ Roofline Performance Rates:
       Instinct MI300 series and later only.
     unit: GFLOPs
   MFMA IOPs (Int8):
-    rst: |-
+    rst: >-
       The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
       per second. Note: this does not include any 8-bit integer operations from
       :ref:`VALU <desc-valu>` instructions. The peak empirically measured INT8 MFMA
@@ -489,7 +489,7 @@ Roofline Performance Rates:
       for comparison.
     unit: GIOPs
   HBM Bandwidth:
-    rst: |-
+    rst: >-
       The total number of bytes read from and written to High-Bandwidth
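-        Memory (HBM) per second. The peak empirically measured bandwidth achievable
-        on the specific accelerator is displayed alongside for comparison.
+      Memory (HBM) per second. The peak empirically measured bandwidth achievable
+      on the specific accelerator is displayed alongside for comparison.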
@@ -520,28 +520,28 @@ Roofline Performance Rates:
     unit: GB/s
 Roofline Plot Points:
   AI HBM:
-    rst: |-
+    rst: >-
       The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
       It is the ratio of total floating-point operations (FLOPs) to total bytes
       transferred between HBM and the L2 cache. This value is used as the x-coordinate
       for the HBM roofline.
     unit: FLOPs/Byte
   AI L2:
-    rst: |-
+    rst: >-
       The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
       of total floating-point operations (FLOPs) to total bytes transferred between
       the L2 cache and the L1 cache. This value is used as the x-coordinate for
       the L2 roofline.
     unit: FLOPs/Byte
   AI L1:
-    rst: |-
+    rst: >-
       The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
       of total floating-point operations (FLOPs) to total bytes transferred between
       the L1 cache and the processing units. This value is used as the x-coordinate
       for the L1 roofline.
     unit: FLOPs/Byte
   Performance (GFLOPs):
-    rst: |-
+    rst: >-
       The overall achieved performance, measured in GigaFLOPs
       per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
       operations divided by the total execution time. This value is used as the y-coordinate
@@ -599,7 +599,7 @@ Workgroup manager utilizations:
       any work.
     unit: Percent
   Scheduler-Pipe Utilization:
-    rst: |-
+    rst: >-
       The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
       in the kernel where the scheduler-pipes were actively doing any work. Note: this
       value is expected to range between 0% and 25%. See :ref:`desc-spi`.
@@ -637,7 +637,7 @@ Workgroup manager utilizations:
     unit: Cycles/wave
 Workgroup Manager - Resource Allocation:
   Not-scheduled Rate (Workgroup Manager):
-    rst: |-
+    rst: >-
       The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
       in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
       due to a bottleneck within the workgroup manager rather than a lack of a
@@ -646,7 +646,7 @@ Workgroup Manager - Resource Allocation:
       description.
     unit: Percent
   Not-scheduled Rate (Scheduler-Pipe):
-    rst: |-
+    rst: >-
       The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
       in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
       due to a bottleneck within the scheduler-pipes rather than a lack of a CU
@@ -655,7 +655,7 @@ Workgroup Manager - Resource Allocation:
       description.
     unit: Percent
   Scheduler-Pipe Stall Rate:
-    rst: |-
+    rst: >-
       The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
       in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
       due to occupancy limitations (like a lack of a CU or :ref:`SIMD <desc-valu>`
@@ -719,7 +719,7 @@ Wavefront Launch Stats:
       block size.
     unit: Work-Items
   Total Wavefronts:
-    rst: |-
+    rst: >-
       The total number of wavefronts launched as part of the kernel dispatch.
       On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront
       size is always 64 work-items. Thus, the total number of wavefronts should
@@ -734,25 +734,25 @@ Wavefront Launch Stats:
       <https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr>`_.
     unit: Wavefronts
   VGPRs:
-    rst: |-
+    rst: >-
       The number of architected vector general-purpose registers allocated for the
       kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the
       number of VGPRs requested by the compiler due to allocation granularity.
     unit: VGPRs
   AGPRs:
-    rst: |-
+    rst: >-
       The number of accumulation vector general-purpose registers allocated
       for the kernel, see :ref:`AGPRs <desc-agprs>`. Note: this may not exactly match
       the number of AGPRs requested by the compiler due to allocation granularity.
     unit: AGPRs
   SGPRs:
-    rst: |-
+    rst: >-
       The number of scalar general-purpose registers allocated for the kernel, see
       :ref:`SALU <desc-salu>`. Note: this may not exactly match the number of
       SGPRs requested by the compiler due to allocation granularity.
     unit: SGPRs
   LDS Allocation:
-    rst: |-
+    rst: >-
       The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory)
       allocated for this kernel. Note: This may also be larger than what was requested
       at compile time due to both allocation granularity and dynamic per-dispatch
@@ -775,7 +775,7 @@ Wavefront Runtime Stats:
       This is averaged over all wavefronts in a kernel dispatch.
     unit: Instructions per wavefront
   Wave Cycles:
-    rst: |-
+    rst: >-
       The number of cycles a wavefront in the kernel dispatch spent resident
       on a compute unit per :ref:`normalization unit <normalization-units>`. This is
       averaged over all wavefronts in a kernel dispatch. Note: this should not
@@ -813,7 +813,7 @@ Wavefront Runtime Stats:
       the total Wave Cycles metric.
     unit: Cycles per normalization unit
   Wavefront Occupancy:
-    rst: |-
+    rst: >-
       The time-averaged number of wavefronts resident on the accelerator over the
       lifetime of the kernel. Note: this metric may be inaccurate for short-running
       kernels (less than 1ms).
@@ -915,7 +915,7 @@ VALU Arithmetic Instruction Mix:
       unit <normalization-units>`.
     unit: Instructions per normalization unit
   Conversion:
-    rst: |-
+    rst: >-
       The total number of type conversion instructions (such as converting data
       to or from F32\u2194F64) issued to the VALU per :ref:`normalization unit
       <normalization-units>`.
@@ -989,14 +989,14 @@ MFMA Arithmetic Instruction Mix:
     unit: Instructions per normalization unit
 Compute Speed-of-Light:
   VALU FLOPs:
-    rst: |-
+    rst: >-
       The total floating-point operations executed per second on the :ref:`VALU
       <desc-valu>`. This is also presented as a percent of the peak theoretical
       FLOPs achievable on the specific accelerator. Note: this does not include
       any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.
     unit: GFLOPs
   VALU IOPs:
-    rst: |-
+    rst: >-
       The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
       This is also presented as a percent of the peak theoretical IOPs achievable
       on the specific accelerator. Note: this does not include any integer operations
@@ -1006,7 +1006,7 @@ Compute Speed-of-Light:
     rst: ''
     unit: Unknown
   MFMA FLOPs (BF16):
-    rst: |-
+    rst: >-
       The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 16-bit brain floating
       point operations from :ref:`VALU <desc-valu>` instructions. This is also
@@ -1014,7 +1014,7 @@ Compute Speed-of-Light:
       on the specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F16):
-    rst: |-
+    rst: >-
       The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 16-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -1022,7 +1022,7 @@ Compute Speed-of-Light:
       specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F32):
-    rst: |-
+    rst: >-
       The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 32-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -1030,7 +1030,7 @@ Compute Speed-of-Light:
       specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F64):
-    rst: |-
+    rst: >-
       The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 64-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -1042,7 +1042,7 @@ Compute Speed-of-Light:
       achievable on the specific accelerator.
     unit: GFLOPs
   MFMA IOPs (INT8):
-    rst: |-
+    rst: >-
       The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
       per second. Note: this does not include any 8-bit integer operations from
       :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
@@ -1131,7 +1131,7 @@ Arithmetic Operations:
       unit <normalization-units>`.
     unit: FLOP per normalization unit
   BF16 OPs:
-    rst: |-
+    rst: >-
       The total number of 16-bit brain floating-point operations executed on
       either the :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
       unit <normalization-units>`. Note: on current CDNA accelerators, the VALU
@@ -1148,7 +1148,7 @@ Arithmetic Operations:
       unit <normalization-units>`.
     unit: FLOP per normalization unit
   INT8 OPs:
-    rst: |-
+    rst: >-
       The total number of 8-bit integer operations executed on either the :ref:`VALU
       <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
       <normalization-units>`. Note: on current CDNA accelerators, the VALU has
@@ -1225,7 +1225,7 @@ LDS Statistics:
       to stalls from non-dword aligned addresses per :ref:`normalization unit <normalization-units>`.
     unit: Cycles per normalization unit
   Mem Violations:
-    rst: |-
+    rst: >-
       The total number of out-of-bounds accesses made to the LDS, per :ref:`normalization
       unit <normalization-units>`. This is unused and expected to be zero in
       most configurations for modern CDNA\u2122 accelerators.
@@ -1242,7 +1242,7 @@ L1I Speed-of-Light:
       over the number of all L1I requests.
     unit: Percent
   L1I-L2 Bandwidth Utilization:
-    rst: |-
+    rst: >-
       The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth
       achieved. Calculated as the ratio of the total number of requests from
       the L1I to the L2 cache over the :ref:`total L1I-L2 interface cycles <total-l1i-cycles>`.
@@ -1349,7 +1349,7 @@ Scalar L1D cache accesses:
     unit: Requests per normalization unit
 Scalar L1D Cache - L2 Interface:
   sL1D-L2 BW:
-    rst: |-
+    rst: >-
       The total number of bytes read from, written to, or atomically updated
       across the sL1D\u2194:doc:`L2 <l2-cache>` interface, divided by total duration.
       Note that sL1D writes and atomics are typically
@@ -1371,7 +1371,7 @@ Scalar L1D Cache - L2 Interface:
       CDNA accelerators.
     unit: Requests per normalization unit
   Stall Cycles:
-    rst: |-
+    rst: >-
       The total number of cycles the sL1D\u2194 :doc:`L2 <l2-cache>` interface
       was stalled, per :ref:`normalization unit <normalization-units>`.
     unit: Cycles per normalization unit
@@ -1681,7 +1681,7 @@ L1 Unified Translation Cache (UTCL1):
       translation not being present in the cache, per :ref:`normalization unit <normalization-units>`.
     unit: unit
   Permission Misses:
-    rst: |-
+    rst: >-
       The total number of translation requests that missed in the UTCL1 due
       to a permission error, per :ref:`normalization unit <normalization-units>`.
       This is unused and expected to be zero in most configurations for modern
diff --git a/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx941_metrics_description.yaml b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx941_metrics_description.yaml
index 5a978ccee5..f208484a66 100644
--- a/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx941_metrics_description.yaml
+++ b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx941_metrics_description.yaml
@@ -1,20 +1,20 @@
 System Speed-of-Light:
   VALU FLOPs:
-    rst: |-
+    rst: >-
       The total floating-point operations executed per second on the :ref:`VALU
       <desc-valu>`. This is also presented as a percent of the peak theoretical
       FLOPs achievable on the specific accelerator. Note: this does not include
       any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.
     unit: GFLOPs
   VALU IOPs:
-    rst: |-
+    rst: >-
       The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
       This is also presented as a percent of the peak theoretical IOPs achievable
       on the specific accelerator. Note: this does not include any integer operations
       from :ref:`MFMA <desc-mfma>` instructions.
     unit: GIOPs
   MFMA FLOPs (F8):
-    rst: |-
+    rst: >-
       The total number of 8-bit brain floating point :ref:`MFMA <desc-mfma>`
       operations executed per second. Note: this does not include any 8-bit brain
       floating point operations from :ref:`VALU <desc-valu>` instructions. This
@@ -23,7 +23,7 @@ System Speed-of-Light:
       series and later only.
     unit: GFLOPs
   MFMA FLOPs (BF16):
-    rst: |-
+    rst: >-
       The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
       operations executed per second. Note: this does not include any 16-bit brain
       floating point operations from :ref:`VALU <desc-valu>` instructions. This
@@ -31,7 +31,7 @@ System Speed-of-Light:
       achievable on the specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F16):
-    rst: |-
+    rst: >-
       The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 16-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -39,7 +39,7 @@ System Speed-of-Light:
       specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F32):
-    rst: |-
+    rst: >-
       The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 32-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -47,7 +47,7 @@ System Speed-of-Light:
       specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F64):
-    rst: |-
+    rst: >-
       The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 64-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -55,7 +55,7 @@ System Speed-of-Light:
       specific accelerator.
     unit: GFLOPs
   MFMA IOPs (Int8):
-    rst: |-
+    rst: >-
       The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
       per second. Note: this does not include any 8-bit integer operations from
       :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
@@ -108,7 +108,7 @@ System Speed-of-Light:
       over the :ref:`total active CU cycles <total-active-cu-cycles>`.
     unit: Instructions per-cycle
   Wavefront Occupancy:
-    rst: |-
+    rst: >-
       The time-averaged number of wavefronts resident on the accelerator over
       the lifetime of the kernel. Note: this metric may be inaccurate for short-running
       kernels (less than 1ms). This is also presented as a percent of the peak theoretical
@@ -153,7 +153,7 @@ System Speed-of-Light:
       peak theoretical bandwidth achievable on the specific accelerator.
     unit: GB/s
   L2-Fabric Read BW:
-    rst: |-
+    rst: >-
       The number of bytes read by the L2 over the :ref:`Infinity Fabric\u2122
       interface <l2-fabric>` per unit time. This is also presented as a percent
       of the peak theoretical bandwidth achievable on the specific accelerator.
@@ -238,19 +238,19 @@ Memory Chart:
     rst: Total number of compute units (CUs) on the accelerator.
     unit: CUs
   VGPR:
-    rst: |-
+    rst: >-
       The number of architected vector general-purpose registers allocated for the
       kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the
       number of VGPRs requested by the compiler due to allocation granularity.
     unit: VGPRs
   SGPR:
-    rst: |-
+    rst: >-
       The number of scalar general-purpose registers allocated for the kernel, see
       :ref:`SALU <desc-salu>`. Note: this may not exactly match the number of
       SGPRs requested by the compiler due to allocation granularity.
     unit: SGPRs
   LDS Allocation:
-    rst: |-
+    rst: >-
       The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory)
       allocated for this kernel. Note: This may also be larger than what was requested
       at compile time due to both allocation granularity and dynamic per-dispatch
@@ -419,28 +419,28 @@ Memory Chart:
     unit: Requests per normalization unit
 Roofline Performance Rates:
   VALU FLOPs (F16):
-    rst: |-
+    rst: >-
       The total 16-bit floating-point operations executed per second on the :ref:`VALU
       <desc-valu>`. This is presented with the value of the peak empirical F16 FLOPs achievable
       on the specific accelerator. Note: this does not include any F16 operations
       from :ref:`MFMA <desc-mfma>` instructions.
     unit: GFLOPs
   VALU FLOPs (F32):
-    rst: |-
+    rst: >-
       The total 32-bit floating-point operations executed per second on the :ref:`VALU
       <desc-valu>`. This is presented with the value of the peak empirical F32 FLOPs achievable
       on the specific accelerator. Note: this does not include any F32 operations
       from :ref:`MFMA <desc-mfma>` instructions.
     unit: GFLOPs
   VALU FLOPs (F64):
-    rst: |-
+    rst: >-
       The total 64-bit floating-point operations executed per second on the :ref:`VALU
       <desc-valu>`. This is presented with the value of the peak empirical F64 FLOPs achievable
       on the specific accelerator. Note: this does not include any F64 operations
       from :ref:`MFMA <desc-mfma>` instructions.
     unit: GFLOPs
   MFMA FLOPs (F64):
-    rst: |-
+    rst: >-
       The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 64-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
@@ -448,7 +448,7 @@ Roofline Performance Rates:
       displayed alongside for comparison.
     unit: GFLOPs
   MFMA FLOPs (F32):
-    rst: |-
+    rst: >-
       The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 32-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
@@ -456,7 +456,7 @@ Roofline Performance Rates:
       displayed alongside for comparison.
     unit: GFLOPs
   MFMA FLOPs (F16):
-    rst: |-
+    rst: >-
       The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 16-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
@@ -464,7 +464,7 @@ Roofline Performance Rates:
       displayed alongside for comparison.
     unit: GFLOPs
   MFMA FLOPs (BF16):
-    rst: |-
+    rst: >-
       The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
       operations executed per second. Note: this does not include any 16-bit brain
       floating point operations from :ref:`VALU <desc-valu>` instructions. The
@@ -472,7 +472,7 @@ Roofline Performance Rates:
       accelerator is displayed alongside for comparison.
     unit: GFLOPs
   MFMA FLOPs (F8):
-    rst: |-
+    rst: >-
       The total number of 8-bit brain floating point :ref:`MFMA <desc-mfma>`
       operations executed per second. Note: this does not include any 8-bit brain
       floating point operations from :ref:`VALU <desc-valu>` instructions. The
@@ -481,7 +481,7 @@ Roofline Performance Rates:
       Instinct MI300 series and later only.
     unit: GFLOPs
   MFMA IOPs (Int8):
-    rst: |-
+    rst: >-
       The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
       per second. Note: this does not include any 8-bit integer operations from
       :ref:`VALU <desc-valu>` instructions. The peak empirically measured INT8 MFMA
@@ -489,7 +489,7 @@ Roofline Performance Rates:
       for comparison.
     unit: GIOPs
   HBM Bandwidth:
-    rst: |-
+    rst: >-
       The total number of bytes read from and written to High-Bandwidth
       Memory (HBM) per second. The peak empirically measured bandwidth achievable
       on the specific accelerator is displayed alongside for comparison.
@@ -520,28 +520,28 @@ Roofline Performance Rates:
     unit: GB/s
 Roofline Plot Points:
   AI HBM:
-    rst: |-
+    rst: >-
       The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
       It is the ratio of total floating-point operations (FLOPs) to total bytes
       transferred between HBM and the L2 cache. This value is used as the x-coordinate
       for the HBM roofline.
     unit: FLOPs/Byte
   AI L2:
-    rst: |-
+    rst: >-
       The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
       of total floating-point operations (FLOPs) to total bytes transferred between
       the L2 cache and the L1 cache. This value is used as the x-coordinate for
       the L2 roofline.
     unit: FLOPs/Byte
   AI L1:
-    rst: |-
+    rst: >-
       The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
       of total floating-point operations (FLOPs) to total bytes transferred between
       the L1 cache and the processing units. This value is used as the x-coordinate
       for the L1 roofline.
     unit: FLOPs/Byte
   Performance (GFLOPs):
-    rst: |-
+    rst: >-
       The overall achieved performance, measured in GigaFLOPs
       per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
       operations divided by the total execution time. This value is used as the y-coordinate
@@ -599,7 +599,7 @@ Workgroup manager utilizations:
       any work.
     unit: Percent
   Scheduler-Pipe Utilization:
-    rst: |-
+    rst: >-
       The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
       in the kernel where the scheduler-pipes were actively doing any work. Note: this
       value is expected to range between 0% and 25%. See :ref:`desc-spi`.
@@ -637,7 +637,7 @@ Workgroup manager utilizations:
     unit: Cycles/wave
 Workgroup Manager - Resource Allocation:
   Not-scheduled Rate (Workgroup Manager):
-    rst: |-
+    rst: >-
       The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
       in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
       due to a bottleneck within the workgroup manager rather than a lack of a
@@ -646,7 +646,7 @@ Workgroup Manager - Resource Allocation:
       description.
     unit: Percent
   Not-scheduled Rate (Scheduler-Pipe):
-    rst: |-
+    rst: >-
       The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
       in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
       due to a bottleneck within the scheduler-pipes rather than a lack of a CU
@@ -655,7 +655,7 @@ Workgroup Manager - Resource Allocation:
       description.
     unit: Percent
   Scheduler-Pipe Stall Rate:
-    rst: |-
+    rst: >-
       The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
       in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
       due to occupancy limitations (like a lack of a CU or :ref:`SIMD <desc-valu>`
@@ -719,7 +719,7 @@ Wavefront Launch Stats:
       block size.
     unit: Work-Items
   Total Wavefronts:
-    rst: |-
+    rst: >-
       The total number of wavefronts launched as part of the kernel dispatch.
       On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront
       size is always 64 work-items. Thus, the total number of wavefronts should
@@ -734,25 +734,25 @@ Wavefront Launch Stats:
       <https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr>`_.
     unit: Wavefronts
   VGPRs:
-    rst: |-
+    rst: >-
       The number of architected vector general-purpose registers allocated for the
       kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the
       number of VGPRs requested by the compiler due to allocation granularity.
     unit: VGPRs
   AGPRs:
-    rst: |-
+    rst: >-
       The number of accumulation vector general-purpose registers allocated
       for the kernel, see :ref:`AGPRs <desc-agprs>`. Note: this may not exactly match
       the number of AGPRs requested by the compiler due to allocation granularity.
     unit: AGPRs
   SGPRs:
-    rst: |-
+    rst: >-
       The number of scalar general-purpose registers allocated for the kernel, see
       :ref:`SALU <desc-salu>`. Note: this may not exactly match the number of
       SGPRs requested by the compiler due to allocation granularity.
     unit: SGPRs
   LDS Allocation:
-    rst: |-
+    rst: >-
       The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory)
       allocated for this kernel. Note: This may also be larger than what was requested
       at compile time due to both allocation granularity and dynamic per-dispatch
@@ -775,7 +775,7 @@ Wavefront Runtime Stats:
       This is averaged over all wavefronts in a kernel dispatch.
     unit: Instructions per wavefront
   Wave Cycles:
-    rst: |-
+    rst: >-
       The number of cycles a wavefront in the kernel dispatch spent resident
       on a compute unit per :ref:`normalization unit <normalization-units>`. This is
       averaged over all wavefronts in a kernel dispatch. Note: this should not
@@ -813,7 +813,7 @@ Wavefront Runtime Stats:
       the total Wave Cycles metric.
     unit: Cycles per normalization unit
   Wavefront Occupancy:
-    rst: |-
+    rst: >-
       The time-averaged number of wavefronts resident on the accelerator over the
       lifetime of the kernel. Note: this metric may be inaccurate for short-running
       kernels (less than 1ms).
@@ -915,7 +915,7 @@ VALU Arithmetic Instruction Mix:
       unit <normalization-units>`.
     unit: Instructions per normalization unit
   Conversion:
-    rst: |-
+    rst: >-
       The total number of type conversion instructions (such as converting data
       to or from F32\u2194F64) issued to the VALU per :ref:`normalization unit
       <normalization-units>`.
@@ -989,14 +989,14 @@ MFMA Arithmetic Instruction Mix:
     unit: Instructions per normalization unit
 Compute Speed-of-Light:
   VALU FLOPs:
-    rst: |-
+    rst: >-
       The total floating-point operations executed per second on the :ref:`VALU
       <desc-valu>`. This is also presented as a percent of the peak theoretical
       FLOPs achievable on the specific accelerator. Note: this does not include
       any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.
     unit: GFLOPs
   VALU IOPs:
-    rst: |-
+    rst: >-
       The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
       This is also presented as a percent of the peak theoretical IOPs achievable
       on the specific accelerator. Note: this does not include any integer operations
@@ -1006,7 +1006,7 @@ Compute Speed-of-Light:
     rst: ''
     unit: Unknown
   MFMA FLOPs (BF16):
-    rst: |-
+    rst: >-
       The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 16-bit brain floating
       point operations from :ref:`VALU <desc-valu>` instructions. This is also
@@ -1014,7 +1014,7 @@ Compute Speed-of-Light:
       on the specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F16):
-    rst: |-
+    rst: >-
       The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 16-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -1022,7 +1022,7 @@ Compute Speed-of-Light:
       specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F32):
-    rst: |-
+    rst: >-
       The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 32-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -1030,7 +1030,7 @@ Compute Speed-of-Light:
       specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F64):
-    rst: |-
+    rst: >-
       The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 64-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -1042,7 +1042,7 @@ Compute Speed-of-Light:
       achievable on the specific accelerator.
     unit: GFLOPs
   MFMA IOPs (INT8):
-    rst: |-
+    rst: >-
       The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
       per second. Note: this does not include any 8-bit integer operations from
       :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
@@ -1131,7 +1131,7 @@ Arithmetic Operations:
       unit <normalization-units>`.
     unit: FLOP per normalization unit
   BF16 OPs:
-    rst: |-
+    rst: >-
       The total number of 16-bit brain floating-point operations executed on
       either the :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
       unit <normalization-units>`. Note: on current CDNA accelerators, the VALU
@@ -1148,7 +1148,7 @@ Arithmetic Operations:
       unit <normalization-units>`.
     unit: FLOP per normalization unit
   INT8 OPs:
-    rst: |-
+    rst: >-
       The total number of 8-bit integer operations executed on either the :ref:`VALU
       <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
       <normalization-units>`. Note: on current CDNA accelerators, the VALU has
@@ -1225,7 +1225,7 @@ LDS Statistics:
       to stalls from non-dword aligned addresses per :ref:`normalization unit <normalization-units>`.
     unit: Cycles per normalization unit
   Mem Violations:
-    rst: |-
+    rst: >-
       The total number of out-of-bounds accesses made to the LDS, per :ref:`normalization
       unit <normalization-units>`. This is unused and expected to be zero in
       most configurations for modern CDNA\u2122 accelerators.
@@ -1242,7 +1242,7 @@ L1I Speed-of-Light:
       over the number of all L1I requests.
     unit: Percent
   L1I-L2 Bandwidth Utilization:
-    rst: |-
+    rst: >-
       The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth
       achieved. Calculated as the ratio of the total number of requests from
       the L1I to the L2 cache over the :ref:`total L1I-L2 interface cycles <total-l1i-cycles>`.
@@ -1349,7 +1349,7 @@ Scalar L1D cache accesses:
     unit: Requests per normalization unit
 Scalar L1D Cache - L2 Interface:
   sL1D-L2 BW:
-    rst: |-
+    rst: >-
       The total number of bytes read from, written to, or atomically updated
       across the sL1D\u2194:doc:`L2 <l2-cache>` interface, divided by total duration.
       Note that sL1D writes and atomics are typically
@@ -1371,7 +1371,7 @@ Scalar L1D Cache - L2 Interface:
       CDNA accelerators.
     unit: Requests per normalization unit
   Stall Cycles:
-    rst: |-
+    rst: >-
       The total number of cycles the sL1D\u2194 :doc:`L2 <l2-cache>` interface
       was stalled, per :ref:`normalization unit <normalization-units>`.
     unit: Cycles per normalization unit
@@ -1681,7 +1681,7 @@ L1 Unified Translation Cache (UTCL1):
       translation not being present in the cache, per :ref:`normalization unit <normalization-units>`.
     unit: unit
   Permission Misses:
-    rst: |-
+    rst: >-
       The total number of translation requests that missed in the UTCL1 due
       to a permission error, per :ref:`normalization unit <normalization-units>`.
       This is unused and expected to be zero in most configurations for modern
diff --git a/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx942_metrics_description.yaml b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx942_metrics_description.yaml
index 4d27ec667a..be9b4719e0 100644
--- a/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx942_metrics_description.yaml
+++ b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx942_metrics_description.yaml
@@ -1,20 +1,20 @@
 System Speed-of-Light:
   VALU FLOPs:
-    rst: |-
+    rst: >-
       The total floating-point operations executed per second on the :ref:`VALU
       <desc-valu>`. This is also presented as a percent of the peak theoretical
       FLOPs achievable on the specific accelerator. Note: this does not include
       any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.
     unit: GFLOPs
   VALU IOPs:
-    rst: |-
+    rst: >-
       The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
       This is also presented as a percent of the peak theoretical IOPs achievable
       on the specific accelerator. Note: this does not include any integer operations
       from :ref:`MFMA <desc-mfma>` instructions.
     unit: GIOPs
   MFMA FLOPs (F8):
-    rst: |-
+    rst: >-
       The total number of 8-bit brain floating point :ref:`MFMA <desc-mfma>`
       operations executed per second. Note: this does not include any 8-bit brain
       floating point operations from :ref:`VALU <desc-valu>` instructions. This
@@ -23,7 +23,7 @@ System Speed-of-Light:
       series and later only.
     unit: GFLOPs
   MFMA FLOPs (BF16):
-    rst: |-
+    rst: >-
       The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
       operations executed per second. Note: this does not include any 16-bit brain
       floating point operations from :ref:`VALU <desc-valu>` instructions. This
@@ -31,7 +31,7 @@ System Speed-of-Light:
       achievable on the specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F16):
-    rst: |-
+    rst: >-
       The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 16-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -39,7 +39,7 @@ System Speed-of-Light:
       specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F32):
-    rst: |-
+    rst: >-
       The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 32-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -47,7 +47,7 @@ System Speed-of-Light:
       specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F64):
-    rst: |-
+    rst: >-
       The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 64-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -55,7 +55,7 @@ System Speed-of-Light:
       specific accelerator.
     unit: GFLOPs
   MFMA IOPs (Int8):
-    rst: |-
+    rst: >-
       The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
       per second. Note: this does not include any 8-bit integer operations from
       :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
@@ -108,7 +108,7 @@ System Speed-of-Light:
       over the :ref:`total active CU cycles <total-active-cu-cycles>`.
     unit: Instructions per-cycle
   Wavefront Occupancy:
-    rst: |-
+    rst: >-
       The time-averaged number of wavefronts resident on the accelerator over
       the lifetime of the kernel. Note: this metric may be inaccurate for short-running
       kernels (less than 1ms). This is also presented as a percent of the peak theoretical
@@ -153,7 +153,7 @@ System Speed-of-Light:
       peak theoretical bandwidth achievable on the specific accelerator.
     unit: GB/s
   L2-Fabric Read BW:
-    rst: |-
+    rst: >-
       The number of bytes read by the L2 over the :ref:`Infinity Fabric\u2122
       interface <l2-fabric>` per unit time. This is also presented as a percent
       of the peak theoretical bandwidth achievable on the specific accelerator.
@@ -238,19 +238,19 @@ Memory Chart:
     rst: Total number of compute units (CUs) on the accelerator.
     unit: CUs
   VGPR:
-    rst: |-
+    rst: >-
       The number of architected vector general-purpose registers allocated for the
       kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the
       number of VGPRs requested by the compiler due to allocation granularity.
     unit: VGPRs
   SGPR:
-    rst: |-
+    rst: >-
       The number of scalar general-purpose registers allocated for the kernel, see
       :ref:`SALU <desc-salu>`. Note: this may not exactly match the number of
       SGPRs requested by the compiler due to allocation granularity.
     unit: SGPRs
   LDS Allocation:
-    rst: |-
+    rst: >-
       The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory)
       allocated for this kernel. Note: This may also be larger than what was requested
       at compile time due to both allocation granularity and dynamic per-dispatch
@@ -419,28 +419,28 @@ Memory Chart:
     unit: Requests per normalization unit
 Roofline Performance Rates:
   VALU FLOPs (F16):
-    rst: |-
+    rst: >-
       The total 16-bit floating-point operations executed per second on the :ref:`VALU
       <desc-valu>`. This is presented with the value of the peak empirical F16 FLOPs achievable
       on the specific accelerator. Note: this does not include any F16 operations
       from :ref:`MFMA <desc-mfma>` instructions.
     unit: GFLOPs
   VALU FLOPs (F32):
-    rst: |-
+    rst: >-
       The total 32-bit floating-point operations executed per second on the :ref:`VALU
       <desc-valu>`. This is presented with the value of the peak empirical F32 FLOPs achievable
       on the specific accelerator. Note: this does not include any F32 operations
       from :ref:`MFMA <desc-mfma>` instructions.
     unit: GFLOPs
   VALU FLOPs (F64):
-    rst: |-
+    rst: >-
       The total 64-bit floating-point operations executed per second on the :ref:`VALU
       <desc-valu>`. This is presented with the value of the peak empirical F64 FLOPs achievable
       on the specific accelerator. Note: this does not include any F64 operations
       from :ref:`MFMA <desc-mfma>` instructions.
     unit: GFLOPs
   MFMA FLOPs (F64):
-    rst: |-
+    rst: >-
       The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 64-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
@@ -448,7 +448,7 @@ Roofline Performance Rates:
       displayed alongside for comparison.
     unit: GFLOPs
   MFMA FLOPs (F32):
-    rst: |-
+    rst: >-
       The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 32-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
@@ -456,7 +456,7 @@ Roofline Performance Rates:
       displayed alongside for comparison.
     unit: GFLOPs
   MFMA FLOPs (F16):
-    rst: |-
+    rst: >-
       The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 16-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
@@ -464,7 +464,7 @@ Roofline Performance Rates:
       displayed alongside for comparison.
     unit: GFLOPs
   MFMA FLOPs (BF16):
-    rst: |-
+    rst: >-
       The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
       operations executed per second. Note: this does not include any 16-bit brain
       floating point operations from :ref:`VALU <desc-valu>` instructions. The
@@ -472,7 +472,7 @@ Roofline Performance Rates:
       accelerator is displayed alongside for comparison.
     unit: GFLOPs
   MFMA FLOPs (F8):
-    rst: |-
+    rst: >-
       The total number of 8-bit brain floating point :ref:`MFMA <desc-mfma>`
       operations executed per second. Note: this does not include any 8-bit brain
       floating point operations from :ref:`VALU <desc-valu>` instructions. The
@@ -481,7 +481,7 @@ Roofline Performance Rates:
       Instinct MI300 series and later only.
     unit: GFLOPs
   MFMA IOPs (Int8):
-    rst: |-
+    rst: >-
       The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
       per second. Note: this does not include any 8-bit integer operations from
       :ref:`VALU <desc-valu>` instructions. The peak empirically measured INT8 MFMA
@@ -489,7 +489,7 @@ Roofline Performance Rates:
       for comparison.
     unit: GIOPs
   HBM Bandwidth:
-    rst: |-
+    rst: >-
       The total number of bytes read from and written to High-Bandwidth
       Memory (HBM) per second. The peak empirically measured bandwidth achievable
       on the specific accelerator is displayed alongside for comparison.
@@ -520,28 +520,28 @@ Roofline Performance Rates:
     unit: GB/s
 Roofline Plot Points:
   AI HBM:
-    rst: |-
+    rst: >-
       The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
       It is the ratio of total floating-point operations (FLOPs) to total bytes
       transferred between HBM and the L2 cache. This value is used as the x-coordinate
       for the HBM roofline.
     unit: FLOPs/Byte
   AI L2:
-    rst: |-
+    rst: >-
       The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
       of total floating-point operations (FLOPs) to total bytes transferred between
       the L2 cache and the L1 cache. This value is used as the x-coordinate for
       the L2 roofline.
     unit: FLOPs/Byte
   AI L1:
-    rst: |-
+    rst: >-
       The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
       of total floating-point operations (FLOPs) to total bytes transferred between
       the L1 cache and the processing units. This value is used as the x-coordinate
       for the L1 roofline.
     unit: FLOPs/Byte
   Performance (GFLOPs):
-    rst: |-
+    rst: >-
       The overall achieved performance, measured in GigaFLOPs
       per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
       operations divided by the total execution time. This value is used as the y-coordinate
@@ -599,7 +599,7 @@ Workgroup manager utilizations:
       any work.
     unit: Percent
   Scheduler-Pipe Utilization:
-    rst: |-
+    rst: >-
       The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
       in the kernel where the scheduler-pipes were actively doing any work. Note: this
       value is expected to range between 0% and 25%. See :ref:`desc-spi`.
@@ -637,7 +637,7 @@ Workgroup manager utilizations:
     unit: Cycles/wave
 Workgroup Manager - Resource Allocation:
   Not-scheduled Rate (Workgroup Manager):
-    rst: |-
+    rst: >-
       The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
       in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
       due to a bottleneck within the workgroup manager rather than a lack of a
@@ -646,7 +646,7 @@ Workgroup Manager - Resource Allocation:
       description.
     unit: Percent
   Not-scheduled Rate (Scheduler-Pipe):
-    rst: |-
+    rst: >-
       The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
       in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
       due to a bottleneck within the scheduler-pipes rather than a lack of a CU
@@ -655,7 +655,7 @@ Workgroup Manager - Resource Allocation:
       description.
     unit: Percent
   Scheduler-Pipe Stall Rate:
-    rst: |-
+    rst: >-
       The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
       in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
       due to occupancy limitations (like a lack of a CU or :ref:`SIMD <desc-valu>`
@@ -719,7 +719,7 @@ Wavefront Launch Stats:
       block size.
     unit: Work-Items
   Total Wavefronts:
-    rst: |-
+    rst: >-
       The total number of wavefronts launched as part of the kernel dispatch.
       On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront
       size is always 64 work-items. Thus, the total number of wavefronts should
@@ -734,25 +734,25 @@ Wavefront Launch Stats:
       <https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr>`_.
     unit: Wavefronts
   VGPRs:
-    rst: |-
+    rst: >-
       The number of architected vector general-purpose registers allocated for the
       kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the
       number of VGPRs requested by the compiler due to allocation granularity.
     unit: VGPRs
   AGPRs:
-    rst: |-
+    rst: >-
       The number of accumulation vector general-purpose registers allocated
       for the kernel, see :ref:`AGPRs <desc-agprs>`. Note: this may not exactly match
       the number of AGPRs requested by the compiler due to allocation granularity.
     unit: AGPRs
   SGPRs:
-    rst: |-
+    rst: >-
       The number of scalar general-purpose registers allocated for the kernel, see
       :ref:`SALU <desc-salu>`. Note: this may not exactly match the number of
       SGPRs requested by the compiler due to allocation granularity.
     unit: SGPRs
   LDS Allocation:
-    rst: |-
+    rst: >-
       The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory)
       allocated for this kernel. Note: This may also be larger than what was requested
       at compile time due to both allocation granularity and dynamic per-dispatch
@@ -775,7 +775,7 @@ Wavefront Runtime Stats:
       This is averaged over all wavefronts in a kernel dispatch.
     unit: Instructions per wavefront
   Wave Cycles:
-    rst: |-
+    rst: >-
       The number of cycles a wavefront in the kernel dispatch spent resident
       on a compute unit per :ref:`normalization unit <normalization-units>`. This is
       averaged over all wavefronts in a kernel dispatch. Note: this should not
@@ -813,7 +813,7 @@ Wavefront Runtime Stats:
       the total Wave Cycles metric.
     unit: Cycles per normalization unit
   Wavefront Occupancy:
-    rst: |-
+    rst: >-
       The time-averaged number of wavefronts resident on the accelerator over the
       lifetime of the kernel. Note: this metric may be inaccurate for short-running
       kernels (less than 1ms).
@@ -915,7 +915,7 @@ VALU Arithmetic Instruction Mix:
       unit <normalization-units>`.
     unit: Instructions per normalization unit
   Conversion:
-    rst: |-
+    rst: >-
       The total number of type conversion instructions (such as converting data
       to or from F32\u2194F64) issued to the VALU per :ref:`normalization unit
       <normalization-units>`.
@@ -989,14 +989,14 @@ MFMA Arithmetic Instruction Mix:
     unit: Instructions per normalization unit
 Compute Speed-of-Light:
   VALU FLOPs:
-    rst: |-
+    rst: >-
       The total floating-point operations executed per second on the :ref:`VALU
       <desc-valu>`. This is also presented as a percent of the peak theoretical
       FLOPs achievable on the specific accelerator. Note: this does not include
       any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.
     unit: GFLOPs
   VALU IOPs:
-    rst: |-
+    rst: >-
       The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
       This is also presented as a percent of the peak theoretical IOPs achievable
       on the specific accelerator. Note: this does not include any integer operations
@@ -1006,7 +1006,7 @@ Compute Speed-of-Light:
     rst: ''
     unit: Unknown
   MFMA FLOPs (BF16):
-    rst: |-
+    rst: >-
       The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 16-bit brain floating
       point operations from :ref:`VALU <desc-valu>` instructions. This is also
@@ -1014,7 +1014,7 @@ Compute Speed-of-Light:
       on the specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F16):
-    rst: |-
+    rst: >-
       The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 16-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -1022,7 +1022,7 @@ Compute Speed-of-Light:
       specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F32):
-    rst: |-
+    rst: >-
       The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 32-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -1030,7 +1030,7 @@ Compute Speed-of-Light:
       specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F64):
-    rst: |-
+    rst: >-
       The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 64-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -1042,7 +1042,7 @@ Compute Speed-of-Light:
       achievable on the specific accelerator.
     unit: GFLOPs
   MFMA IOPs (INT8):
-    rst: |-
+    rst: >-
       The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
       per second. Note: this does not include any 8-bit integer operations from
       :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
@@ -1131,7 +1131,7 @@ Arithmetic Operations:
       unit <normalization-units>`.
     unit: FLOP per normalization unit
   BF16 OPs:
-    rst: |-
+    rst: >-
       The total number of 16-bit brain floating-point operations executed on
       either the :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
       unit <normalization-units>`. Note: on current CDNA accelerators, the VALU
@@ -1148,7 +1148,7 @@ Arithmetic Operations:
       unit <normalization-units>`.
     unit: FLOP per normalization unit
   INT8 OPs:
-    rst: |-
+    rst: >-
       The total number of 8-bit integer operations executed on either the :ref:`VALU
       <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
       <normalization-units>`. Note: on current CDNA accelerators, the VALU has
@@ -1225,7 +1225,7 @@ LDS Statistics:
       to stalls from non-dword aligned addresses per :ref:`normalization unit <normalization-units>`.
     unit: Cycles per normalization unit
   Mem Violations:
-    rst: |-
+    rst: >-
       The total number of out-of-bounds accesses made to the LDS, per :ref:`normalization
       unit <normalization-units>`. This is unused and expected to be zero in
       most configurations for modern CDNA\u2122 accelerators.
@@ -1242,7 +1242,7 @@ L1I Speed-of-Light:
       over the number of all L1I requests.
     unit: Percent
   L1I-L2 Bandwidth Utilization:
-    rst: |-
+    rst: >-
       The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth
       achieved. Calculated as the ratio of the total number of requests from
       the L1I to the L2 cache over the :ref:`total L1I-L2 interface cycles <total-l1i-cycles>`.
@@ -1349,7 +1349,7 @@ Scalar L1D cache accesses:
     unit: Requests per normalization unit
 Scalar L1D Cache - L2 Interface:
   sL1D-L2 BW:
-    rst: |-
+    rst: >-
       The total number of bytes read from, written to, or atomically updated
       across the sL1D\u2194:doc:`L2 <l2-cache>` interface, divided by total duration.
       Note that sL1D writes and atomics are typically
@@ -1371,7 +1371,7 @@ Scalar L1D Cache - L2 Interface:
       CDNA accelerators.
     unit: Requests per normalization unit
   Stall Cycles:
-    rst: |-
+    rst: >-
       The total number of cycles the sL1D\u2194 :doc:`L2 <l2-cache>` interface
       was stalled, per :ref:`normalization unit <normalization-units>`.
     unit: Cycles per normalization unit
@@ -1681,7 +1681,7 @@ L1 Unified Translation Cache (UTCL1):
       translation not being present in the cache, per :ref:`normalization unit <normalization-units>`.
     unit: unit
   Permission Misses:
-    rst: |-
+    rst: >-
       The total number of translation requests that missed in the UTCL1 due
       to a permission error, per :ref:`normalization unit <normalization-units>`.
       This is unused and expected to be zero in most configurations for modern
diff --git a/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx950_metrics_description.yaml b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx950_metrics_description.yaml
index d0a1898da5..f2fb71f054 100644
--- a/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx950_metrics_description.yaml
+++ b/projects/rocprofiler-compute/tools/per_arch_metric_definitions/gfx950_metrics_description.yaml
@@ -1,20 +1,20 @@
 System Speed-of-Light:
   VALU FLOPs:
-    rst: |-
+    rst: >-
       The total floating-point operations executed per second on the :ref:`VALU
       <desc-valu>`. This is also presented as a percent of the peak theoretical
       FLOPs achievable on the specific accelerator. Note: this does not include
       any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.
     unit: GFLOPs
   VALU IOPs:
-    rst: |-
+    rst: >-
       The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
       This is also presented as a percent of the peak theoretical IOPs achievable
       on the specific accelerator. Note: this does not include any integer operations
       from :ref:`MFMA <desc-mfma>` instructions.
     unit: GIOPs
   MFMA FLOPs (F8):
-    rst: |-
+    rst: >-
       The total number of 8-bit brain floating point :ref:`MFMA <desc-mfma>`
       operations executed per second. Note: this does not include any 8-bit brain
       floating point operations from :ref:`VALU <desc-valu>` instructions. This
@@ -23,7 +23,7 @@ System Speed-of-Light:
       series and later only.
     unit: GFLOPs
   MFMA FLOPs (BF16):
-    rst: |-
+    rst: >-
       The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
       operations executed per second. Note: this does not include any 16-bit brain
       floating point operations from :ref:`VALU <desc-valu>` instructions. This
@@ -31,7 +31,7 @@ System Speed-of-Light:
       achievable on the specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F16):
-    rst: |-
+    rst: >-
       The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 16-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -39,7 +39,7 @@ System Speed-of-Light:
       specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F32):
-    rst: |-
+    rst: >-
       The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 32-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -47,7 +47,7 @@ System Speed-of-Light:
       specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F64):
-    rst: |-
+    rst: >-
       The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 64-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -58,7 +58,7 @@ System Speed-of-Light:
     rst: ''
     unit: Unknown
   MFMA IOPs (Int8):
-    rst: |-
+    rst: >-
       The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
       per second. Note: this does not include any 8-bit integer operations from
       :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
@@ -111,7 +111,7 @@ System Speed-of-Light:
       over the :ref:`total active CU cycles <total-active-cu-cycles>`.
     unit: Instructions per-cycle
   Wavefront Occupancy:
-    rst: |-
+    rst: >-
       The time-averaged number of wavefronts resident on the accelerator over
       the lifetime of the kernel. Note: this metric may be inaccurate for short-running
       kernels (less than 1ms). This is also presented as a percent of the peak theoretical
@@ -156,7 +156,7 @@ System Speed-of-Light:
       peak theoretical bandwidth achievable on the specific accelerator.
     unit: GB/s
   L2-Fabric Read BW:
-    rst: |-
+    rst: >-
       The number of bytes read by the L2 over the :ref:`Infinity Fabric\u2122
       interface <l2-fabric>` per unit time. This is also presented as a percent
       of the peak theoretical bandwidth achievable on the specific accelerator.
@@ -241,19 +241,19 @@ Memory Chart:
     rst: Total number of compute units (CUs) on the accelerator.
     unit: CUs
   VGPR:
-    rst: |-
+    rst: >-
       The number of architected vector general-purpose registers allocated for the
       kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the
       number of VGPRs requested by the compiler due to allocation granularity.
     unit: VGPRs
   SGPR:
-    rst: |-
+    rst: >-
       The number of scalar general-purpose registers allocated for the kernel, see
       :ref:`SALU <desc-salu>`. Note: this may not exactly match the number of
       SGPRs requested by the compiler due to allocation granularity.
     unit: SGPRs
   LDS Allocation:
-    rst: |-
+    rst: >-
       The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory)
       allocated for this kernel. Note: This may also be larger than what was requested
       at compile time due to both allocation granularity and dynamic per-dispatch
@@ -432,28 +432,28 @@ Memory Chart:
     unit: Requests per normalization unit
 Roofline Performance Rates:
   VALU FLOPs (F16):
-    rst: |-
+    rst: >-
       The total 16-bit floating-point operations executed per second on the :ref:`VALU
       <desc-valu>`. This is presented with the value of the peak empirical F16 FLOPs achievable
       on the specific accelerator. Note: this does not include any F16 operations
       from :ref:`MFMA <desc-mfma>` instructions.
     unit: GFLOPs
   VALU FLOPs (F32):
-    rst: |-
+    rst: >-
       The total 32-bit floating-point operations executed per second on the :ref:`VALU
       <desc-valu>`. This is presented with the value of the peak empirical F32 FLOPs achievable
       on the specific accelerator. Note: this does not include any F32 operations
       from :ref:`MFMA <desc-mfma>` instructions.
     unit: GFLOPs
   VALU FLOPs (F64):
-    rst: |-
+    rst: >-
       The total 64-bit floating-point operations executed per second on the :ref:`VALU
       <desc-valu>`. This is presented with the value of the peak empirical F64 FLOPs achievable
       on the specific accelerator. Note: this does not include any F64 operations
       from :ref:`MFMA <desc-mfma>` instructions.
     unit: GFLOPs
   MFMA FLOPs (F64):
-    rst: |-
+    rst: >-
       The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 64-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
@@ -461,7 +461,7 @@ Roofline Performance Rates:
       displayed alongside for comparison.
     unit: GFLOPs
   MFMA FLOPs (F32):
-    rst: |-
+    rst: >-
       The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 32-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
@@ -469,7 +469,7 @@ Roofline Performance Rates:
       displayed alongside for comparison.
     unit: GFLOPs
   MFMA FLOPs (F16):
-    rst: |-
+    rst: >-
       The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 16-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
@@ -477,7 +477,7 @@ Roofline Performance Rates:
       displayed alongside for comparison.
     unit: GFLOPs
   MFMA FLOPs (BF16):
-    rst: |-
+    rst: >-
       The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
       operations executed per second. Note: this does not include any 16-bit brain
       floating point operations from :ref:`VALU <desc-valu>` instructions. The
@@ -485,7 +485,7 @@ Roofline Performance Rates:
       accelerator is displayed alongside for comparison.
     unit: GFLOPs
   MFMA FLOPs (F8):
-    rst: |-
+    rst: >-
       The total number of 8-bit brain floating point :ref:`MFMA <desc-mfma>`
       operations executed per second. Note: this does not include any 8-bit brain
       floating point operations from :ref:`VALU <desc-valu>` instructions. The
@@ -494,7 +494,7 @@ Roofline Performance Rates:
       Instinct MI300 series and later only.
     unit: GFLOPs
   MFMA FLOPs (F6F4):
-    rst: |-
+    rst: >-
       The total number of 4-bit and 6-bit floating point :ref:`MFMA <desc-mfma>`
       operations executed per second. Note: this does not include any floating point
       operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
@@ -503,7 +503,7 @@ Roofline Performance Rates:
       series (gfx950) and later only.
     unit: GFLOPs
   MFMA IOPs (Int8):
-    rst: |-
+    rst: >-
       The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
       per second. Note: this does not include any 8-bit integer operations from
       :ref:`VALU <desc-valu>` instructions. The peak empirically measured INT8 MFMA
@@ -511,7 +511,7 @@ Roofline Performance Rates:
       for comparison.
     unit: GIOPs
   HBM Bandwidth:
-    rst: |-
+    rst: >-
       The total number of bytes read from and written to High-Bandwidth
       Memory (HBM) per second. The peak empirically measured bandwidth achievable
       on the specific accelerator is displayed alongside for comparison.
@@ -542,28 +542,28 @@ Roofline Performance Rates:
     unit: GB/s
 Roofline Plot Points:
   AI HBM:
-    rst: |-
+    rst: >-
       The Arithmetic Intensity (AI) relative to High-Bandwidth Memory (HBM).
       It is the ratio of total floating-point operations (FLOPs) to total bytes
       transferred between HBM and the L2 cache. This value is used as the x-coordinate
       for the HBM roofline.
     unit: FLOPs/Byte
   AI L2:
-    rst: |-
+    rst: >-
       The Arithmetic Intensity (AI) relative to the L2 Cache. It is the ratio
       of total floating-point operations (FLOPs) to total bytes transferred between
       the L2 cache and the L1 cache. This value is used as the x-coordinate for
       the L2 roofline.
     unit: FLOPs/Byte
   AI L1:
-    rst: |-
+    rst: >-
       The Arithmetic Intensity (AI) relative to the L1 Cache. It is the ratio
       of total floating-point operations (FLOPs) to total bytes transferred between
       the L1 cache and the processing units. This value is used as the x-coordinate
       for the L1 roofline.
     unit: FLOPs/Byte
   Performance (GFLOPs):
-    rst: |-
+    rst: >-
       The overall achieved performance, measured in GigaFLOPs
       per second (GFLOP/s). This is calculated as the sum of all VALU and MFMA floating-point
       operations divided by the total execution time. This value is used as the y-coordinate
@@ -633,7 +633,7 @@ Workgroup manager utilizations:
       any work.
     unit: Percent
   Scheduler-Pipe Utilization:
-    rst: |-
+    rst: >-
       The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
       in the kernel where the scheduler-pipes were actively doing any work. Note: this
       value is expected to range between 0% and 25%. See :ref:`desc-spi`.
@@ -674,7 +674,7 @@ Workgroup manager utilizations:
     unit: Cycles/wave
 Workgroup Manager - Resource Allocation:
   Not-scheduled Rate (Workgroup Manager):
-    rst: |-
+    rst: >-
       The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
       in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
       due to a bottleneck within the workgroup manager rather than a lack of a
@@ -683,7 +683,7 @@ Workgroup Manager - Resource Allocation:
       description.
     unit: Percent
   Not-scheduled Rate (Scheduler-Pipe):
-    rst: |-
+    rst: >-
       The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
       in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
       due to a bottleneck within the scheduler-pipes rather than a lack of a CU
@@ -695,7 +695,7 @@ Workgroup Manager - Resource Allocation:
     rst: ''
     unit: Unknown
   Scheduler-Pipe Stall Rate:
-    rst: |-
+    rst: >-
       The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>`
       in the kernel where a workgroup could not be scheduled to a :doc:`CU <compute-unit>`
       due to occupancy limitations (like a lack of a CU or :ref:`SIMD <desc-valu>`
@@ -759,7 +759,7 @@ Wavefront Launch Stats:
       block size.
     unit: Work-Items
   Total Wavefronts:
-    rst: |-
+    rst: >-
       The total number of wavefronts launched as part of the kernel dispatch.
       On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront
       size is always 64 work-items. Thus, the total number of wavefronts should
@@ -774,25 +774,25 @@ Wavefront Launch Stats:
       <https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr>`_.
     unit: Wavefronts
   VGPRs:
-    rst: |-
+    rst: >-
       The number of architected vector general-purpose registers allocated for the
       kernel, see :ref:`VALU <desc-valu>`. Note: this may not exactly match the
       number of VGPRs requested by the compiler due to allocation granularity.
     unit: VGPRs
   AGPRs:
-    rst: |-
+    rst: >-
       The number of accumulation vector general-purpose registers allocated
       for the kernel, see :ref:`AGPRs <desc-agprs>`. Note: this may not exactly match
       the number of AGPRs requested by the compiler due to allocation granularity.
     unit: AGPRs
   SGPRs:
-    rst: |-
+    rst: >-
       The number of scalar general-purpose registers allocated for the kernel, see
       :ref:`SALU <desc-salu>`. Note: this may not exactly match the number of
       SGPRs requested by the compiler due to allocation granularity.
     unit: SGPRs
   LDS Allocation:
-    rst: |-
+    rst: >-
       The number of bytes of :doc:`LDS <local-data-share>` memory (or, shared memory)
       allocated for this kernel. Note: This may also be larger than what was requested
       at compile time due to both allocation granularity and dynamic per-dispatch
@@ -815,7 +815,7 @@ Wavefront Runtime Stats:
       This is averaged over all wavefronts in a kernel dispatch.
     unit: Instructions per wavefront
   Wave Cycles:
-    rst: |-
+    rst: >-
       The number of cycles a wavefront in the kernel dispatch spent resident
       on a compute unit per :ref:`normalization unit <normalization-units>`. This is
       averaged over all wavefronts in a kernel dispatch. Note: this should not
@@ -853,7 +853,7 @@ Wavefront Runtime Stats:
       the total Wave Cycles metric.
     unit: Cycles per normalization unit
   Wavefront Occupancy:
-    rst: |-
+    rst: >-
       The time-averaged number of wavefronts resident on the accelerator over the
       lifetime of the kernel. Note: this metric may be inaccurate for short-running
       kernels (less than 1ms).
@@ -955,7 +955,7 @@ VALU Arithmetic Instruction Mix:
       unit <normalization-units>`.
     unit: Instructions per normalization unit
   Conversion:
-    rst: |-
+    rst: >-
       The total number of type conversion instructions (such as converting data
       to or from F32\u2194F64) issued to the VALU per :ref:`normalization unit
       <normalization-units>`.
@@ -1035,14 +1035,14 @@ MFMA Arithmetic Instruction Mix:
     unit: Unknown
 Compute Speed-of-Light:
   VALU FLOPs:
-    rst: |-
+    rst: >-
       The total floating-point operations executed per second on the :ref:`VALU
       <desc-valu>`. This is also presented as a percent of the peak theoretical
       FLOPs achievable on the specific accelerator. Note: this does not include
       any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.
     unit: GFLOPs
   VALU IOPs:
-    rst: |-
+    rst: >-
       The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
       This is also presented as a percent of the peak theoretical IOPs achievable
       on the specific accelerator. Note: this does not include any integer operations
@@ -1052,7 +1052,7 @@ Compute Speed-of-Light:
     rst: ''
     unit: Unknown
   MFMA FLOPs (BF16):
-    rst: |-
+    rst: >-
       The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 16-bit brain floating
       point operations from :ref:`VALU <desc-valu>` instructions. This is also
@@ -1060,7 +1060,7 @@ Compute Speed-of-Light:
       on the specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F16):
-    rst: |-
+    rst: >-
       The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 16-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -1068,7 +1068,7 @@ Compute Speed-of-Light:
       specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F32):
-    rst: |-
+    rst: >-
       The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 32-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -1076,7 +1076,7 @@ Compute Speed-of-Light:
       specific accelerator.
     unit: GFLOPs
   MFMA FLOPs (F64):
-    rst: |-
+    rst: >-
       The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
       executed per second. Note: this does not include any 64-bit floating point
       operations from :ref:`VALU <desc-valu>` instructions. This is also presented
@@ -1091,7 +1091,7 @@ Compute Speed-of-Light:
     rst: ''
     unit: Unknown
   MFMA IOPs (INT8):
-    rst: |-
+    rst: >-
       The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
       per second. Note: this does not include any 8-bit integer operations from
       :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
@@ -1183,7 +1183,7 @@ Arithmetic Operations:
       unit <normalization-units>`.
     unit: FLOP per normalization unit
   BF16 OPs:
-    rst: |-
+    rst: >-
       The total number of 16-bit brain floating-point operations executed on
       either the :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization
       unit <normalization-units>`. Note: on current CDNA accelerators, the VALU
@@ -1203,7 +1203,7 @@ Arithmetic Operations:
     rst: ''
     unit: Unknown
   INT8 OPs:
-    rst: |-
+    rst: >-
       The total number of 8-bit integer operations executed on either the :ref:`VALU
       <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per :ref:`normalization unit
       <normalization-units>`. Note: on current CDNA accelerators, the VALU has
@@ -1298,7 +1298,7 @@ LDS Statistics:
       to stalls from non-dword aligned addresses per :ref:`normalization unit <normalization-units>`.
     unit: Cycles per normalization unit
   Mem Violations:
-    rst: |-
+    rst: >-
       The total number of out-of-bounds accesses made to the LDS, per :ref:`normalization
       unit <normalization-units>`. This is unused and expected to be zero in
       most configurations for modern CDNA\u2122 accelerators.
@@ -1321,7 +1321,7 @@ L1I Speed-of-Light:
       over the number of all L1I requests.
     unit: Percent
   L1I-L2 Bandwidth Utilization:
-    rst: |-
+    rst: >-
       The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth
       achieved. Calculated as the ratio of the total number of requests from
       the L1I to the L2 cache over the :ref:`total L1I-L2 interface cycles <total-l1i-cycles>`.
@@ -1428,7 +1428,7 @@ Scalar L1D cache accesses:
     unit: Requests per normalization unit
 Scalar L1D Cache - L2 Interface:
   sL1D-L2 BW:
-    rst: |-
+    rst: >-
       The total number of bytes read from, written to, or atomically updated
       across the sL1D\u2194:doc:`L2 <l2-cache>` interface, divided by total duration.
       Note that sL1D writes and atomics are typically
@@ -1450,7 +1450,7 @@ Scalar L1D Cache - L2 Interface:
       CDNA accelerators.
     unit: Requests per normalization unit
   Stall Cycles:
-    rst: |-
+    rst: >-
       The total number of cycles the sL1D\u2194 :doc:`L2 <l2-cache>` interface
       was stalled, per :ref:`normalization unit <normalization-units>`.
     unit: Cycles per normalization unit
@@ -1818,7 +1818,7 @@ L1 Unified Translation Cache (UTCL1):
     rst: ''
     unit: Unknown
   Permission Misses:
-    rst: |-
+    rst: >-
       The total number of translation requests that missed in the UTCL1 due
       to a permission error, per :ref:`normalization unit <normalization-units>`.
       This is unused and expected to be zero in most configurations for modern
@@ -1968,7 +1968,7 @@ L2-Fabric interface metrics:
       with return value) was returned to the L2.
     unit: Cycles
   Read Stall:
-    rst: |-
+    rst: >-
       The ratio of the total number of cycles the L2-Fabric interface was stalled
       on a read request to any destination (local HBM, remote PCIe\xAE connected
       accelerator or CPU, or remote Infinity Fabric connected accelerator [#inf]_