From 6902b12e658084570ac8a972151a8e04ef2f2f49 Mon Sep 17 00:00:00 2001
From: vedithal-amd <Vignesh.Edithal@amd.com>
Date: Fri, 1 Aug 2025 13:56:29 -0400
Subject: [PATCH] Fix metrics description (#853)

* Fix ordering of metrics description
---
 docs/data/metrics_description.yaml | 1554 ++++++++++++++--------------
 utils/autogen_hash.yaml            |    2 +-
 utils/split_config.py              |    2 +-
 3 files changed, 779 insertions(+), 779 deletions(-)
diff --git a/docs/data/metrics_description.yaml b/docs/data/metrics_description.yaml
index ae027f63cf..822119dc28 100644
--- a/docs/data/metrics_description.yaml
+++ b/docs/data/metrics_description.yaml
@@ -1,19 +1,10 @@
 # AUTOGENERATED FILE. Only edit for testing purposes, not for development. Generated from utils/unified_config.yaml. Generated by utils/split_config.py
 Wavefront launch stats:
-  Scratch Allocation:
-    rst: The number of bytes of :ref:`scratch memory <memory-spaces>` requested  per
-      work-item for this kernel. Scratch memory is used for stack memory  on the accelerator,
-      as well as for register spills and restores.
-    unit: Bytes per work-item
-  Saved Wavefronts:
-    rst: The total number of wavefronts saved at a context-save. See  `cwsr_enable
-      <https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr>`_.
-    unit: Wavefronts
-  VGPRs:
-    rst: 'The number of architected vector general-purpose registers allocated for  the
-      kernel, see :ref:`VALU <desc-valu>`.  Note: this may not exactly  match the
-      number of VGPRs requested by the compiler due to allocation  granularity.'
-    unit: VGPRs
+  AGPRs:
+    rst: 'The number of accumulation vector general-purpose registers allocated for  the
+      kernel, see :ref:`AGPRs <desc-agprs>`.  Note: this may not exactly  match the
+      number of AGPRs requested by the compiler due to allocation  granularity.'
+    unit: AGPRs
   Grid Size:
     rst: The total number of work-items (or, threads) launched as a part of the kernel
       dispatch. In HIP, this is equivalent to the total grid size multiplied by the
@@ -29,32 +20,37 @@ Wavefront launch stats:
     rst: The total number of wavefronts restored from a context-save. See  `cwsr_enable
       <https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr>`_.
     unit: Wavefronts
-  AGPRs:
-    rst: 'The number of accumulation vector general-purpose registers allocated for  the
-      kernel, see :ref:`AGPRs <desc-agprs>`.  Note: this may not exactly  match the
-      number of AGPRs requested by the compiler due to allocation  granularity.'
-    unit: AGPRs
   SGPRs:
     rst: 'The number of scalar general-purpose registers allocated for the kernel,  see
       :ref:`SALU <desc-salu>`.  Note: this may not exactly match the number  of SGPRs
       requested by the compiler due to allocation granularity. plain'
     unit: SGPRs
+  Saved Wavefronts:
+    rst: The total number of wavefronts saved at a context-save. See  `cwsr_enable
+      <https://docs.kernel.org/gpu/amdgpu/module-parameters.html?highlight=cwsr>`_.
+    unit: Wavefronts
+  Scratch Allocation:
+    rst: The number of bytes of :ref:`scratch memory <memory-spaces>` requested  per
+      work-item for this kernel. Scratch memory is used for stack memory  on the accelerator,
+      as well as for register spills and restores.
+    unit: Bytes per work-item
   Total Wavefronts:
     rst: "The total number of wavefronts launched as part of the kernel dispatch.\
       \ On AMD Instinct\u2122 CDNA\u2122 accelerators and GCN\u2122 GPUs, the wavefront\
       \ size is always 64 work-items. Thus, the total number of wavefronts should\
       \ be equivalent to the ceiling of grid size divided by 64."
     unit: Wavefronts
+  VGPRs:
+    rst: 'The number of architected vector general-purpose registers allocated for  the
+      kernel, see :ref:`VALU <desc-valu>`.  Note: this may not exactly  match the
+      number of VGPRs requested by the compiler due to allocation  granularity.'
+    unit: VGPRs
   Workgroup Size:
     rst: The total number of work-items (or, threads) in each workgroup (or, block)
       launched as part of the kernel dispatch. In HIP, this is equivalent to the total
       block size.
     unit: Work-Items
 Wavefront runtime stats:
-  Instructions per wavefront:
-    rst: The average number of instructions (of all types) executed per wavefront.
-      This is averaged over all wavefronts in a kernel dispatch.
-    unit: Instructions per wavefront
   Active Cycles:
     rst: The average number of cycles a wavefront in the kernel dispatch was  actively
       executing instructions per  :ref:`normalization unit <normalization-units>`.
@@ -65,28 +61,6 @@ Wavefront runtime stats:
       The  sum of this metric, Issue Wait Cycles and Active Wait Cycles should be  equal
       to the total Wave Cycles metric.
     unit: Cycles per normalization unit
-  Wave Cycles:
-    rst: 'The number of cycles a wavefront in the kernel dispatch spent resident on  a
-      compute unit per :ref:`normalization unit <normalization-units>`. This  is averaged
-      over all wavefronts in a kernel dispatch.  Note: this should  not be directly
-      compared to the kernel cycles above.'
-    unit: Cycles per normalization unit
-  Wavefront Occupancy:
-    rst: 'The time-averaged number of wavefronts resident on the accelerator over  the
-      lifetime of the kernel. Note: this metric may be inaccurate for  short-running
-      kernels (less than 1ms).'
-    unit: Wavefronts
-  Issue Wait Cycles:
-    rst: The number of cycles a wavefront in the kernel dispatch was unable to  issue
-      an instruction for any reason (e.g., execution pipe back-pressure,  arbitration
-      loss, etc.) per  :ref:`normalization unit <normalization-units>`.  This counter
-      is  incremented at every cycle by *all* wavefronts on a CU unable to issue an  instruction.  As
-      such, it is most useful to get a sense of how waves were  spending their time,
-      rather than identification of a precise limiter  because another wave could
-      be actively executing while a wave is issue  stalled.  The sum of this metric,
-      Dependency Wait Cycles and Active  Cycles should be equal to the total Wave
-      Cycles metric.
-    unit: Cycles per normalization unit
   Dependency Wait Cycles:
     rst: The number of cycles a wavefront in the kernel dispatch stalled waiting  on
       memory of any kind (e.g., instruction fetch, vector or scalar memory,  etc.)
@@ -97,17 +71,50 @@ Wavefront runtime stats:
       be actively executing while a wave is stalled.  The sum of this metric, Issue
       Wait Cycles and Active Cycles should be  equal to the total Wave Cycles metric.
     unit: Cycles per normalization unit
+  Instructions per wavefront:
+    rst: The average number of instructions (of all types) executed per wavefront.
+      This is averaged over all wavefronts in a kernel dispatch.
+    unit: Instructions per wavefront
+  Issue Wait Cycles:
+    rst: The number of cycles a wavefront in the kernel dispatch was unable to  issue
+      an instruction for any reason (e.g., execution pipe back-pressure,  arbitration
+      loss, etc.) per  :ref:`normalization unit <normalization-units>`.  This counter
+      is  incremented at every cycle by *all* wavefronts on a CU unable to issue an  instruction.  As
+      such, it is most useful to get a sense of how waves were  spending their time,
+      rather than identification of a precise limiter  because another wave could
+      be actively executing while a wave is issue  stalled.  The sum of this metric,
+      Dependency Wait Cycles and Active  Cycles should be equal to the total Wave
+      Cycles metric.
+    unit: Cycles per normalization unit
   Kernel Time:
     rst: The total duration of the executed kernel.
     unit: Nanoseconds
   Kernel Time (Cycles):
     rst: The total duration of the executed kernel in cycles.
     unit: Cycles
+  Wave Cycles:
+    rst: 'The number of cycles a wavefront in the kernel dispatch spent resident on  a
+      compute unit per :ref:`normalization unit <normalization-units>`. This  is averaged
+      over all wavefronts in a kernel dispatch.  Note: this should  not be directly
+      compared to the kernel cycles above.'
+    unit: Cycles per normalization unit
+  Wavefront Occupancy:
+    rst: 'The time-averaged number of wavefronts resident on the accelerator over  the
+      lifetime of the kernel. Note: this metric may be inaccurate for  short-running
+      kernels (less than 1ms).'
+    unit: Wavefronts
 Overall instruction mix:
+  Branch:
+    rst: The total number of branch operations issued. These typically consist of  jump
+      or branch operations and are used to implement control flow.
+    unit: Instructions
   LDS:
     rst: The total number of LDS (also known as shared memory) operations issued.  These
       include loads, stores, atomics, and HIP's ``__shfl`` operations.
     unit: Instructions
+  MFMA:
+    rst: The total number of matrix fused multiply-add instructions issued.
+    unit: Instructions
   SALU:
     rst: The total number of scalar arithmetic logic unit (SALU) operations issued.
       Typically these are used for address calculations, literal constants, and other
@@ -115,14 +122,6 @@ Overall instruction mix:
       (SMEM) operations are issued by the SALU, they are counted separately in this
       section.
     unit: Instructions
-  MFMA:
-    rst: The total number of matrix fused multiply-add instructions issued.
-    unit: Instructions
-  VMEM:
-    rst: The total number of vector memory operations issued. These include most  loads,
-      stores and atomic operations and all accesses to  :ref:`generic, global, private
-      and texture <memory-spaces>` memory.
-    unit: Instructions
   SMEM:
     rst: The total number of scalar memory (SMEM) operations issued. These are  typically
       used for loading kernel arguments, base-pointers and loads  from HIP's ``__constant__``
@@ -135,27 +134,51 @@ Overall instruction mix:
       operations, non-uniform  address calculations, transcendental operations, integer
       operations,  shifts, conditional evaluation, etc.
     unit: Instructions
-  Branch:
-    rst: The total number of branch operations issued. These typically consist of  jump
-      or branch operations and are used to implement control flow.
+  VMEM:
+    rst: The total number of vector memory operations issued. These include most  loads,
+      stores and atomic operations and all accesses to  :ref:`generic, global, private
+      and texture <memory-spaces>` memory.
     unit: Instructions
 VALU arithmetic instruction mix:
+  Conversion:
+    rst: "The total number of type conversion instructions (such as converting data\
+      \  to or from F32\u2194F64) issued to the VALU per  :ref:`normalization unit\
+      \ <normalization-units>`."
+    unit: Instructions per normalization unit
   F16-ADD:
     rst: The total number of addition instructions operating on 16-bit  floating-point
       operands issued to the VALU per  :ref:`normalization unit <normalization-units>`.
     unit: Instructions per normalization unit
+  F16-FMA:
+    rst: The total number of fused multiply-add instructions operating on 16-bit  floating-point
+      operands issued to the VALU per  :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
   F16-MUL:
     rst: The total number of multiplication instructions operating on 16-bit  floating-point
       operands issued to the VALU per  :ref:`normalization unit <normalization-units>`.
     unit: Instructions per normalization unit
-  INT32:
-    rst: The total number of instructions operating on 32-bit integer operands  issued
-      to the VALU per :ref:`normalization unit <normalization-units>`.
+  F16-Trans:
+    rst: The total number of transcendental instructions (e.g., ``sqrt``) operating  on
+      16-bit floating-point operands issued to the VALU per  :ref:`normalization unit
+      <normalization-units>`.
+    unit: Instructions per normalization unit
+  F32-ADD:
+    rst: The total number of addition instructions operating on 32-bit  floating-point
+      operands issued to the VALU per  :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  F32-FMA:
+    rst: The total number of fused multiply-add instructions operating on 32-bit  floating-point
+      operands issued to the VALU per  :ref:`normalization unit <normalization-units>`.
     unit: Instructions per normalization unit
   F32-MUL:
     rst: The total number of multiplication instructions operating on 32-bit  floating-point
       operands issued to the VALU per  :ref:`normalization unit <normalization-units>`.
     unit: Instructions per normalization unit
+  F32-Trans:
+    rst: The total number of transcendental instructions (such as ``sqrt``)  operating
+      on 32-bit floating-point operands issued to the VALU per  :ref:`normalization
+      unit <normalization-units>`.
+    unit: Instructions per normalization unit
   F64-ADD:
     rst: The total number of addition instructions operating on 64-bit  floating-point
       operands issued to the VALU per  :ref:`normalization unit <normalization-units>`.
@@ -164,59 +187,24 @@ VALU arithmetic instruction mix:
     rst: The total number of fused multiply-add instructions operating on 64-bit  floating-point
       operands issued to the VALU per  :ref:`normalization unit <normalization-units>`.
     unit: Instructions per normalization unit
-  Conversion:
-    rst: "The total number of type conversion instructions (such as converting data\
-      \  to or from F32\u2194F64) issued to the VALU per  :ref:`normalization unit\
-      \ <normalization-units>`."
-    unit: Instructions per normalization unit
-  F16-FMA:
-    rst: The total number of fused multiply-add instructions operating on 16-bit  floating-point
+  F64-MUL:
+    rst: The total number of multiplication instructions operating on 64-bit  floating-point
       operands issued to the VALU per  :ref:`normalization unit <normalization-units>`.
     unit: Instructions per normalization unit
-  F32-FMA:
-    rst: The total number of fused multiply-add instructions operating on 32-bit  floating-point
-      operands issued to the VALU per  :ref:`normalization unit <normalization-units>`.
-    unit: Instructions per normalization unit
-  F16-Trans:
-    rst: The total number of transcendental instructions (e.g., `sqrt`) operating  on
-      16-bit floating-point operands issued to the VALU per  :ref:`normalization unit
-      <normalization-units>`.
-    unit: Instructions per normalization unit
-  F32-Trans:
-    rst: The total number of transcendental instructions (such as ``sqrt``)  operating
-      on 32-bit floating-point operands issued to the VALU per  :ref:`normalization
-      unit <normalization-units>`.
-    unit: Instructions per normalization unit
   F64-Trans:
     rst: The total number of transcendental instructions (such as `sqrt`)  operating
       on 64-bit floating-point operands issued to the VALU per  :ref:`normalization
       unit <normalization-units>`.
     unit: Instructions per normalization unit
-  F32-ADD:
-    rst: The total number of addition instructions operating on 32-bit  floating-point
-      operands issued to the VALU per  :ref:`normalization unit <normalization-units>`.
-    unit: Instructions per normalization unit
-  F64-MUL:
-    rst: The total number of multiplication instructions operating on 64-bit  floating-point
-      operands issued to the VALU per  :ref:`normalization unit <normalization-units>`.
+  INT32:
+    rst: The total number of instructions operating on 32-bit integer operands  issued
+      to the VALU per :ref:`normalization unit <normalization-units>`.
     unit: Instructions per normalization unit
   INT64:
     rst: The total number of instructions operating on 64-bit integer operands  issued
       to the VALU per :ref:`normalization unit <normalization-units>`.
     unit: Instructions per normalization unit
 MFMA instruction mix:
-  MFMA-F64:
-    rst: The total number of 64-bit floating-point :ref:`MFMA <desc-mfma>`  instructions
-      issued per :ref:`normalization unit <normalization-units>`.
-    unit: Instructions per normalization unit
-  MFMA-I8:
-    rst: The total number of 8-bit integer :ref:`MFMA <desc-mfma>` instructions  issued
-      per :ref:`normalization unit <normalization-units>`.
-    unit: Instructions per normalization unit
-  MFMA-F32:
-    rst: The total number of 32-bit floating-point :ref:`MFMA <desc-mfma>`  instructions
-      issued per :ref:`normalization unit <normalization-units>`.
-    unit: Instructions per normalization unit
   MFMA-BF16:
     rst: The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`  instructions
       issued per :ref:`normalization unit <normalization-units>`.
@@ -225,11 +213,23 @@ MFMA instruction mix:
     rst: The total number of 16-bit floating point :ref:`MFMA <desc-mfma>`  instructions
       issued per :ref:`normalization unit <normalization-units>`.
     unit: Instructions per normalization unit
+  MFMA-F32:
+    rst: The total number of 32-bit floating-point :ref:`MFMA <desc-mfma>`  instructions
+      issued per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  MFMA-F64:
+    rst: The total number of 64-bit floating-point :ref:`MFMA <desc-mfma>`  instructions
+      issued per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
   MFMA-F8:
     rst: The total number of 8-bit floating point :ref:`MFMA <desc-mfma>` instructions  issued
       per :ref:`normalization unit <normalization-units>`. This is supported in AMD
       Instinct MI300 series and later only.
     unit: Instructions per normalization unit
+  MFMA-I8:
+    rst: The total number of 8-bit integer :ref:`MFMA <desc-mfma>` instructions  issued
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
 Compute Speed-of-Light:
   MFMA FLOPs (BF16):
     rst: 'The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`  operations
@@ -238,6 +238,13 @@ Compute Speed-of-Light:
       as a percent of the peak theoretical  BF16 MFMA operations achievable on the
       specific accelerator.'
     unit: GFLOPs
+  MFMA FLOPs (F16):
+    rst: 'The total number of 16-bit floating point :ref:`MFMA <desc-mfma>`  operations
+      executed per second. Note: this does not include any 16-bit  floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This  is also presented
+      as a percent of the peak theoretical F16 MFMA  operations achievable on the
+      specific accelerator.'
+    unit: GFLOPs
   MFMA FLOPs (F32):
     rst: 'The total number of 32-bit floating point :ref:`MFMA <desc-mfma>`  operations
       executed per second. Note: this does not include any 32-bit  floating point
@@ -274,36 +281,62 @@ Compute Speed-of-Light:
       on the specific accelerator. Note: this does  not include any integer operations
       from :ref:`MFMA <desc-mfma>`  instructions.'
     unit: GIOPs
-  MFMA FLOPs (F16):
-    rst: 'The total number of 16-bit floating point :ref:`MFMA <desc-mfma>`  operations
-      executed per second. Note: this does not include any 16-bit  floating point
-      operations from :ref:`VALU <desc-valu>` instructions. This  is also presented
-      as a percent of the peak theoretical F16 MFMA  operations achievable on the
-      specific accelerator.'
-    unit: GFLOPs
 Pipeline statistics:
-  IPC:
-    rst: The ratio of the total number of instructions executed on the  :doc:`CU <compute-unit>`
-      over the  :ref:`total active CU cycles <total-active-cu-cycles>`.
-    unit: Instructions per cycle
   Branch Utilization:
     rst: Indicates what percent of the kernel's duration the  :ref:`branch <desc-branch>`
       unit was busy executing instructions.  Computed as the ratio of the total number
       of cycles spent by the  :ref:`scheduler <desc-scheduler>` issuing branch instructions
       over the  :ref:`total CU cycles <total-cu-cycles>`.
     unit: Percent
+  IPC:
+    rst: The ratio of the total number of instructions executed on the  :doc:`CU <compute-unit>`
+      over the  :ref:`total active CU cycles <total-active-cu-cycles>`.
+    unit: Instructions per cycle
+  IPC (Issued):
+    rst: The ratio of the total number of  (non-:ref:`internal <ipc-internal-instructions>`)
+      instructions issued over  the number of cycles where the :ref:`scheduler <desc-scheduler>`
+      was  actively working on issuing instructions. Refer to the  :ref:`Issued IPC
+      <issued-ipc>` example for further detail.
+    unit: Instructions per cycle
+  MFMA Instruction Cycles:
+    rst: The average duration of :ref:`MFMA <desc-mfma>` instructions in this  kernel
+      in cycles. Computed as the ratio of the total number of cycles the  MFMA unit
+      was busy over the total number of MFMA instructions. Compare  to, for example,
+      the  `AMD Matrix Instruction Calculator <https://github.com/RadeonOpenCompute/amd_matrix_instruction_calculator>`_.
+    unit: Cycles per instruction
+  MFMA Utilization:
+    rst: Indicates what percent of the kernel's duration the  :ref:`MFMA <desc-mfma>`
+      unit was busy executing instructions. Computed as  the ratio of the total number
+      of cycles the  :ref:`MFMA <desc-mfma>` unit was busy over the  :ref:`total
+      CU cycles <total-cu-cycles>`.
+    unit: Percent
   SALU Utilization:
     rst: Indicates what percent of the kernel's duration the  :ref:`SALU <desc-salu>`
       was busy executing instructions. Computed as the  ratio of the total number
       of cycles spent by the  :ref:`scheduler <desc-scheduler>` issuing SALU / :ref:`SMEM
       <desc-smem>`  instructions over the :ref:`total CU cycles <total-cu-cycles>`.
     unit: Percent
-  MFMA Instruction Cycles:
-    rst: The average duration of :ref:`MFMA <desc-mfma>` instructions in this  kernel
-      in cycles. Computed as the ratio of the total number of cycles the  MFMA unit
-      was busy over the total number of MFMA instructions. Compare  to, for example,
-      the  `AMD Matrix Instruction Calculator <https://github.com/RadeonOpenCompute/amd_matrix_instruction_calculator>`_.
-    unit: Cycles per instruction
+  SMEM Latency:
+    rst: The average number of round-trip cycles (that is, from issue to data  return
+      / acknowledgment) required for a SMEM instruction to complete.
+    unit: Cycles
+  VALU Active Threads:
+    rst: Indicates the average level of :ref:`divergence <desc-divergence>` within  a
+      wavefront over the lifetime of the kernel. The number of work-items  that were
+      active in a wavefront during execution of each  :ref:`VALU <desc-valu>` instruction,
+      time-averaged over all VALU  instructions run on all wavefronts in the kernel.
+    unit: Work-items
+  VALU Utilization:
+    rst: Indicates what percent of the kernel's duration the  :ref:`VALU <desc-valu>`
+      was busy executing instructions. Does not include  :ref:`VMEM <desc-vmem>` operations.
+      Computed as the ratio of the total  number of cycles spent by the :ref:`scheduler
+      <desc-scheduler>` issuing  VALU instructions over the :ref:`total CU cycles
+      <total-cu-cycles>`.
+    unit: Percent
+  VMEM Latency:
+    rst: The average number of round-trip cycles (that is, from issue to data  return
+      / acknowledgment) required for a VMEM instruction to complete.
+    unit: Cycles
   VMEM Utilization:
     rst: Indicates what percent of the kernel's duration the  :ref:`VMEM <desc-vmem>`
       unit was busy executing instructions, including  both global/generic and spill/scratch
@@ -312,50 +345,7 @@ Pipeline statistics:
       the ratio of the total number of cycles spent by the  :ref:`scheduler <desc-scheduler>`
       issuing VMEM instructions over the  :ref:`total CU cycles <total-cu-cycles>`.
     unit: Percent
-  SMEM Latency:
-    rst: The average number of round-trip cycles (that is, from issue to data  return
-      / acknowledgment) required for a SMEM instruction to complete.
-    unit: Cycles
-  MFMA Utilization:
-    rst: Indicates what percent of the kernel's duration the  :ref:`MFMA <desc-mfma>`
-      unit was busy executing instructions. Computed as  the ratio of the total number
-      of cycles spent by the  :ref:`MFMA <desc-salu>` was busy over the  :ref:`total
-      CU cycles <total-cu-cycles>`.
-    unit: Percent
-  VMEM Latency:
-    rst: The average number of round-trip cycles (that is, from issue to data  return
-      / acknowledgment) required for a VMEM instruction to complete.
-    unit: Cycles
-  VALU Utilization:
-    rst: Indicates what percent of the kernel's duration the  :ref:`VALU <desc-valu>`
-      was busy executing instructions. Does not include  :ref:`VMEM <desc-vmem>` operations.
-      Computed as the ratio of the total  number of cycles spent by the :ref:`scheduler
-      <desc-scheduler>` issuing  VALU instructions over the :ref:`total CU cycles
-      <total-cu-cycles>`.
-    unit: Percent
-  IPC (Issued):
-    rst: The ratio of the total number of  (non-:ref:`internal <ipc-internal-instructions>`)
-      instructions issued over  the number of cycles where the :ref:`scheduler <desc-scheduler>`
-      was  actively working on issuing instructions. Refer to the  :ref:`Issued IPC
-      <issued-ipc>` example for further detail.
-    unit: Instructions per cycle
-  VALU Active Threads:
-    rst: Indicates the average level of :ref:`divergence <desc-divergence>` within  a
-      wavefront over the lifetime of the kernel. The number of work-items  that were
-      active in a wavefront during execution of each  :ref:`VALU <desc-valu>` instruction,
-      time-averaged over all VALU  instructions run on all wavefronts in the kernel.
-    unit: Work-items
 Arithmetic operations:
-  IOPs (Total):
-    rst: The total number of integer operations executed on either the  :ref:`VALU
-      <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per  :ref:`normalization unit
-      <normalization-units>`.
-    unit: IOP per normalization unit
-  FLOPs (Total):
-    rst: The total number of floating-point operations executed on either the  :ref:`VALU
-      <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per  :ref:`normalization unit
-      <normalization-units>`.
-    unit: FLOP per normalization unit
   BF16 OPs:
     rst: 'The total number of 16-bit brain floating-point operations executed on either
       the  :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per  :ref:`normalization
@@ -372,17 +362,27 @@ Arithmetic operations:
       :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per  :ref:`normalization
       unit <normalization-units>`.
     unit: FLOP per normalization unit
+  F64 OPs:
+    rst: The total number of 64-bit floating-point operations executed on either  the
+      :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per  :ref:`normalization
+      unit <normalization-units>`.
+    unit: FLOP per normalization unit
+  FLOPs (Total):
+    rst: The total number of floating-point operations executed on either the  :ref:`VALU
+      <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per  :ref:`normalization unit
+      <normalization-units>`.
+    unit: FLOP per normalization unit
   INT8 OPs:
     rst: 'The total number of 8-bit integer operations executed on either the  :ref:`VALU
       <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per  :ref:`normalization unit
       <normalization-units>`. Note: on current CDNA  accelerators, the VALU has no
       native INT8 instructions.'
     unit: IOP per normalization unit
-  F64 OPs:
-    rst: The total number of 64-bit floating-point operations executed on either  the
-      :ref:`VALU <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per  :ref:`normalization
-      unit <normalization-units>`.
-    unit: FLOP per normalization unit
+  IOPs (Total):
+    rst: The total number of integer operations executed on either the  :ref:`VALU
+      <desc-valu>` or :ref:`MFMA <desc-mfma>` units, per  :ref:`normalization unit
+      <normalization-units>`.
+    unit: IOP per normalization unit
 LDS Speed-of-Light:
   Access Rate:
     rst: Indicates the percentage of SIMDs in the :ref:`VALU <desc-valu>` [#lds-workload]_
@@ -411,36 +411,34 @@ LDS Speed-of-Light:
       number of cycles LDS was active over the  :ref:`total CU cycles <total-cu-cycles>`.
     unit: Percent
 LDS Statistics:
-  Index Accesses:
-    rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>`  over
-      all operations per :ref:`normalization unit <normalization-units>`.
-    unit: Cycles per normalization unit
-  Theoretical Bandwidth:
-    rst: Indicates the maximum amount of bytes that could have been loaded from,  stored
-      to, or atomically updated in the LDS per  :ref:`normalization unit <normalization-units>`.
-      Does *not* take into  account the execution mask of the wavefront when the instruction
-      was  executed. See the  :ref:`LDS bandwidth example <lds-bandwidth>` for more
-      detail.
-    unit: Bytes per normalization unit
-  Bank Conflicts/Access:
-    rst: The ratio of the number of cycles spent in the  :ref:`LDS scheduler <desc-lds>`
-      due to bank conflicts (as determined by  the conflict resolution hardware) to
-      the base number of cycles that would  be spent in the LDS scheduler in a completely
-      uncontended case. This is  the unnormalized form of the Bank Conflict Rate.
-    unit: Conflicts per Access
-  LDS Instructions:
-    rst: The total number of LDS instructions (including, but not limited to,  read/write/atomics
-      and HIP's ``__shfl`` instructions) executed per  :ref:`normalization unit <normalization-units>`.
-    unit: Instructions per normalization unit
-  Unaligned Stall:
+  Addr Conflict:
     rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>`  due
-      to stalls from non-dword aligned addresses per  :ref:`normalization unit <normalization-units>`.
+      to address conflicts (as determined by the conflict resolution  hardware) per
+      :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  Atomic Return Cycles:
+    rst: The total number of cycles spent on LDS atomics with return per  :ref:`normalization
+      unit <normalization-units>`.
     unit: Cycles per normalization unit
   Bank Conflict:
     rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>`  due
       to bank conflicts (as determined by the conflict resolution hardware)  per :ref:`normalization
       unit <normalization-units>`.
     unit: Cycles per normalization unit
+  Bank Conflicts/Access:
+    rst: The ratio of the number of cycles spent in the  :ref:`LDS scheduler <desc-lds>`
+      due to bank conflicts (as determined by  the conflict resolution hardware) to
+      the base number of cycles that would  be spent in the LDS scheduler in a completely
+      uncontended case. This is  the unnormalized form of the Bank Conflict Rate.
+    unit: Conflicts per Access
+  Index Accesses:
+    rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>`  over
+      all operations per :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
+  LDS Instructions:
+    rst: The total number of LDS instructions (including, but not limited to,  read/write/atomics
+      and HIP's ``__shfl`` instructions) executed per  :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
   LDS Latency:
     rst: The average number of round-trip cycles (i.e., from issue to data-return  /
       acknowledgment) required for an LDS instruction to complete.
@@ -450,21 +448,18 @@ LDS Statistics:
       \ unit <normalization-units>`. This is unused and  expected to be zero in most\
       \ configurations for modern CDNA\u2122 accelerators."
     unit: Accesses per normalization unit
-  Atomic Return Cycles:
-    rst: The total number of cycles spent on LDS atomics with return per  :ref:`normalization
-      unit <normalization-units>`.
-    unit: Cycles per normalization unit
-  Addr Conflict:
+  Theoretical Bandwidth:
+    rst: Indicates the maximum number of bytes that could have been loaded from,  stored
+      to, or atomically updated in the LDS per  :ref:`normalization unit <normalization-units>`.
+      Does *not* take into  account the execution mask of the wavefront when the instruction
+      was  executed. See the  :ref:`LDS bandwidth example <lds-bandwidth>` for more
+      detail.
+    unit: Bytes per normalization unit
+  Unaligned Stall:
     rst: The total number of cycles spent in the :ref:`LDS scheduler <desc-lds>`  due
-      to address conflicts (as determined by the conflict resolution  hardware) per
-      :ref:`normalization unit <normalization-units>`.
+      to stalls from non-dword aligned addresses per  :ref:`normalization unit <normalization-units>`.
     unit: Cycles per normalization unit
 vL1D Speed-of-Light:
-  Hit rate:
-    rst: The ratio of the number of vL1D cache line requests that hit [#vl1d-hit]_  in
-      vL1D cache over the total number of cache line requests to the  :ref:`vL1D Cache
-      RAM <desc-tc>`.
-    unit: Percent
   Bandwidth:
     rst: The number of bytes looked up in the vL1D cache as a result of  :ref:`VMEM
       <desc-vmem>` instructions, as a percent of the peak  theoretical bandwidth achievable
@@ -480,20 +475,17 @@ vL1D Speed-of-Light:
       generated per instruction  divided by the ideal number of thread-requests per
       instruction.
     unit: Percent
+  Hit rate:
+    rst: The ratio of the number of vL1D cache line requests that hit [#vl1d-hit]_  in
+      vL1D cache over the total number of cache line requests to the  :ref:`vL1D Cache
+      RAM <desc-tc>`.
+    unit: Percent
   Utilization:
     rst: Indicates how busy the :ref:`vL1D Cache RAM <desc-tc>` was during the  kernel
       execution. The number of cycles where the vL1D Cache RAM is  actively processing
       any request divided by the number of cycles where the  vL1D is active [#vl1d-activity]_.
     unit: Percent
 Busy / stall metrics:
-  Data Stall:
-    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address  processor
-      was stalled from sending write/atomic data further into the  vL1D pipeline
-    unit: Percent
-  "Data-Processor \u2192 Address Stall":
-    rst: Percent of :ref:`total CU cycles <total-cu-cycles>` the address processor  was
-      stalled waiting to send command data to the  :ref:`data processor <desc-td>`
-    unit: Percent
   Address Processing Unit Busy:
     rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address  processor
       was busy
@@ -502,16 +494,15 @@ Busy / stall metrics:
     rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address  processor
       was stalled from sending address requests further into the vL1D  pipeline
     unit: Percent
+  Data Stall:
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the address  processor
+      was stalled from sending write/atomic data further into the  vL1D pipeline
+    unit: Percent
+  "Data-Processor \u2192 Address Stall":
+    rst: Percent of :ref:`total CU cycles <total-cu-cycles>` the address processor  was
+      stalled waiting to send command data to the  :ref:`data processor <desc-td>`
+    unit: Percent
 Instruction counts:
-  Spill/Stack Write Instructions:
-    rst: The total number of spill/stack memory write instructions executed on all  :doc:`compute
-      units <compute-unit>` on the accelerator, per  :ref:`normalization unit <normalization-units>`.
-    unit: Instructions per normalization unit
-  Global/Generic Read Instructions:
-    rst: The total number of global & generic memory read instructions executed on  all
-      :doc:`compute units <compute-unit>` on the accelerator, per  :ref:`normalization
-      unit <normalization-units>`.
-    unit: Instructions per normalization unit
   Global/Generic Atomic Instructions:
     rst: The total number of global & generic memory atomic (with and without  return)
       instructions executed on all :doc:`compute units <compute-unit>`  on the accelerator,
@@ -521,9 +512,10 @@ Instruction counts:
     rst: The total number of global & generic memory instructions executed on all  :doc:`compute
       units <compute-unit>` on the accelerator, per  :ref:`normalization unit <normalization-units>`.
     unit: Instructions per normalization unit
-  Spill/Stack Read Instructions:
-    rst: The total number of spill/stack memory read instructions executed on all  :doc:`compute
-      units <compute-unit>` on the accelerator, per  :ref:`normalization unit <normalization-units>`.
+  Global/Generic Read Instructions:
+    rst: The total number of global & generic memory read instructions executed on  all
+      :doc:`compute units <compute-unit>` on the accelerator, per  :ref:`normalization
+      unit <normalization-units>`.
     unit: Instructions per normalization unit
   Global/Generic Write Instructions:
     rst: The total number of global & generic memory write instructions executed  on
@@ -540,6 +532,14 @@ Instruction counts:
     rst: The total number of spill/stack memory instructions executed on all  :doc:`compute
       units <compute-unit>` on the accelerator, per  :ref:`normalization unit <normalization-units>`.
     unit: Instructions per normalization unit
+  Spill/Stack Read Instructions:
+    rst: The total number of spill/stack memory read instructions executed on all  :doc:`compute
+      units <compute-unit>` on the accelerator, per  :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
+  Spill/Stack Write Instructions:
+    rst: The total number of spill/stack memory write instructions executed on all  :doc:`compute
+      units <compute-unit>` on the accelerator, per  :ref:`normalization unit <normalization-units>`.
+    unit: Instructions per normalization unit
   Total Instructions:
     rst: The total number of memory instructions executed by the address processer
       over all compute units on the accelerator, per normalization unit.
@@ -549,21 +549,15 @@ Spill / stack metrics:
     rst: The number of cycles the address processing unit spent working on  coalesced
       spill/stack read instructions, per  :ref:`normalization unit <normalization-units>`.
     unit: Cycles per normalization unit
-  Spill/Stack Total Cycles:
-    rst: The number of cycles the address processing unit spent working on  spill/stack
-      instructions, per  :ref:`normalization unit <normalization-units>`.
-    unit: Cycles per normalization unit
   Spill/Stack Coalesced Write:
     rst: The number of cycles the address processing unit spent working on  coalesced
       spill/stack write instructions, per  :ref:`normalization unit <normalization-units>`.
     unit: Cycles per normalization unit
+  Spill/Stack Total Cycles:
+    rst: The number of cycles the address processing unit spent working on  spill/stack
+      instructions, per  :ref:`normalization unit <normalization-units>`.
+    unit: Cycles per normalization unit
 L1 Unified Translation Cache (UTCL1):
-  Permission Misses:
-    rst: "The total number of translation requests that missed in the UTCL1 due to\
-      \  a permission error, per :ref:`normalization unit <normalization-units>`.\
-      \  This is unused and expected to be zero in most configurations for modern\
-      \  CDNA\u2122 accelerators."
-    unit: Requests per normalization unit
   Hit Ratio:
     rst: The ratio of the number of translation requests that hit in the UTCL1 divided
       by the total number of translation requests made to the UTCL1.
@@ -572,6 +566,12 @@ L1 Unified Translation Cache (UTCL1):
     rst: The number of translation requests that hit in the UTCL1, and could be reused,
       per normalization unit.
     unit: Requests per normalization unit
+  Permission Misses:
+    rst: "The total number of translation requests that missed in the UTCL1 due to\
+      \  a permission error, per :ref:`normalization unit <normalization-units>`.\
+      \  This is unused and expected to be zero in most configurations for modern\
+      \  CDNA\u2122 accelerators."
+    unit: Requests per normalization unit
   Req:
     rst: The number of translation requests made to the UTCL1 per normalization unit.
     unit: Requests per normalization unit
@@ -580,41 +580,39 @@ L1 Unified Translation Cache (UTCL1):
       not being present in the cache, per  :ref:`normalization unit <normalization-units>`.
     unit: unit
 vL1D cache stall metrics:
+  Stalled on L2 Data:
+    rst: The ratio of the number of cycles where the vL1D is stalled waiting for  requested
+      data to return from the :doc:`L2 cache <l2-cache>` divided by  the number of
+      cycles where the vL1D is active [#vl1d-activity]_.
+    unit: Percent
   Stalled on L2 Req:
     rst: The ratio of the number of cycles where the vL1D is stalled waiting to  issue
       a request for data to the :doc:`L2 cache <l2-cache>` divided by the  number
       of cycles where the vL1D is active [#vl1d-activity]_.
     unit: Percent
-  Tag RAM Stall (Read):
-    rst: The ratio of the number of cycles where the vL1D is stalled due to Read requests
-      with conflicting tags being looked up  concurrently, divided by the number of
-      cycles where the  vL1D is active [#vl1d-activity]_.
-    unit: Percent
   Tag RAM Stall (Atomic):
     rst: The ratio of the number of cycles where the vL1D is stalled due to Atomic
       requests with conflicting tags being looked up  concurrently, divided by the
       number of cycles where the  vL1D is active [#vl1d-activity]_.
     unit: Percent
+  Tag RAM Stall (Read):
+    rst: The ratio of the number of cycles where the vL1D is stalled due to Read requests
+      with conflicting tags being looked up  concurrently, divided by the number of
+      cycles where the  vL1D is active [#vl1d-activity]_.
+    unit: Percent
   Tag RAM Stall (Write):
     rst: The ratio of the number of cycles where the vL1D is stalled due to Write
       requests with conflicting tags being looked up  concurrently, divided by the
       number of cycles where the  vL1D is active [#vl1d-activity]_.
     unit: Percent
-  Stalled on L2 Data:
-    rst: The ratio of the number of cycles where the vL1D is stalled waiting for  requested
-      data to return from the :doc:`L2 cache <l2-cache>` divided by  the number of
-      cycles where the vL1D is active [#vl1d-activity]_.
-    unit: Percent
 vL1D cache access metrics:
-  Total Req:
-    rst: The total number of incoming requests from the  :ref:`address processing
-      unit <desc-ta>` after coalescing.
-    unit: Requests
-  Cache Hits:
-    rst: The number of cache accesses minus the number of outgoing requests to the  :doc:`L2
-      cache <l2-cache>`, that is, the number of cache line requests  serviced by the
-      :ref:`vL1D Cache RAM <desc-tc>` per  :ref:`normalization unit <normalization-units>`.
-    unit: Cache lines per normalization unit
+  Atomic Req:
+    rst: The total number of incoming atomic requests from the  :ref:`address processing
+      unit <desc-ta>` after coalescing per  :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Cache Accesses:
+    rst: The total number of cache line lookups in the vL1D.
+    unit: Cache lines
   Cache BW:
     rst: The number of bytes looked up in the vL1D cache as a result of  :ref:`VMEM
       <desc-vmem>` instructions per  :ref:`normalization unit <normalization-units>`.  The
@@ -623,53 +621,15 @@ vL1D cache access metrics:
       for  instance, if only a single value is requested in a cache line, the data  movement
       will still be counted as a full cache line.
     unit: Bytes per normalization unit
-  Cache Accesses:
-    rst: The total number of cache line lookups in the vL1D.
-    unit: Cache lines
   Cache Hit Rate:
     rst: The ratio of the number of vL1D cache line requests that hit in vL1D  cache
       over the total number of cache line requests to the  :ref:`vL1D Cache RAM <desc-tc>`.
     unit: Percent
-  Atomic Req:
-    rst: The total number of incoming atomic requests from the  :ref:`address processing
-      unit <desc-ta>` after coalescing per  :ref:`normalization unit <normalization-units>`
-    unit: Requests per normalization unit
-  L1-L2 BW:
-    rst: The number of bytes transferred across the vL1D-L2 interface as a result  of
-      :ref:`VMEM <desc-vmem>` instructions, per  :ref:`normalization unit <normalization-units>`.
-      The number of bytes is  calculated as the number of cache lines requested multiplied
-      by the cache  line size. This value does not consider partial requests, so for  instance,
-      if only a single value is requested in a cache line, the data  movement will
-      still be counted as a full cache line.
-    unit: Bytes per normalization unit
-  L1-L2 Read Latency:
-    rst: Calculated as the average number of cycles that the vL1D cache took to  issue
-      and receive read requests from the :doc:`L2 Cache <l2-cache>`. This  number
-      also includes requests for atomics with return values.
-    unit: Cycles
-  L1-L2 Write Latency:
-    rst: Calculated as the average number of cycles that the vL1D cache took to  issue
-      and receive acknowledgement of a write request to the  :doc:`L2 Cache <l2-cache>`.
-      This number also includes requests for  atomics without return values.
-    unit: Cycles
-  Write Req:
-    rst: The total number of incoming write requests from the  :ref:`address processing
-      unit <desc-ta>` after coalescing per  :ref:`normalization unit <normalization-units>`
-    unit: Requests per normalization unit
-  L1-L2 Read:
-    rst: The number of read requests for a vL1D cache line that were not satisfied  by
-      the vL1D and must be retrieved from the to the  :doc:`L2 Cache <l2-cache>` per  :ref:`normalization
-      unit <normalization-units>`.
-    unit: Requests per normalization unit
-  L1-L2 Atomic:
-    rst: The number of atomic requests that are sent through the vL1D to the  :doc:`L2
-      cache <l2-cache>`, per  :ref:`normalization unit <normalization-units>`. This
-      includes requests  for atomics with, and without return.
-    unit: Requests per normalization unit
-  Read Req:
-    rst: The total number of incoming read requests from the  :ref:`address processing
-      unit <desc-ta>` after coalescing per  :ref:`normalization unit <normalization-units>`
-    unit: Requests per normalization unit
+  Cache Hits:
+    rst: The number of cache accesses minus the number of outgoing requests to the  :doc:`L2
+      cache <l2-cache>`, that is, the number of cache line requests  serviced by the
+      :ref:`vL1D Cache RAM <desc-tc>` per  :ref:`normalization unit <normalization-units>`.
+    unit: Cache lines per normalization unit
   Invalidations:
     rst: The number of times the vL1D was issued a write-back invalidate command  during
       the kernel's execution per  :ref:`normalization unit <normalization-units>`.  This
@@ -679,19 +639,70 @@ vL1D cache access metrics:
     rst: Calculated as the average number of cycles that a vL1D cache line request
       spent in the vL1D cache pipeline.
     unit: Cycles
+  L1-L2 Atomic:
+    rst: The number of atomic requests that are sent through the vL1D to the  :doc:`L2
+      cache <l2-cache>`, per  :ref:`normalization unit <normalization-units>`. This
+      includes requests  for atomics with, and without return.
+    unit: Requests per normalization unit
+  L1-L2 BW:
+    rst: The number of bytes transferred across the vL1D-L2 interface as a result  of
+      :ref:`VMEM <desc-vmem>` instructions, per  :ref:`normalization unit <normalization-units>`.
+      The number of bytes is  calculated as the number of cache lines requested multiplied
+      by the cache  line size. This value does not consider partial requests, so for  instance,
+      if only a single value is requested in a cache line, the data  movement will
+      still be counted as a full cache line.
+    unit: Bytes per normalization unit
+  L1-L2 Read:
+    rst: The number of read requests for a vL1D cache line that were not satisfied  by
+      the vL1D and must be retrieved from the  :doc:`L2 Cache <l2-cache>` per  :ref:`normalization
+      unit <normalization-units>`.
+    unit: Requests per normalization unit
+  L1-L2 Read Latency:
+    rst: Calculated as the average number of cycles that the vL1D cache took to  issue
+      and receive read requests from the :doc:`L2 Cache <l2-cache>`. This  number
+      also includes requests for atomics with return values.
+    unit: Cycles
   L1-L2 Write:
     rst: The number of write requests to a vL1D cache line that were sent through  the
       vL1D to the :doc:`L2 cache <l2-cache>`, per  :ref:`normalization unit <normalization-units>`.
     unit: Requests per normalization unit
+  L1-L2 Write Latency:
+    rst: Calculated as the average number of cycles that the vL1D cache took to  issue
+      and receive acknowledgement of a write request to the  :doc:`L2 Cache <l2-cache>`.
+      This number also includes requests for  atomics without return values.
+    unit: Cycles
+  Read Req:
+    rst: The total number of incoming read requests from the  :ref:`address processing
+      unit <desc-ta>` after coalescing per  :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Total Req:
+    rst: The total number of incoming requests from the  :ref:`address processing
+      unit <desc-ta>` after coalescing.
+    unit: Requests
+  Write Req:
+    rst: The total number of incoming write requests from the  :ref:`address processing
+      unit <desc-ta>` after coalescing per  :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
 Vector L1 data-return path or Texture Data (TD):
+  Atomic Instructions:
+    rst: The number of atomic instructions submitted to the  :ref:`data-return unit
+      <desc-td>` by the  :ref:`address processor <desc-ta>` summed over all  :doc:`compute
+      units <compute-unit>` on the accelerator, per  :ref:`normalization unit <normalization-units>`.
+      This is expected to be  the sum of global/generic and spill/stack atomics in
+      the  :ref:`address processor <desc-ta>`.
+    unit: Instructions per normalization unit
+  "Cache RAM \u2192 Data-Return Stall":
+    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return  unit
+      was stalled on data to be returned from the  :ref:`vL1D Cache RAM <desc-tc>`.
+    unit: Percent
   Coalescable Instructions:
     rst: The number of instructions submitted to the  :ref:`data-return unit <desc-td>`
       by the  :ref:`address processor <desc-ta>` that were found to be coalescable,
       per  :ref:`normalization unit <normalization-units>`.
     unit: Instructions per normalization unit
-  "Cache RAM \u2192 Data-Return Stall":
+  Data-Return Busy:
     rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return  unit
-      was stalled on data to be returned from the  :ref:`vL1D Cache RAM <desc-tc>`.
+      was busy processing or waiting on data to return to the  :doc:`CU <compute-unit>`.
     unit: Percent
   Read Instructions:
     rst: The number of read instructions submitted to the  :ref:`data-return unit
@@ -705,13 +716,6 @@ Vector L1 data-return path or Texture Data (TD):
       was stalled by the :ref:`workgroup manager <desc-spi>` due to  initialization
       of registers as a part of launching new workgroups.
     unit: Percent
-  Atomic Instructions:
-    rst: The number of atomic instructions submitted to the  :ref:`data-return unit
-      <desc-td>` by the  :ref:`address processor <desc-ta>` summed over all  :doc:`compute
-      units <compute-unit>` on the accelerator, per  :ref:`normalization unit <normalization-units>`.
-      This is expected to be  the sum of global/generic and spill/stack atomics in
-      the  :ref:`address processor <desc-ta>`.
-    unit: Instructions per normalization unit
   Write Instructions:
     rst: The number of store instructions submitted to the  :ref:`data-return unit
       <desc-td>` by the  :ref:`address processor <desc-ta>` summed over all  :doc:`compute
@@ -719,11 +723,24 @@ Vector L1 data-return path or Texture Data (TD):
       This is expected to be  the sum of global/generic and spill/stack stores counted
       by the  :ref:`vL1D cache-front-end <ta-instruction-counts>`.
     unit: Instructions per normalization unit
-  Data-Return Busy:
-    rst: Percent of the :ref:`total CU cycles <total-cu-cycles>` the data-return  unit
-      was busy processing or waiting on data to return to the  :doc:`CU <compute-unit>`.
-    unit: Percent
 L2 Speed-of-Light:
+  HBM Bandwidth:
+    rst: Maximum theoretical bandwidth of the accelerator's local high-bandwidth memory
+      (HBM) per unit time. This value is calculated as the number of HBM channels
+      multiplied by the HBM channel width multiplied by the HBM clock frequency.
+    unit: GB/s
+  Hit Rate:
+    rst: The ratio of the number of L2 cache line requests that hit in the L2  cache
+      over the total number of incoming cache line requests to the L2 cache.
+    unit: Percent
+  L2-Fabric Read BW:
+    rst: The number of bytes read by the L2 over the  :ref:`Infinity Fabric interface
+      <l2-fabric>` per unit time.
+    unit: GB/s
+  L2-Fabric Write and Atomic BW:
+    rst: The number of bytes sent by the L2 over the  :ref:`Infinity Fabric interface
+      <l2-fabric>` by write and atomic  operations per unit time.
+    unit: GB/s
   Peak Bandwidth:
     rst: The number of bytes looked up in the L2 cache, as a percent of the peak  theoretical
       bandwidth achievable on the specific accelerator. The number  of bytes is calculated
@@ -732,68 +749,15 @@ L2 Speed-of-Light:
       requested in a cache line, the data  movement will still be counted as a full
       cache line.
     unit: Percent
-  HBM Bandwidth:
-    rst: Maximum theoretical bandwidth of the accelerator's local high-bandwidth memory
-      (HBM) per unit time. This value is calculated as the number of HBM channels
-      multiplied by the HBM channel width multiplied by the HBM clock frequency.
-    unit: GB/s
   Utilization:
     rst: The ratio of the  :ref:`number of cycles an L2 channel was active, summed
       over all L2 channels on the accelerator <total-active-l2-cycles>`  over the
       :ref:`total L2 cycles <total-l2-cycles>`.
     unit: Percent
-  L2-Fabric Write and Atomic BW:
-    rst: The number of bytes sent by the L2 over the  :ref:`Infinity Fabric interface
-      <l2-fabric>` by write and atomic  operations per unit time.
-    unit: GB/s
-  L2-Fabric Read BW:
-    rst: The number of bytes read by the L2 over the  :ref:`Infinity Fabric interface
-      <l2-fabric>` per unit time.
-    unit: GB/s
-  Hit Rate:
-    rst: The ratio of the number of L2 cache line requests that hit in the L2  cache
-      over the total number of incoming cache line requests to the L2 cache.
-    unit: Percent
 L2 cache accesses:
-  UC Req:
-    rst: The total number of requests to the L2 that go to Uncached (UC) memory  allocations.
-      See the :ref:`memory-type` for more information.
-    unit: Requests per normalization unit
-  Misses:
-    rst: The total number of requests to the L2 from all clients that miss in the  cache.
-      As noted in the :ref:`Speed-of-Light <l2-sol>` section, these do  not include
-      hit-on-miss requests.
-    unit: Requests per normalization unit
-  Cache Hit:
-    rst: The ratio of the number of L2 cache line requests that hit in the L2  cache
-      over the total number of incoming cache line requests to the L2  cache.
-    unit: Percent
-  Streaming Req:
-    rst: The total number of incoming requests to the L2 that are marked as  *streaming*.
-      The exact meaning of this may differ depending on the  targeted accelerator,
-      however on an :ref:`MI2XX <mixxx-note>` this  corresponds to  `non-temporal
-      load or stores <https://clang.llvm.org/docs/LanguageExtensions.html#non-temporal-load-store-builtins>`_.  The
-      L2 cache attempts to evict *streaming* requests before normal  requests when
-      the L2 is at capacity.
-    unit: Requests per normalization unit
-  Writeback (Internal):
-    rst: The total number of L2 cache lines written back to memory for internal  hardware
-      reasons, per :ref:`normalization unit <normalization-units>`.
-    unit: Cache lines per normalization unit
-  Write Req:
-    rst: The total number of write requests to the L2 from all clients.
-    unit: Requests per normalization unit
-  CC Req:
-    rst: The total number of requests to the L2 that go to Coherently Cacheable (CC)  memory
-      allocations. See the :ref:`memory-type` for more information.
-    unit: Requests per normalization unit
-  Writeback (vL1D Req):
-    rst: The total number of L2 cache lines written back to memory due to requests  initiated
-      by the :doc:`vL1D cache <vector-l1-cache>`, per  :ref:`normalization unit <normalization-units>`.
-    unit: Cache lines per normalization unit
-  Req:
-    rst: The total number of incoming requests to the L2 from all clients for all  request
-      types, per :ref:`normalization unit <normalization-units>`.
+  Atomic Req:
+    rst: The total number of atomic requests (with and without return) to the L2 from
+      all clients.
     unit: Requests per normalization unit
   Bandwidth:
     rst: The number of bytes looked up in the L2 cache, per  :ref:`normalization unit
@@ -802,75 +766,90 @@ L2 cache accesses:
       consider partial requests, so for example,  if only a single value is requested
       in a cache line, the data movement  will still be counted as a full cache line.
     unit: Bytes per normalization unit
+  CC Req:
+    rst: The total number of requests to the L2 that go to Coherently Cacheable (CC)  memory
+      allocations. See the :ref:`memory-type` for more information.
+    unit: Requests per normalization unit
+  Cache Hit:
+    rst: The ratio of the number of L2 cache line requests that hit in the L2  cache
+      over the total number of incoming cache line requests to the L2  cache.
+    unit: Percent
+  Evict (Internal):
+    rst: The total number of L2 cache lines evicted from the cache due to capacity  limits,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Cache lines per normalization unit
+  Evict (vL1D Req):
+    rst: The total number of L2 cache lines evicted from the cache due to  invalidation
+      requests initiated by the  :doc:`vL1D cache <vector-l1-cache>`, per  :ref:`normalization
+      unit <normalization-units>`.
+    unit: Cache lines per normalization unit
   Hits:
     rst: The total number of requests to the L2 from all clients that hit in the  cache.
       As noted in the :ref:`Speed-of-Light <l2-sol>` section, this  includes hit-on-miss
       requests.
     unit: Requests per normalization unit
+  Misses:
+    rst: The total number of requests to the L2 from all clients that miss in the  cache.
+      As noted in the :ref:`Speed-of-Light <l2-sol>` section, these do  not include
+      hit-on-miss requests.
+    unit: Requests per normalization unit
+  NC Req:
+    rst: The total number of requests to the L2 that go to Not-hardware-Coherent (NC)  memory
+      allocations, per :ref:`normalization unit <normalization-units>`.  See the :ref:`memory-type`
+      for more information.
+    unit: Requests per normalization unit
+  Probe Req:
+    rst: The number of coherence probe requests made to the L2 cache from outside  the
+      accelerator. On an :ref:`MI2XX <mixxx-note>`, probe requests may be  generated
+      by, for example, writes to  :ref:`fine-grained device <memory-type>` memory
+      or by writes to  :ref:`coarse-grained <memory-type>` device memory.
+    unit: Requests per normalization unit
   RW Req:
     rst: The total number of requests to the L2 that go to Read-Write coherent memory  (RW)
       allocations. See the :ref:`memory-type` for more information.
     unit: Requests per normalization unit
+  Read Req:
+    rst: 'The total number of read requests to the L2 from all clients.  '
+    unit: Requests per normalization unit
+  Req:
+    rst: The total number of incoming requests to the L2 from all clients for all  request
+      types, per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Streaming Req:
+    rst: The total number of incoming requests to the L2 that are marked as  *streaming*.
+      The exact meaning of this may differ depending on the  targeted accelerator,
+      however on an :ref:`MI2XX <mixxx-note>` this  corresponds to  `non-temporal
+      load or stores <https://clang.llvm.org/docs/LanguageExtensions.html#non-temporal-load-store-builtins>`_.  The
+      L2 cache attempts to evict *streaming* requests before normal  requests when
+      the L2 is at capacity.
+    unit: Requests per normalization unit
+  UC Req:
+    rst: The total number of requests to the L2 that go to Uncached (UC) memory  allocations.
+      See the :ref:`memory-type` for more information.
+    unit: Requests per normalization unit
+  Write Req:
+    rst: The total number of write requests to the L2 from all clients.
+    unit: Requests per normalization unit
   Writeback:
     rst: The total number of L2 cache lines written back to memory for any reason.  Write-backs
       may occur due to user code (such as HIP kernel calls to  ``__threadfence_system``
       or atomic built-ins) by the  :doc:`command processor <command-processor>`'s
       memory acquire/release  fences, or for other internal hardware reasons.
     unit: Cache lines per normalization unit
-  NC Req:
-    rst: The total number of requests to the L2 to Not-hardware-Coherent (NC)  memory
-      allocations, per :ref:`normalization unit <normalization-units>`.  See the :ref:`memory-type`
-      for more information.
-    unit: Requests per normalization unit
-  Evict (vL1D Req):
-    rst: The total number of L2 cache lines evicted from the cache due to  invalidation
-      requests initiated by the  :doc:`vL1D cache <vector-l1-cache>`, per  :ref:`normalization
-      unit <normalization-units>`.
+  Writeback (Internal):
+    rst: The total number of L2 cache lines written back to memory for internal  hardware
+      reasons, per :ref:`normalization unit <normalization-units>`.
     unit: Cache lines per normalization unit
-  Probe Req:
-    rst: The number of coherence probe requests made to the L2 cache from outside  the
-      accelerator. On an :ref:`MI2XX <mixxx-note>`, probe requests may be  generated
-      by, for example, writes to  :ref:`fine-grained device <memory-type>` memory
-      or by writes to  :ref:`coarse-grained <memory-type>` device memory.
-    unit: Requests per normalization unit
-  Atomic Req:
-    rst: The total number of atomic requests (with and without return) to the L2 from
-      all clients.
-    unit: Requests per normalization unit
-  Evict (Internal):
-    rst: The total number of L2 cache lines evicted from the cache due to capacity  limits,
-      per :ref:`normalization unit <normalization-units>`.
+  Writeback (vL1D Req):
+    rst: The total number of L2 cache lines written back to memory due to requests  initiated
+      by the :doc:`vL1D cache <vector-l1-cache>`, per  :ref:`normalization unit <normalization-units>`.
     unit: Cache lines per normalization unit
-  Read Req:
-    rst: 'The total number of read requests to the L2 from all clients.  '
-    unit: Requests per normalization unit
 L2-Fabric interface metrics:
-  Remote Write and Atomic Traffic:
-    rst: The percent of read requests generated by the L2 cache that are routed to  any
-      memory location other than the accelerator's local high-bandwidth  memory (HBM)
-      -- for example, the CPU's DRAM or a remote accelerator's  HBM. This breakdown
-      does not consider the *size* of the request (meaning  that 32B and 64B requests
-      are both counted as a single request), so this  metric only *approximates* the
-      percent of the L2-Fabric Read bandwidth  directed to a remote location. Note
-      that on current CDNA  accelerators, such as the :ref:`MI2XX <mixxx-note>`, requests
-      are only  considered *atomic* by Infinity Fabric if they are targeted at  :ref:`fine-grained
-      memory <memory-type>` allocations or  :ref:`uncached memory <memory-type>` allocations.
-    unit: Percent
-  Remote Read Traffic:
-    rst: The percent of read requests generated by the L2 cache that are routed to  any
-      memory location other than the accelerator's local high-bandwidth  memory (HBM)
-      -- for example, the CPU's DRAM or a remote accelerator's  HBM. This breakdown
-      does not consider the *size* of the request (meaning  that 32B and 64B requests
-      are both counted as a single request), so this  metric only *approximates* the
-      percent of the L2-Fabric Read bandwidth  directed to a remote location.
-    unit: Percent
-  Uncached Write and Atomic Traffic:
-    rst: The percent of write and atomic requests generated by the L2 cache that  are
-      targeting :ref:`uncached memory allocations <memory-type>`. This  breakdown
-      does not consider the *size* of the request (meaning that 32B  and 64B requests
-      are both counted as a single request), so this metric  only *approximates* the
-      percent of the L2-Fabric read bandwidth directed  to uncached memory allocations.
-    unit: Percent
+  Atomic Latency:
+    rst: The time-averaged number of cycles atomic requests spent in Infinity Fabric
+      before a completion acknowledgement (atomic without return value) or data (atomic
+      with return value) was returned to the L2.
+    unit: Cycles
   Atomic Traffic:
     rst: The percent of write requests generated by the L2 cache that are atomic  requests
       to *any* memory location. This breakdown does not consider the  *size* of the
@@ -888,20 +867,6 @@ L2-Fabric interface metrics:
       as a single request), so this metric only *approximates*  the percent of the
       L2-Fabric Read bandwidth directed to the local HBM.
     unit: Percent
-  Write and Atomic Latency:
-    rst: The time-averaged number of cycles write requests spent in Infinity Fabric
-      before a completion acknowledgement was returned to the L2.
-    unit: Cycles
-  Read Latency:
-    rst: The time-averaged number of cycles read requests spent in Infinity Fabric  before
-      data was returned to the L2.
-    unit: Cycles
-  Read Stall:
-    rst: "The ratio of the total number of cycles the L2-Fabric interface was stalled\
-      \ on a read request to any destination (local HBM, remote PCIe\xAE connected\
-      \ accelerator or CPU, or remote Infinity Fabric connected accelerator [#inf]_\
-      \ or CPU) over the :ref:`total active L2 cycles <total-active-l2-cycles>`."
-    unit: Percent
   HBM Write and Atomic Traffic:
     rst: The percent of write and atomic requests generated by the L2 cache that  are
       routed to the accelerator's local high-bandwidth memory (HBM). This  breakdown
@@ -913,28 +878,38 @@ L2-Fabric interface metrics:
       at  :ref:`fine-grained memory <memory-type>` allocations or  :ref:`uncached
       memory <memory-type>` allocations.
     unit: Percent
-  Write and Atomic BW:
-    rst: The total number of bytes written by the L2 over Infinity Fabric by write  and
-      atomic operations per  :ref:`normalization unit <normalization-units>`. Note
-      that on current  CDNA accelerators, such as the :ref:`MI2XX <mixxx-note>`, requests
-      are  only considered *atomic* by Infinity Fabric if they are targeted at  non-write-cacheable
-      memory, for example,  :ref:`fine-grained memory <memory-type>` allocations or  :ref:`uncached
-      memory <memory-type>` allocations on the  MI2XX.
-    unit: Bytes per normalization unit
   Read BW:
     rst: The total number of bytes read by the L2 cache from Infinity Fabric per  :ref:`normalization
       unit <normalization-units>`.
     unit: Bytes per normalization unit
-  Atomic Latency:
-    rst: The time-averaged number of cycles atomic requests spent in Infinity Fabric
-      before a completion acknowledgement (atomic without return value) or data (atomic
-      with return value) was returned to the L2.
+  Read Latency:
+    rst: The time-averaged number of cycles read requests spent in Infinity Fabric  before
+      data was returned to the L2.
     unit: Cycles
-  Write Stall:
-    rst: The ratio of the total number of cycles the L2-Fabric interface was stalled
-      on a write or atomic request to any destination (local HBM, remote accelerator
-      or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected
-      accelerator [#inf]_ or CPU) over the :ref:`total active L2 cycles <total-active-l2-cycles>`.
+  Read Stall:
+    rst: "The ratio of the total number of cycles the L2-Fabric interface was stalled\
+      \ on a read request to any destination (local HBM, remote PCIe\xAE connected\
+      \ accelerator or CPU, or remote Infinity Fabric connected accelerator [#inf]_\
+      \ or CPU) over the :ref:`total active L2 cycles <total-active-l2-cycles>`."
+    unit: Percent
+  Remote Read Traffic:
+    rst: The percent of read requests generated by the L2 cache that are routed to  any
+      memory location other than the accelerator's local high-bandwidth  memory (HBM)
+      -- for example, the CPU's DRAM or a remote accelerator's  HBM. This breakdown
+      does not consider the *size* of the request (meaning  that 32B and 64B requests
+      are both counted as a single request), so this  metric only *approximates* the
+      percent of the L2-Fabric Read bandwidth  directed to a remote location.
+    unit: Percent
+  Remote Write and Atomic Traffic:
+    rst: The percent of write and atomic requests generated by the L2 cache that are routed to  any
+      memory location other than the accelerator's local high-bandwidth  memory (HBM)
+      -- for example, the CPU's DRAM or a remote accelerator's  HBM. This breakdown
+      does not consider the *size* of the request (meaning  that 32B and 64B requests
+      are both counted as a single request), so this  metric only *approximates* the
+      percent of the L2-Fabric write and atomic bandwidth  directed to a remote location. Note
+      that on current CDNA  accelerators, such as the :ref:`MI2XX <mixxx-note>`, requests
+      are only  considered *atomic* by Infinity Fabric if they are targeted at  :ref:`fine-grained
+      memory <memory-type>` allocations or  :ref:`uncached memory <memory-type>` allocations.
     unit: Percent
   Uncached Read Traffic:
     rst: The percent of read requests generated by the L2 cache that are reading  from
@@ -947,22 +922,77 @@ L2-Fabric interface metrics:
       the percent of the  L2-Fabric read bandwidth directed to an uncached memory
       location.
     unit: Percent
+  Uncached Write and Atomic Traffic:
+    rst: The percent of write and atomic requests generated by the L2 cache that  are
+      targeting :ref:`uncached memory allocations <memory-type>`. This  breakdown
+      does not consider the *size* of the request (meaning that 32B  and 64B requests
+      are both counted as a single request), so this metric  only *approximates* the
+      percent of the L2-Fabric write and atomic bandwidth directed  to uncached memory allocations.
+    unit: Percent
+  Write Stall:
+    rst: The ratio of the total number of cycles the L2-Fabric interface was stalled
+      on a write or atomic request to any destination (local HBM, remote accelerator
+      or CPU, PCIe connected accelerator or CPU, or remote Infinity Fabric connected
+      accelerator [#inf]_ or CPU) over the :ref:`total active L2 cycles <total-active-l2-cycles>`.
+    unit: Percent
+  Write and Atomic BW:
+    rst: The total number of bytes written by the L2 over Infinity Fabric by write  and
+      atomic operations per  :ref:`normalization unit <normalization-units>`. Note
+      that on current  CDNA accelerators, such as the :ref:`MI2XX <mixxx-note>`, requests
+      are  only considered *atomic* by Infinity Fabric if they are targeted at  non-write-cacheable
+      memory, for example,  :ref:`fine-grained memory <memory-type>` allocations or  :ref:`uncached
+      memory <memory-type>` allocations on the  MI2XX.
+    unit: Bytes per normalization unit
+  Write and Atomic Latency:
+    rst: The time-averaged number of cycles write requests spent in Infinity Fabric
+      before a completion acknowledgement was returned to the L2.
+    unit: Cycles
 L2 - Fabric interface detailed metrics:
-  Remote Write and Atomic:
+  Atomic:
+    rst: The total number of L2 requests to Infinity Fabric to atomically update  32B
+      or 64B of data in any memory location, per  :ref:`normalization unit <normalization-units>`.
+      See  :ref:`l2-request-flow` for more detail. Note that on current CDNA  accelerators,
+      such as the :ref:`MI2XX <mixxx-note>`, requests are only  considered *atomic*
+      by Infinity Fabric if they are targeted at  non-write-cacheable memory, such
+      as  :ref:`fine-grained memory <memory-type>` allocations or  :ref:`uncached
+      memory <memory-type>` allocations on the MI2XX.
+    unit: Requests per normalization unit
+  HBM Read:
+    rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of  data
+      from the accelerator's local HBM, per  :ref:`normalization unit <normalization-units>`.
+      See  :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  HBM Write and Atomic:
     rst: The total number of L2 requests to Infinity Fabric to write or atomically  update
-      32B or 64B of data in any memory location other than the  accelerator's local
-      HBM, per  :ref:`normalization unit <normalization-units>`. See  :ref:`l2-request-flow`
-      for more detail.
+      32B or 64B of data in the accelerator's local HBM, per  :ref:`normalization
+      unit <normalization-units>`. See  :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Read (32B):
+    rst: The total number of L2 requests to Infinity Fabric to read 32B of data  from
+      any memory location, per  :ref:`normalization unit <normalization-units>`. See  :ref:`l2-request-flow`
+      for more detail. Typically unused on CDNA  accelerators.
     unit: Requests per normalization unit
   Read (64B):
     rst: The total number of L2 requests to Infinity Fabric to read 64B of data  from
       any memory location, per  :ref:`normalization unit <normalization-units>`. See  :ref:`l2-request-flow`
       for more detail.
     unit: Requests per normalization unit
-  Read (32B):
-    rst: The total number of L2 requests to Infinity Fabric to read 32B of data  from
-      any memory location, per  :ref:`normalization unit <normalization-units>`. See  :ref:`l2-request-flow`
-      for more detail. Typically unused on CDNA  accelerators.
+  Read (Uncached):
+    rst: The total number of L2 requests to Infinity Fabric to read  :ref:`uncached
+      data <memory-type>` from any memory location, per  :ref:`normalization unit
+      <normalization-units>`. 64B requests for  uncached data are counted as two 32B
+      uncached data requests. See  :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Remote Read:
+    rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of  data
+      from any source other than the accelerator's local HBM, per  :ref:`normalization
+      unit <normalization-units>`. See  :ref:`l2-request-flow` for more detail.
+    unit: Requests per normalization unit
+  Remote Write and Atomic:
+    rst: The total number of L2 requests to Infinity Fabric to write or atomically  update
+      32B or 64B of data in any memory location other than the  accelerator's local
+      HBM, per  :ref:`normalization unit <normalization-units>`. See  :ref:`l2-request-flow`
+      for more detail.
     unit: Requests per normalization unit
   Write and Atomic (32B):
     rst: The total number of L2 requests to Infinity Fabric to write or atomically  update
@@ -974,47 +1004,12 @@ L2 - Fabric interface detailed metrics:
       64B of data in any memory location, per  :ref:`normalization unit <normalization-units>`.
       See  :ref:`l2-request-flow` for more detail.
     unit: Requests per normalization unit
-  HBM Read:
-    rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of  data
-      from the accelerator's local HBM, per  :ref:`normalization unit <normalization-units>`.
-      See  :ref:`l2-request-flow` for more detail.
-    unit: Requests per normalization unit
-  Remote Read:
-    rst: The total number of L2 requests to Infinity Fabric to read 32B or 64B of  data
-      from any source other than the accelerator's local HBM, per  :ref:`normalization
-      unit <normalization-units>`. See  :ref:`l2-request-flow` for more detail.
-    unit: Requests per normalization unit
-  HBM Write and Atomic:
-    rst: The total number of L2 requests to Infinity Fabric to write or atomically  update
-      32B or 64B of data in the accelerator's local HBM, per  :ref:`normalization
-      unit <normalization-units>`. See  :ref:`l2-request-flow` for more detail.  plain
-    unit: Requests per normalization unit
-  Atomic:
-    rst: The total number of L2 requests to Infinity Fabric to atomically update  32B
-      or 64B of data in any memory location, per  :ref:`normalization unit <normalization-units>`.
-      See  :ref:`l2-request-flow` for more detail. Note that on current CDNA  accelerators,
-      such as the :ref:`MI2XX <mixxx-note>`, requests are only  considered *atomic*
-      by Infinity Fabric if they are targeted at  non-write-cacheable memory, such
-      as  :ref:`fine-grained memory <memory-type>` allocations or  :ref:`uncached
-      memory <memory-type>` allocations on the MI2XX.
-    unit: Requests per normalization unit
-  Read (Uncached):
-    rst: The total number of L2 requests to Infinity Fabric to read  :ref:`uncached
-      data <memory-type>` from any memory location, per  :ref:`normalization unit
-      <normalization-units>`. 64B requests for  uncached data are counted as two 32B
-      uncached data requests. See  :ref:`l2-request-flow` for more detail.
-    unit: Requests per normalization unit
   Write and Atomic (Uncached):
     rst: The total number of L2 requests to Infinity Fabric to write or atomically  update
       32B or 64B of :ref:`uncached data <memory-type>`, per  :ref:`normalization unit
       <normalization-units>`. See  :ref:`l2-request-flow` for more detail.
     unit: Requests per normalization unit
 L2 - Fabric Interface stalls:
-  Write - Credit Starvation:
-    rst: The number of cycles the L2-Fabric interface was stalled on write or  atomic
-      requests to any memory location because too many write/atomic  requests were
-      currently in flight, as a percent of the  :ref:`total active L2 cycles <total-active-l2-cycles>`.
-    unit: Percent
   Read - HBM Stall:
     rst: The number of cycles the L2-Fabric interface was stalled on read requests
       to the accelerator's local HBM as a percent of the :ref:`total active L2 cycles
@@ -1025,36 +1020,41 @@ L2 - Fabric Interface stalls:
       to remote Infinity Fabric connected accelerators [#inf]_ or CPUs as a percent
       of the :ref:`total active L2 cycles <total-active-l2-cycles>`.
     unit: Percent
-  Write - PCIe Stall:
+  Read - PCIe Stall:
+    rst: The number of cycles the L2-Fabric interface was stalled on read requests
+      to remote PCIe connected accelerators [#inf]_ or CPUs as a percent of the :ref:`total
+      active L2 cycles <total-active-l2-cycles>`.
+    unit: Percent
+  Write - Credit Starvation:
+    rst: The number of cycles the L2-Fabric interface was stalled on write or  atomic
+      requests to any memory location because too many write/atomic  requests were
+      currently in flight, as a percent of the  :ref:`total active L2 cycles <total-active-l2-cycles>`.
+    unit: Percent
+  Write - HBM Stall:
     rst: The number of cycles the L2-Fabric interface was stalled on write or atomic
-      requests to remote PCIe connected accelerators [#inf]_ or CPUs as a percent
-      of the :ref:`total active L2 cycles <total-active-l2-cycles>`.
+      requests to the accelerator's local HBM as a percent of the :ref:`total active L2 cycles <total-active-l2-cycles>`.
     unit: Percent
   Write - Infinity Fabric Stall:
     rst: The number of cycles the L2-Fabric interface was stalled on write or atomic
       requests to remote Infinity Fabric connected accelerators [#inf]_ or CPUs as
       a percent of the :ref:`total active L2 cycles <total-active-l2-cycles>`.
     unit: Percent
-  Write - HBM Stall:
+  Write - PCIe Stall:
     rst: The number of cycles the L2-Fabric interface was stalled on write or atomic
-      requests to accelerator's local HBM as a percent of the total active L2 cycles.
-    unit: Percent
-  Read - PCIe Stall:
-    rst: The number of cycles the L2-Fabric interface was stalled on read requests
-      to remote PCIe connected accelerators [#inf]_ or CPUs as a percent of the :ref:`total
-      active L2 cycles <total-active-l2-cycles>`.
+      requests to remote PCIe connected accelerators [#inf]_ or CPUs as a percent
+      of the :ref:`total active L2 cycles <total-active-l2-cycles>`.
     unit: Percent
 Scalar L1D Speed-of-Light:
-  Cache Hit Rate:
-    rst: Indicates the percent of sL1D requests that hit on a previously loaded  line
-      the cache. The ratio of the number of sL1D requests that hit  [#sl1d-cache]_
-      over the number of all sL1D requests.
-    unit: Percent
   Bandwidth:
     rst: The number of bytes looked up in the sL1D cache, as a percent of the peak  theoretical
       bandwidth. Calculated as the ratio of sL1D requests over the  :ref:`total sL1D
       cycles <total-sl1d-cycles>`.
     unit: Percent
+  Cache Hit Rate:
+    rst: Indicates the percent of sL1D requests that hit on a previously loaded  line
+      in the cache. The ratio of the number of sL1D requests that hit  [#sl1d-cache]_
+      over the number of all sL1D requests.
+    unit: Percent
   sL1D-L2 BW:
     rst: "The total number of bytes read from, written to, or atomically updated \
       \ across the sL1D\u2194:doc:`L2 <l2-cache>` interface, per  :ref:`normalization\
@@ -1063,6 +1063,30 @@ Scalar L1D Speed-of-Light:
       \ be interpreted as an sL1D\u2192L2 read bandwidth."
     unit: Bytes per normalization unit
 Scalar L1D cache accesses:
+  Atomic Req:
+    rst: The total number of atomic requests from sL1D to the  :doc:`L2 <l2-cache>`,
+      per  :ref:`normalization unit <normalization-units>`. Typically unused on  current
+      CDNA accelerators.
+    unit: Requests per normalization unit
+  Cache Hit Rate:
+    rst: Indicates the percent of sL1D requests that hit on a previously loaded  line
+      in the cache. The ratio of the number of sL1D requests that hit  [#sl1d-cache]_
+      over the number of all sL1D requests.
+    unit: Percent
+  Hits:
+    rst: The total number of sL1D requests that hit on a previously loaded cache  line,
+      per :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
+  Misses - Non Duplicated:
+    rst: The total number of sL1D requests that missed on a cache line that *was  not*
+      already pending due to another request, per  :ref:`normalization unit <normalization-units>`.
+      See :ref:`desc-sl1d-sol`  for more detail.
+    unit: Requests per normalization unit
+  Misses- Duplicated:
+    rst: The total number of sL1D requests that missed on a cache line that *was*  already
+      pending due to another request, per  :ref:`normalization unit <normalization-units>`.
+      See  :ref:`desc-sl1d-sol` for more detail.
+    unit: Requests per normalization unit
   Read Req (1 DWord):
     rst: The total number of sL1D read requests made for a single dword of data  (4B),
       per :ref:`normalization unit <normalization-units>`.
@@ -1071,38 +1095,10 @@ Scalar L1D cache accesses:
     rst: The total number of sL1D read requests made for a sixteen dwords of data  (64B),
       per :ref:`normalization unit <normalization-units>`.
     unit: Requests per normalization unit
-  Atomic Req:
-    rst: The total number of atomic requests from sL1D to the  :doc:`L2 <l2-cache>`,
-      per  :ref:`normalization unit <normalization-units>`. Typically unused on  current
-      CDNA accelerators.
-    unit: Requests per normalization unit
-  Misses - Non Duplicated:
-    rst: The total number of sL1D requests that missed on a cache line that *was  not*
-      already pending due to another request, per  :ref:`normalization unit <normalization-units>`.
-      See :ref:`desc-sl1d-sol`  for more detail.
-    unit: Requests per normalization unit
   Read Req (2 DWord):
     rst: The total number of sL1D read requests made for a two dwords of data  (8B),
       per :ref:`normalization unit <normalization-units>`.
     unit: Requests per normalization unit
-  Hits:
-    rst: The total number of sL1D requests that hit on a previously loaded cache  line,
-      per :ref:`normalization unit <normalization-units>`.
-    unit: Requests per normalization unit
-  Cache Hit Rate:
-    rst: Indicates the percent of sL1D requests that hit on a previously loaded  line
-      the cache. The ratio of the number of sL1D requests that hit  [#sl1d-cache]_
-      over the number of all sL1D requests.
-    unit: Percent
-  Misses- Duplicated:
-    rst: The total number of sL1D requests that missed on a cache line that *was*  already
-      pending due to another request, per  :ref:`normalization unit <normalization-units>`.
-      See  :ref:`desc-sl1d-sol` for more detail.
-    unit: Requests per normalization unit
-  Req:
-    rst: The total number of requests, of any size or type, made to the sL1D per  :ref:`normalization
-      unit <normalization-units>`.
-    unit: Requests per normalization unit
   Read Req (4 DWord):
     rst: The total number of sL1D read requests made for a four dwords of data  (16B),
       per :ref:`normalization unit <normalization-units>`.
@@ -1115,19 +1111,20 @@ Scalar L1D cache accesses:
     rst: The total number of sL1D read requests of any size, per  :ref:`normalization
       unit <normalization-units>`.
     unit: Requests per normalization unit
+  Req:
+    rst: The total number of requests, of any size or type, made to the sL1D per  :ref:`normalization
+      unit <normalization-units>`.
+    unit: Requests per normalization unit
 Scalar L1D Cache - L2 Interface:
-  sL1D-L2 BW:
-    rst: "The total number of bytes read from, written to, or atomically updated \
-      \ across the sL1D\u2194:doc:`L2 <l2-cache>` interface, per  :ref:`normalization\
-      \ unit <normalization-units>`. Note that sL1D writes  and atomics are typically\
-      \ unused on current CDNA accelerators, so in the  majority of cases this can\
-      \ be interpreted as an sL1D\u2192L2 read bandwidth."
-    unit: Bytes per normalization unit
   Atomic Req:
     rst: The total number of atomic requests from sL1D to the  :doc:`L2 <l2-cache>`,
       per  :ref:`normalization unit <normalization-units>`. Typically unused on  current
       CDNA accelerators.
     unit: Requests per normalization unit
+  Read Req:
+    rst: The total number of read requests from sL1D to the :doc:`L2 <l2-cache>`,  per
+      :ref:`normalization unit <normalization-units>`.
+    unit: Requests per normalization unit
   Stall Cycles:
     rst: "The total number of cycles the sL1D\u2194  :doc:`L2 <l2-cache>` interface\
       \ was stalled, per  :ref:`normalization unit <normalization-units>`."
@@ -1137,27 +1134,39 @@ Scalar L1D Cache - L2 Interface:
       :ref:`normalization unit <normalization-units>`. Typically unused on  current
       CDNA accelerators.
     unit: Requests per normalization unit
-  Read Req:
-    rst: The total number of read requests from sL1D to the :doc:`L2 <l2-cache>`,  per
-      :ref:`normalization unit <normalization-units>`.
-    unit: Requests per normalization unit
+  sL1D-L2 BW:
+    rst: "The total number of bytes read from, written to, or atomically updated \
+      \ across the sL1D\u2194:doc:`L2 <l2-cache>` interface, per  :ref:`normalization\
+      \ unit <normalization-units>`. Note that sL1D writes  and atomics are typically\
+      \ unused on current CDNA accelerators, so in the  majority of cases this can\
+      \ be interpreted as an sL1D\u2192L2 read bandwidth."
+    unit: Bytes per normalization unit
 L1I Speed-of-Light:
-  Cache Hit Rate:
-    rst: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded  line
-      the cache. Calculated as the ratio of the number of L1I requests  that hit over
-      the number of all L1I requests.
-    unit: Percent
   Bandwidth:
     rst: The number of bytes looked up in the L1I cache, as a percent of the peak  theoretical
       bandwidth. Calculated as the ratio of L1I requests over the  :ref:`total L1I
       cycles <total-l1i-cycles>`.
     unit: Percent
+  Cache Hit Rate:
+    rst: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded  line
+      in the cache. Calculated as the ratio of the number of L1I requests  that hit over
+      the number of all L1I requests.
+    unit: Percent
   L1I-L2 Bandwidth:
     rst: "The percent of the peak theoretical L1I \u2192 L2 cache request bandwidth\
       \  achieved. Calculated as the ratio of the total number of requests from  the\
       \ L1I to the L2 cache over the  :ref:`total L1I-L2 interface cycles <total-l1i-cycles>`."
     unit: Percent
 L1I cache accesses:
+  Cache Hit Rate:
+    rst: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded  line
+      in the cache. Calculated as the ratio of the number of L1I requests  that hit over
+      the number of all L1I requests.
+    unit: Percent
+  Hits:
+    rst: The total number of L1I requests that hit on a previously loaded cache  line,
+      per :ref:`normalization-unit <normalization-units>`.
+    unit: Requests per normalization unit
   Instruction Fetch Latency:
     rst: The average number of cycles spent to fetch instructions to a  :doc:`CU <compute-unit>`.
     unit: Cycles
@@ -1171,15 +1180,6 @@ L1I cache accesses:
       not* already pending due to another request, per  :ref:`normalization-unit <normalization-units>`.
       See note in  :ref:`desc-l1i-sol` for more detail.
     unit: Requests per normalization unit
-  Hits:
-    rst: The total number of L1I requests that hit on a previously loaded cache  line,
-      per :ref:`normalization-unit <normalization-units>`.
-    unit: Requests per normalization unit
-  Cache Hit Rate:
-    rst: The percent of L1I requests that hit [#l1i-cache]_ on a previously loaded  line
-      the cache. Calculated as the ratio of the number of L1I requests  that hit over
-      the number of all L1I requests.
-    unit: Percent
   Req:
     rst: The total number of requests made to the L1I per normalization-unit
     unit: Requests per normalization unit
@@ -1190,76 +1190,72 @@ L1I <-> L2 interface:
       \ L1I to the L2 cache over the  :ref:`total L1I-L2 interface cycles <total-l1i-cycles>`."
     unit: Percent
 Workgroup manager utilizations:
+  Accelerator Utilization:
+    rst: The percent of cycles in the kernel where the accelerator was actively doing
+      any work.
+    unit: Percent
+  Dispatched Wavefronts:
+    rst: The total number of wavefronts, summed over all workgroups, forming this
+      kernel launch.
+    unit: Wavefronts
+  Dispatched Workgroups:
+    rst: The total number of workgroups forming this kernel launch.
+    unit: Workgroups
+  SGPR Writes:
+    rst: The average number of cycles spent initializing :ref:`SGPRs <desc-salu>`  at
+      wave creation.
+    unit: Cycles/wave
   SIMD Utilization:
     rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel  where
       any :ref:`SIMD <desc-valu>` on a CU was actively doing any work,  summed over
       all CUs. Low values (less than 100%) indicate that the  accelerator was not
       fully saturated by the kernel, or a potential  load-imbalance issue.
     unit: Percent
-  Workgroup Manager Utilization:
-    rst: The percent of cycles in the kernel where the workgroup manager was actively
-      doing any work.
-    unit: Percent
-  Accelerator Utilization:
-    rst: The percent of cycles in the kernel where the accelerator was actively doing
-      any work.
-    unit: Percent
-  Dispatched Workgroups:
-    rst: The total number of workgroups forming this kernel launch.
-    unit: Workgroups
   Scheduler-Pipe Utilization:
     rst: 'The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>` in  the
       kernel where the scheduler-pipes were actively doing any work. Note:  this value
       is expected to range between 0% and 25%. See :ref:`desc-spi`.'
     unit: Percent
-  Dispatched Wavefronts:
-    rst: The total number of wavefronts, summed over all workgroups, forming this
-      kernel launch.
-    unit: Wavefronts
-  SGPR Writes:
-    rst: The average number of cycles spent initializing :ref:`SGPRs <desc-salu>`  at
-      wave creation.
-    unit: Cycles/wave
-  VGPR Writes:
-    rst: The average number of cycles spent initializing :ref:`VGPRs <desc-valu>`  at
-      wave creation.
-    unit: Cycles/wave
   Shader Engine Utilization:
     rst: The percent of :ref:`total shader engine cycles <total-se-cycles>` in the  kernel
       where any CU in a shader-engine was actively doing any work,  normalized over
       all shader-engines. Low values (e.g., << 100%) indicate  that the accelerator
       was not fully saturated by the kernel, or a  potential load-imbalance issue.
     unit: Percent
+  VGPR Writes:
+    rst: The average number of cycles spent initializing :ref:`VGPRs <desc-valu>`  at
+      wave creation.
+    unit: Cycles/wave
+  Workgroup Manager Utilization:
+    rst: The percent of cycles in the kernel where the workgroup manager was actively
+      doing any work.
+    unit: Percent
 Workgroup Manager - Resource Allocation:
   Insufficient CU Barriers:
     rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel  where
       a workgroup could not be scheduled to a :doc:`CU <compute-unit>`  due to lack
       of available :ref:`barriers <desc-barrier>`.
     unit: Percent
+  Insufficient CU LDS:
+    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel  where
+      a workgroup could not be scheduled to a :doc:`CU <compute-unit>`  due to lack
+      of available :doc:`LDS <local-data-share>`.
+    unit: Percent
   Insufficient SIMD SGPRs:
     rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel  where
       a workgroup could not be scheduled to a :ref:`SIMD <desc-valu>`  due to lack
       of available :ref:`SGPRs <desc-salu>`.
     unit: Percent
+  Insufficient SIMD VGPRs:
+    rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel  where
+      a workgroup could not be scheduled to a  :ref:`SIMD <desc-valu>`  due to lack
+      of available :ref:`VGPRs <desc-valu>`.
+    unit: Percent
   Insufficient SIMD Waveslots:
     rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel  where
       a workgroup could not be scheduled to a  :ref:`SIMD <desc-valu>`  due to lack
       of available :ref:`waveslots <desc-valu>`.
     unit: Percent
-  Reached CU Workgroup Limit:
-    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel  where
-      a workgroup could not be scheduled to a :doc:`CU <compute-unit>`  due to limits
-      within the workgroup manager.  This is expected to be  always be zero on CDNA2
-      or newer accelerators (and small for previous  accelerators).
-    unit: Percent
-  Scratch Stall Rate:
-    rst: The percent of :ref:`total shader-engine cycles <total-se-cycles>` in the  kernel
-      where a workgroup could not be scheduled to a  :doc:`CU <compute-unit>` due
-      to lack of  :ref:`private (a.k.a., scratch) memory <memory-type>` slots. While
-      this  can reach up to 100%, note that the actual occupancy limitations on a  kernel
-      using private memory are typically quite small (for example, less  than 1% of
-      the total number of waves that can be scheduled to an  accelerator).
-    unit: Percent
   Not-scheduled Rate (Scheduler-Pipe):
     rst: 'The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>` in  the
       kernel where a workgroup could not be scheduled to a  :doc:`CU <compute-unit>`
@@ -1267,23 +1263,6 @@ Workgroup Manager - Resource Allocation:
       :ref:`SIMD <desc-valu>` with sufficient  resources. Note: this value is expected
       to range between 0-25%, see note  in :ref:`workgroup manager <desc-spi>` description.'
     unit: Percent
-  Scheduler-Pipe Stall Rate:
-    rst: 'The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>` in  the
-      kernel where a workgroup could not be scheduled to a  :doc:`CU <compute-unit>`
-      due to occupancy limitations (like a lack of a  CU or :ref:`SIMD <desc-valu>`
-      with sufficient resources). Note: this  value is expected to range between 0-25%,
-      see note in  :ref:`workgroup manager <desc-spi>` description.'
-    unit: Percent
-  Insufficient SIMD VGPRs:
-    rst: The percent of :ref:`total SIMD cycles <total-simd-cycles>` in the kernel  where
-      a workgroup could not be scheduled to a  :ref:`SIMD <desc-valu>`  due to lack
-      of available :ref:`VGPRs <desc-valu>`.
-    unit: Percent
-  Insufficient CU LDS:
-    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel  where
-      a workgroup could not be scheduled to a :doc:`CU <compute-unit>`  due to lack
-      of available :doc:`LDS <local-data-share>`.
-    unit: Percent
   Not-scheduled Rate (Workgroup Manager):
     rst: 'The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>` in  the
       kernel where a workgroup could not be scheduled to a  :doc:`CU <compute-unit>`
@@ -1297,156 +1276,101 @@ Workgroup Manager - Resource Allocation:
       within the workgroup manager.  This is expected to be  always be zero on CDNA2
       or newer accelerators (and small for previous  accelerators).
     unit: Percent
+  Reached CU Workgroup Limit:
+    rst: The percent of :ref:`total CU cycles <total-cu-cycles>` in the kernel  where
+      a workgroup could not be scheduled to a :doc:`CU <compute-unit>`  due to limits
+      within the workgroup manager.  This is expected to always be zero on CDNA2
+      or newer accelerators (and small for previous  accelerators).
+    unit: Percent
+  Scheduler-Pipe Stall Rate:
+    rst: 'The percent of :ref:`total scheduler-pipe cycles <total-pipe-cycles>` in  the
+      kernel where a workgroup could not be scheduled to a  :doc:`CU <compute-unit>`
+      due to occupancy limitations (like a lack of a  CU or :ref:`SIMD <desc-valu>`
+      with sufficient resources). Note: this  value is expected to range between 0-25%,
+      see note in  :ref:`workgroup manager <desc-spi>` description.'
+    unit: Percent
+  Scratch Stall Rate:
+    rst: The percent of :ref:`total shader-engine cycles <total-se-cycles>` in the  kernel
+      where a workgroup could not be scheduled to a  :doc:`CU <compute-unit>` due
+      to lack of  :ref:`private (a.k.a., scratch) memory <memory-type>` slots. While
+      this  can reach up to 100%, note that the actual occupancy limitations on a  kernel
+      using private memory are typically quite small (for example, less  than 1% of
+      the total number of waves that can be scheduled to an  accelerator).
+    unit: Percent
 Command processor fetcher (CPF):
   CPF Stall:
     rst: Percent of CPF busy cycles where the CPF was stalled for any reason.
     unit: Percent
+  CPF Utilization:
+    rst: Percent of total cycles where the CPF was busy actively doing any work. The
+      ratio of CPF busy cycles over total cycles counted by the CPF.
+    unit: Percent
+  CPF-L2 Stall:
+    rst: Percent of CPF-:doc:`L2 <l2-cache>` busy cycles where the CPF-L2 interface
+      was stalled for any reason.
+    unit: Percent
   CPF-L2 Utilization:
     rst: Percent of total cycles counted by the CPF-:doc:`L2 <l2-cache>` interface  where
       the CPF-L2 interface was active doing any work. The ratio of CPF-L2  busy cycles
       over total cycles counted by the CPF-L2.
     unit: Percent
-  CPF-L2 Stall:
-    rst: Percent of CPF-:doc:`L2 <l2-cache>` L2 busy cycles where the CPF-L2 interface
-      was stalled for any reason.
-    unit: Percent
   CPF-UTCL1 Stall:
     rst: Percent of CPF busy cycles where the CPF was stalled by address translation.
     unit: Percent
-  CPF Utilization:
-    rst: Percent of total cycles where the CPF was busy actively doing any work. The
-      ratio of CPF busy cycles over total cycles counted by the CPF.
-    unit: Percent
 Command processor packet processor (CPC):
-  CPC-Workgroup Manager Utilization:
-    rst: Percent of CPC busy cycles spent dispatching workgroups to the  :ref:`workgroup
-      manager <desc-spi>`.
-    unit: Percent
-  CPC-UTCL2 Utilization:
-    rst: Percent of total cycles counted by the CPC's :doc:`L2 <l2-cache>` address  translation
-      interface where the CPC was busy doing address translation  work.
-    unit: Percent
   CPC Packet Decoding Utilization:
     rst: Percent of CPC busy cycles spent decoding commands for processing.
     unit: Percent
-  CPC-UTCL1 Stall:
-    rst: Percent of CPC busy cycles where the CPC was stalled by address translation
-    unit: Percent
-  CPC-L2 Utilization:
-    rst: Percent of total cycles counted by the CPC-:doc:`L2 <l2-cache>` interface  where
-      the CPC-L2 interface was active doing any work.
+  CPC Stall Rate:
+    rst: Percent of CPC busy cycles where the CPC was stalled for any reason.
     unit: Percent
   CPC Utilization:
     rst: Percent of total cycles where the CPC was busy actively doing any work. The
       ratio of CPC busy cycles over total cycles counted by the CPC.
     unit: Percent
-  CPC Stall Rate:
-    rst: Percent of CPC busy cycles where the CPC was stalled for any reason.
+  CPC-L2 Utilization:
+    rst: Percent of total cycles counted by the CPC-:doc:`L2 <l2-cache>` interface  where
+      the CPC-L2 interface was active doing any work.
+    unit: Percent
+  CPC-UTCL1 Stall:
+    rst: Percent of CPC busy cycles where the CPC was stalled by address translation.
+    unit: Percent
+  CPC-UTCL2 Utilization:
+    rst: Percent of total cycles counted by the CPC's :doc:`L2 <l2-cache>` address  translation
+      interface where the CPC was busy doing address translation  work.
+    unit: Percent
+  CPC-Workgroup Manager Utilization:
+    rst: Percent of CPC busy cycles spent dispatching workgroups to the  :ref:`workgroup
+      manager <desc-spi>`.
     unit: Percent
 System Speed-of-Light:
-  sL1D Cache BW:
-    rst: The number of bytes looked up in the sL1D cache per unit time. This is  also
-      presented as a percent of the peak theoretical bandwidth achievable  on the
-      specific accelerator.
-    unit: GB/s
-  VALU Utilization:
-    rst: Indicates what percent of the kernel's duration the :ref:`VALU <desc-valu>`
-      was busy executing instructions. Does not include :ref:`VMEM <desc-vmem>` operations.
-      Computed as the ratio of the total number of cycles spent by the :ref:`scheduler
-      <desc-scheduler>` issuing VALU instructions over the :ref:`total CU cycles <total-cu-cycles>`.
-    unit: Percent
-  LDS Bank Conflicts/Access:
-    rst: The ratio of the number of cycles spent in the  :doc:`LDS scheduler <local-data-share>`
-      due to bank conflicts (as  determined by the conflict resolution hardware) to
-      the base number of  cycles that would be spent in the LDS scheduler in a completely  uncontended
-      case. This is also presented in normalized form (i.e., the  Bank Conflict Rate).
-    unit: Conflicts/Access
-  VALU IOPs:
-    rst: 'The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
-      This is also presented as a percent of the peak theoretical IOPs achievable
-      on the specific accelerator. Note: this does not include any integer operations
-      from :ref:`MFMA <desc-mfma>` instructions.'
-    unit: GOIPs
-  L2-Fabric Write Latency:
-    rst: The time-averaged number of cycles write requests spent in Infinity  Fabric
-      before a completion acknowledgement was returned to the L2.
-    unit: Cycles
-  IPC:
-    rst: The ratio of the total number of instructions executed on the :doc:`CU <compute-unit>`
-      over the :ref:`total active CU cycles <total-active-cu-cycles>`.
-    unit: Instructions per-cycle
-  SALU Utilization:
-    rst: Indicates what percent of the kernel's duration the :ref:`SALU <desc-salu>`
-      was busy executing instructions. Computed as the ratio of the total number of
-      cycles spent by the :ref:`scheduler <desc-scheduler>` issuing SALU / :ref:`SMEM
-      <desc-smem>` instructions over the :ref:`total CU cycles <total-cu-cycles>`.
-    unit: Percent
-  L1I Hit Rate:
-    rst: The percent of L1I requests that hit on a previously loaded line the  cache.
-      Calculated as the ratio of the number of L1I requests that hit  over the number
-      of all L1I requests.
-    unit: GB/s
-  VMEM Utilization:
-    rst: Indicates what percent of the kernel's duration the :ref:`VMEM <desc-vmem>`
-      unit was busy executing instructions, including both global/generic and spill/scratch
-      operations (see the :ref:`VMEM instruction count metrics <ta-instruction-counts>`
-      for more detail).  Does not include :ref:`VALU <desc-valu>` operations. Computed
-      as the ratio of the total number of cycles spent by the :ref:`scheduler <desc-scheduler>`
-      issuing VMEM instructions over the :ref:`total CU cycles <total-cu-cycles>`.
-    unit: Percent
-  MFMA FLOPs (F64):
-    rst: 'The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
-      executed per second. Note: this does not include any 64-bit floating point operations
-      from :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
-      of the peak theoretical F64 MFMA operations achievable on the specific accelerator.'
-    unit: GFLOPs
-  Wavefront Occupancy:
-    rst: 'The time-averaged number of wavefronts resident on the accelerator over
-      the lifetime of the kernel. Note: this metric may be inaccurate for short-running
-      kernels (less than 1ms). This is also presented as a percent of the peak theoretical
-      occupancy achievable on the specific accelerator.'
-    unit: Wavefronts
-  MFMA FLOPs (BF16):
-    rst: 'The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
-      operations executed per second. Note: this does not include any 16-bit brain
-      floating point operations from :ref:`VALU <desc-valu>` instructions. This is
-      also presented as a percent of the peak theoretical BF16 MFMA operations achievable
-      on the specific accelerator.'
-    unit: GFLOPs
+  Active CUs:
+    rst: Total number of active compute units (CUs) on the accelerator during the
+      kernel execution.
+    unit: Number
   Branch Utilization:
     rst: Indicates what percent of the kernel's duration the :ref:`branch <desc-branch>`
       unit was busy executing instructions. Computed as the ratio of the total number
       of cycles spent by the :ref:`scheduler <desc-scheduler>` issuing branch instructions
       over the :ref:`total CU cycles <total-cu-cycles>`.
     unit: Percent
-  Theoretical LDS Bandwidth:
-    rst: Indicates the maximum amount of bytes that could have been loaded from, stored
-      to, or atomically updated in the LDS per unit time (see :ref:`LDS Bandwidth
-      <lds-bandwidth>` example for more detail). This is also presented as a percent
-      of the peak theoretical F64 MFMA operations achievable on the specific accelerator.
-    unit: GB/s
-  L2-Fabric Read Latency:
-    rst: The time-averaged number of cycles read requests spent in Infinity Fabric  before
-      data was returned to the L2.
-    unit: Cycles
-  MFMA Utilization:
-    rst: Indicates what percent of the kernel's duration the :ref:`MFMA <desc-mfma>`
-      unit was busy executing instructions. Computed as the ratio of the total number
-      of cycles spent by the :ref:`MFMA <desc-salu>` was busy over the :ref:`total
-      CU cycles <total-cu-cycles>`.
+  IPC:
+    rst: The ratio of the total number of instructions executed on the :doc:`CU <compute-unit>`
+      over the :ref:`total active CU cycles <total-active-cu-cycles>`.
+    unit: Instructions per-cycle
+  L1I BW:
+    rst: The number of bytes looked up in the L1I cache per unit time. This is  also
+      presented as a percent of the peak theoretical bandwidth achievable  on the
+      specific accelerator.
     unit: Percent
-  MFMA IOPs (Int8):
-    rst: 'The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
-      per second. Note: this does not include any 8-bit integer operations from :ref:`VALU
-      <desc-valu>` instructions. This is also presented as a percent of the peak theoretical
-      INT8 MFMA operations achievable on the specific accelerator.'
-    unit: GIOPs
-  VALU FLOPs:
-    rst: 'The total floating-point operations executed per second on the :ref:`VALU
-      <desc-valu>`. This is also presented as a percent of the peak theoretical FLOPs
-      achievable on the specific accelerator. Note: this does not include any floating-point
-      operations from :ref:`MFMA <desc-mfma>` instructions.'
-    unit: GFLOPs
+  L1I Fetch Latency:
+    rst: The average number of cycles spent to fetch instructions to a  :doc:`CU <compute-unit>`.
+    unit: Cycles
+  L1I Hit Rate:
+    rst: The percent of L1I requests that hit on a previously loaded line in the cache.
+      Calculated as the ratio of the number of L1I requests that hit  over the number
+      of all L1I requests.
+    unit: Percent
   L2 Cache BW:
     rst: The number of bytes looked up in the L2 cache per unit time.  The number  of
       bytes is calculated as the number of cache lines requested multiplied  by the
@@ -1455,50 +1379,6 @@ System Speed-of-Light:
       be counted as a full cache line. This is also  presented as a percent of the
       peak theoretical bandwidth achievable on  the specific accelerator.
     unit: GB/s
-  VALU Active Threads:
-    rst: Indicates the average level of :ref:`divergence <desc-divergence>` within
-      a wavefront over the lifetime of the kernel. The number of work-items that were
-      active in a wavefront during execution of each :ref:`VALU <desc-valu>` instruction,
-      time-averaged over all VALU instructions run on all wavefronts in the kernel.
-    unit: Work-items
-  MFMA FLOPs (F16):
-    rst: 'The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
-      executed per second. Note: this does not include any 16-bit floating point operations
-      from :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
-      of the peak theoretical F16 MFMA operations achievable on the specific accelerator.'
-    unit: GFLOPs
-  L1I BW:
-    rst: The number of bytes looked up in the L1I cache per unit time. This is  also
-      presented as a percent of the peak theoretical bandwidth achievable  on the
-      specific accelerator.
-    unit: Percent
-  MFMA FLOPs (F8):
-    rst: 'The total number of 8-bit brain floating point :ref:`MFMA <desc-mfma>` operations
-      executed per second. Note: this does not include any 16-bit brain floating point
-      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
-      as a percent of the peak theoretical F8 MFMA operations achievable on the specific
-      accelerator. It is supported on AMD Instinct MI300 series and later only.'
-    unit: GFLOPs
-  L2-Fabric Write BW:
-    rst: The number of bytes sent by the L2 over the  :ref:`Infinity Fabric interface
-      <l2-fabric>` by write and atomic  operations per unit time. This is also presented
-      as a percent of the peak  theoretical bandwidth achievable on the specific accelerator.
-    unit: GB/s
-  MFMA FLOPs (F32):
-    rst: 'The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
-      executed per second. Note: this does not include any 32-bit floating point operations
-      from :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
-      of the peak theoretical F32 MFMA operations achievable on the specific accelerator.'
-    unit: GFLOPs
-  sL1D Cache Hit Rate:
-    rst: The percent of sL1D requests that hit on a previously loaded line the  cache.
-      Calculated as the ratio of the number of sL1D requests that hit  over the number
-      of all sL1D requests.
-    unit: Percent
-  Active CUs:
-    rst: Total number of active compute units (CUs) on the accelerator during the
-      kernel execution.
-    unit: Number
   L2 Cache Hit Rate:
     rst: The ratio of the number of L2 cache line requests that hit in the L2  cache
       over the total number of incoming cache line requests to the L2  cache.
@@ -1508,13 +1388,129 @@ System Speed-of-Light:
       \ interface <l2-fabric>` per unit time. This is also  presented as a percent\
       \ of the peak theoretical bandwidth achievable on  the specific accelerator."
     unit: GB/s
-  vL1D Cache Hit Rate:
-    rst: The ratio of the number of vL1D cache line requests that hit in vL1D  cache
-      over the total number of cache line requests to the  :ref:`vL1D cache RAM <desc-tc>`.
-    unit: Percent
-  L1I Fetch Latency:
-    rst: The average number of cycles spent to fetch instructions to a  :doc:`CU <compute-unit>`.
+  L2-Fabric Read Latency:
+    rst: The time-averaged number of cycles read requests spent in Infinity Fabric  before
+      data was returned to the L2.
     unit: Cycles
+  L2-Fabric Write BW:
+    rst: The number of bytes sent by the L2 over the  :ref:`Infinity Fabric interface
+      <l2-fabric>` by write and atomic  operations per unit time. This is also presented
+      as a percent of the peak  theoretical bandwidth achievable on the specific accelerator.
+    unit: GB/s
+  L2-Fabric Write Latency:
+    rst: The time-averaged number of cycles write requests spent in Infinity  Fabric
+      before a completion acknowledgement was returned to the L2.
+    unit: Cycles
+  LDS Bank Conflicts/Access:
+    rst: The ratio of the number of cycles spent in the  :doc:`LDS scheduler <local-data-share>`
+      due to bank conflicts (as  determined by the conflict resolution hardware) to
+      the base number of  cycles that would be spent in the LDS scheduler in a completely  uncontended
+      case. This is also presented in normalized form (i.e., the  Bank Conflict Rate).
+    unit: Conflicts/Access
+  MFMA FLOPs (BF16):
+    rst: 'The total number of 16-bit brain floating point :ref:`MFMA <desc-mfma>`
+      operations executed per second. Note: this does not include any 16-bit brain
+      floating point operations from :ref:`VALU <desc-valu>` instructions. This is
+      also presented as a percent of the peak theoretical BF16 MFMA operations achievable
+      on the specific accelerator.'
+    unit: GFLOPs
+  MFMA FLOPs (F16):
+    rst: 'The total number of 16-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 16-bit floating point operations
+      from :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
+      of the peak theoretical F16 MFMA operations achievable on the specific accelerator.'
+    unit: GFLOPs
+  MFMA FLOPs (F32):
+    rst: 'The total number of 32-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 32-bit floating point operations
+      from :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
+      of the peak theoretical F32 MFMA operations achievable on the specific accelerator.'
+    unit: GFLOPs
+  MFMA FLOPs (F64):
+    rst: 'The total number of 64-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 64-bit floating point operations
+      from :ref:`VALU <desc-valu>` instructions. This is also presented as a percent
+      of the peak theoretical F64 MFMA operations achievable on the specific accelerator.'
+    unit: GFLOPs
+  MFMA FLOPs (F8):
+    rst: 'The total number of 8-bit floating point :ref:`MFMA <desc-mfma>` operations
+      executed per second. Note: this does not include any 8-bit floating point
+      operations from :ref:`VALU <desc-valu>` instructions. This is also presented
+      as a percent of the peak theoretical F8 MFMA operations achievable on the specific
+      accelerator. It is supported on AMD Instinct MI300 series and later only.'
+    unit: GFLOPs
+  MFMA IOPs (Int8):
+    rst: 'The total number of 8-bit integer :ref:`MFMA <desc-mfma>` operations executed
+      per second. Note: this does not include any 8-bit integer operations from :ref:`VALU
+      <desc-valu>` instructions. This is also presented as a percent of the peak theoretical
+      INT8 MFMA operations achievable on the specific accelerator.'
+    unit: GIOPs
+  MFMA Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`MFMA <desc-mfma>`
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles the :ref:`MFMA <desc-mfma>` was busy over the :ref:`total
+      CU cycles <total-cu-cycles>`.
+    unit: Percent
+  SALU Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`SALU <desc-salu>`
+      was busy executing instructions. Computed as the ratio of the total number of
+      cycles spent by the :ref:`scheduler <desc-scheduler>` issuing SALU / :ref:`SMEM
+      <desc-smem>` instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  Theoretical LDS Bandwidth:
+    rst: Indicates the maximum amount of bytes that could have been loaded from, stored
+      to, or atomically updated in the LDS per unit time (see :ref:`LDS Bandwidth
+      <lds-bandwidth>` example for more detail). This is also presented as a percent
+      of the peak theoretical bandwidth achievable on the specific accelerator.
+    unit: GB/s
+  VALU Active Threads:
+    rst: Indicates the average level of :ref:`divergence <desc-divergence>` within
+      a wavefront over the lifetime of the kernel. The number of work-items that were
+      active in a wavefront during execution of each :ref:`VALU <desc-valu>` instruction,
+      time-averaged over all VALU instructions run on all wavefronts in the kernel.
+    unit: Work-items
+  VALU FLOPs:
+    rst: 'The total floating-point operations executed per second on the :ref:`VALU
+      <desc-valu>`. This is also presented as a percent of the peak theoretical FLOPs
+      achievable on the specific accelerator. Note: this does not include any floating-point
+      operations from :ref:`MFMA <desc-mfma>` instructions.'
+    unit: GFLOPs
+  VALU IOPs:
+    rst: 'The total integer operations executed per second on the :ref:`VALU <desc-valu>`.
+      This is also presented as a percent of the peak theoretical IOPs achievable
+      on the specific accelerator. Note: this does not include any integer operations
+      from :ref:`MFMA <desc-mfma>` instructions.'
+    unit: GIOPs
+  VALU Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`VALU <desc-valu>`
+      was busy executing instructions. Does not include :ref:`VMEM <desc-vmem>` operations.
+      Computed as the ratio of the total number of cycles spent by the :ref:`scheduler
+      <desc-scheduler>` issuing VALU instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  VMEM Utilization:
+    rst: Indicates what percent of the kernel's duration the :ref:`VMEM <desc-vmem>`
+      unit was busy executing instructions, including both global/generic and spill/scratch
+      operations (see the :ref:`VMEM instruction count metrics <ta-instruction-counts>`
+      for more detail).  Does not include :ref:`VALU <desc-valu>` operations. Computed
+      as the ratio of the total number of cycles spent by the :ref:`scheduler <desc-scheduler>`
+      issuing VMEM instructions over the :ref:`total CU cycles <total-cu-cycles>`.
+    unit: Percent
+  Wavefront Occupancy:
+    rst: 'The time-averaged number of wavefronts resident on the accelerator over
+      the lifetime of the kernel. Note: this metric may be inaccurate for short-running
+      kernels (less than 1ms). This is also presented as a percent of the peak theoretical
+      occupancy achievable on the specific accelerator.'
+    unit: Wavefronts
+  sL1D Cache BW:
+    rst: The number of bytes looked up in the sL1D cache per unit time. This is  also
+      presented as a percent of the peak theoretical bandwidth achievable  on the
+      specific accelerator.
+    unit: GB/s
+  sL1D Cache Hit Rate:
+    rst: The percent of sL1D requests that hit on a previously loaded line in the cache.
+      Calculated as the ratio of the number of sL1D requests that hit  over the number
+      of all sL1D requests.
+    unit: Percent
   vL1D Cache BW:
     rst: The number of bytes looked up in the vL1D cache as a result of  :ref:`VMEM
       <desc-vmem>` instructions per unit time. The number of bytes  is calculated
@@ -1524,3 +1520,7 @@ System Speed-of-Light:
       cache line. This is also presented as a  percent of the peak theoretical bandwidth
       achievable on the specific  accelerator.
     unit: GB/s
+  vL1D Cache Hit Rate:
+    rst: The ratio of the number of vL1D cache line requests that hit in vL1D  cache
+      over the total number of cache line requests to the  :ref:`vL1D cache RAM <desc-tc>`.
+    unit: Percent
diff --git a/utils/autogen_hash.yaml b/utils/autogen_hash.yaml
index 756d690a24..ff42ad10e6 100644
--- a/utils/autogen_hash.yaml
+++ b/utils/autogen_hash.yaml
@@ -107,4 +107,4 @@ src/rocprof_compute_soc/analysis_configs/gfx940/2100_pc_sampling.yaml: 4f3af5504
 src/rocprof_compute_soc/analysis_configs/gfx941/2100_pc_sampling.yaml: 4f3af55040c40bee5f1fd88d83e2324d06e5dc462c0adc3e6d5b19b3f31af5e7
 src/rocprof_compute_soc/analysis_configs/gfx942/2100_pc_sampling.yaml: 4f3af55040c40bee5f1fd88d83e2324d06e5dc462c0adc3e6d5b19b3f31af5e7
 src/rocprof_compute_soc/analysis_configs/gfx950/2100_pc_sampling.yaml: 4f3af55040c40bee5f1fd88d83e2324d06e5dc462c0adc3e6d5b19b3f31af5e7
-docs/data/metrics_description.yaml: 7a79754edf27080a1701e959904c7db80c661dc552f3cdf94b0b2d332a2b2c45
+docs/data/metrics_description.yaml: b912cf868d488d6ff78d4efc6ceeca27cca5811f4c705efa68a21dd6ddb1609b
diff --git a/utils/split_config.py b/utils/split_config.py
index 7e0c2b6f67..ae178978d3 100644
--- a/utils/split_config.py
+++ b/utils/split_config.py
@@ -137,7 +137,7 @@ def update_documentation():
                     panel_config["metrics_description"].keys()
                 )
                 # Add metrics info
-                for metric_name in metric_names:
+                for metric_name in sorted(list(metric_names)):
                     metrics_info[metric_name] = {
                         "rst": panel_config["metrics_description"][metric_name]["rst"],
                         "unit": panel_config["metrics_description"][metric_name]["unit"],