[rocprofiler-compute][TUI] Restructure Performance Metrics (#232)

2025-08-20 17:00:54 -04:00
@@ -53,6 +53,50 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.
  * sL1D-L2 BW Utilization (section 1401)
  * Bandwidth Utilization (section 1601)

+* Update `System Speed-of-Light` panel to `GPU Speed-of-Light` in TUI with the following metrics:
+  * Theoretical LDS Bandwidth
+  * vL1D Cache BW
+  * L2 Cache BW
+  * L2-Fabric Read BW
+  * L2-Fabric Write BW
+  * Kernel Time
+  * Kernel Time (Cycles)
+  * SIMD Utilization
+  * Clock Rate
+
+* Add `Compute Throughput` panel to TUI with the following metrics:
+  * VALU FLOPs
+  * VALU IOPs
+  * MFMA FLOPs (F8)
+  * MFMA FLOPs (BF16)
+  * MFMA FLOPs (F16)
+  * MFMA FLOPs (F32)
+  * MFMA FLOPs (F64)
+  * MFMA FLOPs (F6F4) (in gfx950)
+  * MFMA IOPs (Int8)
+  * SALU Utilization
+  * VALU Utilization
+  * MFMA Utilization
+  * VMEM Utilization
+  * Branch Utilization
+  * IPC
+
+* Add `Memory Throughput` panel to TUI with the following metrics:
+  * vL1D Cache BW
+  * vL1D Cache Utilization
+  * Theoretical LDS Bandwidth
+  * LDS Utilization
+  * L2 Cache BW
+  * L2 Cache Utilization
+  * L2-Fabric Read BW
+  * L2-Fabric Write BW
+  * sL1D Cache BW
+  * L1I BW
+  * Address Processing Unit Busy
+  * Data-Return Busy
+  * L1I-L2 Bandwidth
+  * sL1D-L2 BW
+
 ### Resolved issues

 * Fixed not detecting memory clock issue when using amd-smi
@@ -23,6 +23,7 @@

 ##############################################################################

+
 from pathlib import Path

 # NB: Creating a new module to share global vars across modules
@@ -32,6 +32,7 @@ from pathlib import Path

 import pandas as pd

+import config
 from utils import file_io, parser, schema
 from utils.logger import (
    console_debug,
@@ -76,9 +77,14 @@ class OmniAnalyze_Base:
        if list_stats:
            ac.panel_configs = file_io.top_stats_build_in_config
        else:
-            arch_panel_config = (
+            arch_panel_config = [
                config_dir if single_panel_config else config_dir.joinpath(arch)
-            )
+            ]
+            # Use restructured perf metrics in TUI analyze mode
+            if self.__args.tui and arch in ["gfx942", "gfx950"]:
+                arch_panel_config.append(
+                    f"{config.rocprof_compute_home}/rocprof_compute_tui/utils/{arch}"
+                )
            ac.panel_configs = file_io.load_panel_configs(arch_panel_config)

        # TODO: filter_metrics should/might be one per arch
@@ -0,0 +1,103 @@
+# TUI use only
+# NOTE: This is used as a TUI-only yaml file for the beta release of the new performance metric organization
+Panel Config:
+  id: 3200
+  title: GPU Speed-of-Light
+  metrics_description:
+    Theoretical LDS Bandwidth: Indicates the maximum amount of bytes that could have
+      been loaded from, stored to, or atomically updated in the LDS per unit time
+      (see LDS Bandwidth example for more detail). This is also presented as a percent
+      of the peak theoretical bandwidth achievable on the specific accelerator.
+    vL1D Cache BW: The number of bytes looked up in the vL1D cache as a result of
+      VMEM instructions per unit time. The number of bytes is calculated as the number
+      of cache lines requested multiplied by the cache line size. This value does
+      not consider partial requests, so e.g., if only a single value is requested
+      in a cache line, the data movement will still be counted as a full cache line.
+      This is also presented as a percent of the peak theoretical bandwidth achievable
+      on the specific accelerator.
+    L2 Cache BW: The number of bytes looked up in the L2 cache per unit time. The
+      number of bytes is calculated as the number of cache lines requested multiplied
+      by the cache line size. This value does not consider partial requests, so e.g.,
+      if only a single value is requested in a cache line, the data movement will
+      still be counted as a full cache line. This is also presented as a percent of
+      the peak theoretical bandwidth achievable on the specific accelerator.
+    L2-Fabric Read BW: "The number of bytes read by the L2 over the Infinity Fabric\u2122\
+      \ interface per unit time. This is also presented as a percent of the peak theoretical\
+      \ bandwidth achievable on the specific accelerator."
+    L2-Fabric Write BW: The number of bytes sent by the L2 over the Infinity Fabric
+      interface by write and atomic operations per unit time. This is also presented
+      as a percent of the peak theoretical bandwidth achievable on the specific accelerator.
+    Kernel Time: The total duration of the executed kernel.
+    Kernel Time (Cycles): The total duration of the executed kernel in cycles.
+    SIMD Utilization: The percent of total SIMD cycles in the kernel where any SIMD
+      on a CU was actively doing any work, summed over all CUs. Low values (less than
+      100%) indicate that the accelerator was not fully saturated by the kernel, or
+      a potential load-imbalance issue.
+    Clock Rate:
+  data source:
+  - metric_table:
+      id: 3201
+      title: GPU Speed-of-Light
+      header:
+        metric: Metric
+        value: Avg
+        unit: Unit
+        peak: Peak
+        pop: Pct of Peak
+      metric:
+        Theoretical LDS Bandwidth:
+          value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
+            / (End_Timestamp - Start_Timestamp)))
+          unit: GB/s
+          peak: (($max_sclk * $cu_per_gpu) * 0.128)
+          pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
+            / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128)))
+        vL1D Cache BW:
+          value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
+          unit: GB/s
+          peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu)
+          pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp
+            - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu))
+        L2 Cache BW:
+          value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))
+          unit: GB/s
+          peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))
+          pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
+            / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
+        L2-Fabric Read BW:
+          value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
+            - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
+            - Start_Timestamp))
+          unit: GB/s
+          peak: $hbmBandwidth
+          pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
+            - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
+            - Start_Timestamp)))) / $hbmBandwidth)
+        L2-Fabric Write BW:
+          value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
+            * 32)) / (End_Timestamp - Start_Timestamp)))
+          unit: GB/s
+          peak: $hbmBandwidth
+          pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum -
+            TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) /
+            $hbmBandwidth)
+        Kernel Time:
+          avg: AVG((End_Timestamp - Start_Timestamp))
+          unit: ns
+          peak: N/A
+          pop: N/A
+        Kernel Time (Cycles):
+          avg: AVG($GRBM_GUI_ACTIVE_PER_XCD)
+          unit: Cycle
+          peak: N/A
+          pop: N/A
+        SIMD Utilization:
+          value: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
+          unit: Pct
+          peak: 100
+          pop: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
+        Clock Rate:
+          value: (GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu) / (End_Timestamp - Start_Timestamp)
+          unit: MHz
+          peak: N/A # attainable peak? theoretical freq?
+          pop: N/A
@@ -0,0 +1,163 @@
+# TUI use only
+# NOTE: This is used as a TUI-only yaml file for the beta release of the new performance metric organization
+Panel Config:
+  id: 3300
+  title: Compute Throughput
+  metrics_description:
+    VALU FLOPs: 'The total floating-point operations executed per second on the VALU.
+      This is also presented as a percent of the peak theoretical FLOPs achievable
+      on the specific accelerator. Note: this does not include any floating-point
+      operations from MFMA instructions.'
+    VALU IOPs: 'The total integer operations executed per second on the VALU. This
+      is also presented as a percent of the peak theoretical IOPs achievable on the
+      specific accelerator. Note: this does not include any integer operations from
+      MFMA instructions.'
+    MFMA FLOPs (F8): The total number of 8-bit floating point MFMA operations
+      executed per second. This does not include any 8-bit floating point operations
+      from VALU instructions. This is also presented as a percent of the peak theoretical
+      F8 MFMA operations achievable on the specific accelerator. It is supported on
+      AMD Instinct MI300 series and later only.
+    MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations
+      executed per second. Note: this does not include any 16-bit brain floating point
+      operations from VALU instructions. This is also presented as a percent of the
+      peak theoretical BF16 MFMA operations achievable on the specific accelerator.'
+    MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed
+      per second. Note: this does not include any 16-bit floating point operations
+      from VALU instructions. This is also presented as a percent of the peak theoretical
+      F16 MFMA operations achievable on the specific accelerator.'
+    MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed
+      per second. Note: this does not include any 32-bit floating point operations
+      from VALU instructions. This is also presented as a percent of the peak theoretical
+      F32 MFMA operations achievable on the specific accelerator.'
+    MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed
+      per second. Note: this does not include any 64-bit floating point operations
+      from VALU instructions. This is also presented as a percent of the peak theoretical
+      F64 MFMA operations achievable on the specific accelerator.'
+    MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed
+      per second. Note: this does not include any 8-bit integer operations from VALU
+      instructions. This is also presented as a percent of the peak theoretical INT8
+      MFMA operations achievable on the specific accelerator.'
+    SALU Utilization: Indicates what percent of the kernel's duration the SALU was
+      busy executing instructions. Computed as the ratio of the total number of cycles
+      spent by the scheduler issuing SALU or SMEM instructions over the total CU cycles.
+    VALU Utilization: Indicates what percent of the kernel's duration the VALU was
+      busy executing instructions. Does not include VMEM operations. Computed as the
+      ratio of the total number of cycles spent by the scheduler issuing VALU instructions
+      over the total CU cycles.
+    MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit
+      was busy executing instructions. Computed as the ratio of the total number of
+      cycles the MFMA was busy over the total CU cycles.
+    VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit
+      was busy executing instructions, including both global/generic and spill/scratch
+      operations (see the VMEM instruction count metrics for more detail). Does not
+      include VALU operations. Computed as the ratio of the total number of cycles
+      spent by the scheduler issuing VMEM instructions over the total CU cycles.
+    Branch Utilization: Indicates what percent of the kernel's duration the branch
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles spent by the scheduler issuing branch instructions over the total
+      CU cycles.
+    IPC: The ratio of the total number of instructions executed on the CU over the
+      total active CU cycles. This is also presented as a percent of the peak theoretical
+      IPC achievable on the specific accelerator.
+  data source:
+  - metric_table:
+      id: 3301
+      title: Compute Throughput
+      header:
+        metric: Metric
+        value: Avg
+        unit: Unit
+        peak: Peak
+        pop: Pct of Peak
+      metric:
+        VALU FLOPs:
+          value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) +
+            SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
+            + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
+            + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
+            + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
+          unit: GFLOP/s
+          peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
+          pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
+            + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
+            + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
+            + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
+            + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))))
+            / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
+        VALU IOPs:
+          value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
+            - Start_Timestamp)))
+          unit: GIOP/s
+          peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
+          pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
+            - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
+        MFMA FLOPs (F8):
+          value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
+          unit: GFLOP/s
+          peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
+          pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp -
+            Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
+        MFMA FLOPs (BF16):
+          value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
+          unit: GFLOP/s
+          peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
+          pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp
+            - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
+        MFMA FLOPs (F16):
+          value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
+          unit: GFLOP/s
+          peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
+          pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp -
+            Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
+        MFMA FLOPs (F32):
+          value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
+          unit: GFLOP/s
+          peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
+          pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp -
+            Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
+        MFMA FLOPs (F64):
+          value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
+          unit: GFLOP/s
+          peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
+          pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp -
+            Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
+        MFMA IOPs (Int8):
+          value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
+          unit: GIOP/s
+          peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
+          pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp -
+            Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
+        SALU Utilization:
+          value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
+          unit: pct
+          peak: 100
+          pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
+        VALU Utilization:
+          value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
+          unit: pct
+          peak: 100
+          pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
+        MFMA Utilization:
+          value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
+            * $cu_per_gpu) * 4)))
+          unit: pct
+          peak: 100
+          pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
+            * $cu_per_gpu) * 4)))
+        VMEM Utilization:
+          value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
+            / $cu_per_gpu))
+          unit: pct
+          peak: 100
+          pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
+            / $cu_per_gpu))
+        Branch Utilization:
+          value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
+          unit: pct
+          peak: 100
+          pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
+        IPC:
+          value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
+          unit: Instr/cycle
+          peak: 5
+          pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5)
@@ -0,0 +1,162 @@
+# TUI use only
+# NOTE: This is used as a TUI-only yaml file for the beta release of the new performance metric organization
+Panel Config:
+  id: 3400
+  title: Memory Throughput
+  metrics_description:
+    vL1D Cache BW: The number of bytes looked up in the vL1D cache as a result of
+      VMEM instructions per unit time. The number of bytes is calculated as the number
+      of cache lines requested multiplied by the cache line size. This value does
+      not consider partial requests, so e.g., if only a single value is requested
+      in a cache line, the data movement will still be counted as a full cache line.
+      This is also presented as a percent of the peak theoretical bandwidth achievable
+      on the specific accelerator.
+    vL1D Cache Utilization: Indicates how busy the vL1D Cache RAM was during the kernel execution.
+      The number of cycles where the vL1D Cache RAM is actively processing any request
+      divided by the number of cycles where the vL1D is active.
+    Theoretical LDS Bandwidth: Indicates the maximum amount of bytes that could have
+      been loaded from, stored to, or atomically updated in the LDS per unit time
+      (see LDS Bandwidth example for more detail). This is also presented as a percent
+      of the peak theoretical bandwidth achievable on the specific accelerator.
+    LDS Utilization: Indicates what percent of the kernel's duration the LDS was actively
+      executing instructions (including, but not limited to, load, store, atomic and
+      HIP's __shfl operations). Calculated as the ratio of the total number of cycles
+      LDS was active over the total CU cycles.
+    L2 Cache BW: The number of bytes looked up in the L2 cache per unit time. The
+      number of bytes is calculated as the number of cache lines requested multiplied
+      by the cache line size. This value does not consider partial requests, so e.g.,
+      if only a single value is requested in a cache line, the data movement will
+      still be counted as a full cache line. This is also presented as a percent of
+      the peak theoretical bandwidth achievable on the specific accelerator.
+    L2 Cache Utilization: The ratio of the number of cycles an L2 channel was active, summed
+      over all L2 channels on the accelerator over the total L2 cycles.
+    L2-Fabric Read BW: "The number of bytes read by the L2 over the Infinity Fabric\u2122\
+      \ interface per unit time. This is also presented as a percent of the peak theoretical\
+      \ bandwidth achievable on the specific accelerator."
+    L2-Fabric Write BW: The number of bytes sent by the L2 over the Infinity Fabric
+      interface by write and atomic operations per unit time. This is also presented
+      as a percent of the peak theoretical bandwidth achievable on the specific accelerator.
+    sL1D Cache BW: The number of bytes looked up in the sL1D cache per unit time.
+      This is also presented as a percent of the peak theoretical bandwidth achievable
+      on the specific accelerator.
+    L1I BW: The number of bytes looked up in the L1I cache per unit time. This is
+      also presented as a percent of the peak theoretical bandwidth achievable on
+      the specific accelerator.
+    Address Processing Unit Busy: Percent of the total CU cycles the address processor
+      was busy.
+    Data-Return Busy: Percent of the total CU cycles the data-return unit was busy
+      processing or waiting on data to return to the CU.
+    L1I-L2 Bandwidth: Total number of bytes transferred across L1I - L2 interface
+      divided by total duration.
+    sL1D-L2 BW: "The total number of bytes read from, written to, or atomically updated\
+      \ across the sL1D\u2194L2 interface, divided by total duration. Note that sL1D\
+      \ writes and atomics are typically unused on current CDNA accelerators, so in\
+      \ the majority of cases this can be interpreted as an sL1D\u2192L2 read bandwidth."
+  data source:
+  - metric_table:
+      id: 3401
+      title: Memory Throughput
+      header:
+        metric: Metric
+        value: Avg
+        unit: Unit
+        peak: Peak
+        pop: Pct of Peak
+      metric:
+        vL1D Cache BW:
+          value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
+          unit: GB/s
+          peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu)
+          pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp
+            - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu))
+        vL1D Cache Utilization:
+          value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
+            != 0) else None))
+          unit: Pct of Peak
+          peak: 100
+          pop: None
+        Theoretical LDS Bandwidth:
+          value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
+            / (End_Timestamp - Start_Timestamp)))
+          unit: GB/s
+          peak: (($max_sclk * $cu_per_gpu) * 0.128)
+          pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
+            / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128)))
+        LDS Utilization:
+          value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
+          unit: Pct of Peak
+          peak: 100
+          pop: None
+        L2 Cache Hit Rate:
+          value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
+            + TCC_MISS_sum) != 0) else None))
+          unit: pct
+          peak: 100
+          pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
+            + TCC_MISS_sum) != 0) else None))
+        L2 Cache BW:
+          value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))
+          unit: GB/s
+          peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))
+          pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
+            / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
+        L2 Cache Utilization:
+          value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD)))
+          unit: pct
+          peak: 100
+          pop: None
+        L2-Fabric Read BW:
+          value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
+            - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
+            - Start_Timestamp))
+          unit: GB/s
+          peak: $hbmBandwidth
+          pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
+            - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
+            - Start_Timestamp)))) / $hbmBandwidth)
+        L2-Fabric Write BW:
+          value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
+            * 32)) / (End_Timestamp - Start_Timestamp)))
+          unit: GB/s
+          peak: $hbmBandwidth
+          pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum -
+            TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) /
+            $hbmBandwidth)
+        sL1D Cache BW:
+          value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
+          unit: GB/s
+          peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
+          pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) *
+            64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
+        L1I Hit Rate:
+          value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
+          unit: pct
+          peak: 100
+          pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
+        L1I BW:
+          value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
+          unit: GB/s
+          peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
+          pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) *
+            64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
+        Address Processing Unit Busy:
+          avg: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
+          unit: pct
+          peak: 100
+          pop: N/A
+        Data-Return Busy:
+          avg: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
+          unit: pct
+          peak: 100
+          pop: N/A
+        L1I-L2 Bandwidth:
+          avg: AVG(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
+          unit: GB/s
+          peak: N/A
+          pop: N/A
+        sL1D-L2 BW:
+          value: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
+            * 64)) / (End_Timestamp - Start_Timestamp)))
+          unit: GB/s
+          peak: N/A
+          pop: N/A
@@ -0,0 +1,103 @@
+# TUI use only
+# NOTE: This is used as a TUI-only yaml file for the beta release of the new performance metric organization
+Panel Config:
+  id: 3200
+  title: GPU Speed-of-Light
+  metrics_description:
+    Theoretical LDS Bandwidth: Indicates the maximum amount of bytes that could have
+      been loaded from, stored to, or atomically updated in the LDS per unit time
+      (see LDS Bandwidth example for more detail). This is also presented as a percent
+      of the peak theoretical bandwidth achievable on the specific accelerator.
+    vL1D Cache BW: The number of bytes looked up in the vL1D cache as a result of
+      VMEM instructions per unit time. The number of bytes is calculated as the number
+      of cache lines requested multiplied by the cache line size. This value does
+      not consider partial requests, so e.g., if only a single value is requested
+      in a cache line, the data movement will still be counted as a full cache line.
+      This is also presented as a percent of the peak theoretical bandwidth achievable
+      on the specific accelerator.
+    L2 Cache BW: The number of bytes looked up in the L2 cache per unit time. The
+      number of bytes is calculated as the number of cache lines requested multiplied
+      by the cache line size. This value does not consider partial requests, so e.g.,
+      if only a single value is requested in a cache line, the data movement will
+      still be counted as a full cache line. This is also presented as a percent of
+      the peak theoretical bandwidth achievable on the specific accelerator.
+    L2-Fabric Read BW: "The number of bytes read by the L2 over the Infinity Fabric\u2122\
+      \ interface per unit time. This is also presented as a percent of the peak theoretical\
+      \ bandwidth achievable on the specific accelerator."
+    L2-Fabric Write BW: The number of bytes sent by the L2 over the Infinity Fabric
+      interface by write and atomic operations per unit time. This is also presented
+      as a percent of the peak theoretical bandwidth achievable on the specific accelerator.
+    Kernel Time: The total duration of the executed kernel.
+    Kernel Time (Cycles): The total duration of the executed kernel in cycles.
+    SIMD Utilization: The percent of total SIMD cycles in the kernel where any SIMD
+      on a CU was actively doing any work, summed over all CUs. Low values (less than
+      100%) indicate that the accelerator was not fully saturated by the kernel, or
+      a potential load-imbalance issue.
+    Clock Rate:
+  data source:
+  - metric_table:
+      id: 3201
+      title: GPU Speed-of-Light
+      header:
+        metric: Metric
+        value: Avg
+        unit: Unit
+        peak: Peak
+        pop: Pct of Peak
+      metric:
+        Theoretical LDS Bandwidth:
+          value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
+            / (End_Timestamp - Start_Timestamp)))
+          unit: GB/s
+          peak: (($max_sclk * $cu_per_gpu) * 0.128)
+          pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
+            / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128)))
+        vL1D Cache BW:
+          value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
+          unit: GB/s
+          peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu)
+          pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp
+            - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu))
+        L2 Cache BW:
+          value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))
+          unit: GB/s
+          peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))
+          pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
+            / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
+        L2-Fabric Read BW:
+          value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
+            - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
+            - Start_Timestamp))
+          unit: GB/s
+          peak: $hbmBandwidth
+          pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
+            - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
+            - Start_Timestamp)))) / $hbmBandwidth)
+        L2-Fabric Write BW:
+          value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
+            * 32)) / (End_Timestamp - Start_Timestamp)))
+          unit: GB/s
+          peak: $hbmBandwidth
+          pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum -
+            TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) /
+            $hbmBandwidth)
+        Kernel Time:
+          avg: AVG((End_Timestamp - Start_Timestamp))
+          unit: ns
+          peak: N/A
+          pop: N/A
+        Kernel Time (Cycles):
+          avg: AVG($GRBM_GUI_ACTIVE_PER_XCD)
+          unit: Cycle
+          peak: N/A
+          pop: N/A
+        SIMD Utilization:
+          value: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
+          unit: Pct
+          peak: 100
+          pop: AVG(100 * SQ_BUSY_CU_CYCLES / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu))
+        Clock Rate:
+          value: (GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu) / (End_Timestamp - Start_Timestamp)
+          unit: MHz
+          peak: N/A
+          pop: N/A
@@ -0,0 +1,169 @@
+# TUI use only
+# NOTE: This is used as a TUI-only yaml file for the beta release of the new performance metric organization
+Panel Config:
+  id: 3300
+  title: Compute Throughput
+  metrics_description:
+    VALU FLOPs: 'The total floating-point operations executed per second on the VALU.
+      This is also presented as a percent of the peak theoretical FLOPs achievable
+      on the specific accelerator. Note: this does not include any floating-point
+      operations from MFMA instructions.'
+    VALU IOPs: 'The total integer operations executed per second on the VALU. This
+      is also presented as a percent of the peak theoretical IOPs achievable on the
+      specific accelerator. Note: this does not include any integer operations from
+      MFMA instructions.'
+    MFMA FLOPs (F8): The total number of 8-bit floating point MFMA operations
+      executed per second. This does not include any 8-bit floating point operations
+      from VALU instructions. This is also presented as a percent of the peak theoretical
+      F8 MFMA operations achievable on the specific accelerator. It is supported on
+      AMD Instinct MI300 series and later only.
+    MFMA FLOPs (BF16): 'The total number of 16-bit brain floating point MFMA operations
+      executed per second. Note: this does not include any 16-bit brain floating point
+      operations from VALU instructions. This is also presented as a percent of the
+      peak theoretical BF16 MFMA operations achievable on the specific accelerator.'
+    MFMA FLOPs (F16): 'The total number of 16-bit floating point MFMA operations executed
+      per second. Note: this does not include any 16-bit floating point operations
+      from VALU instructions. This is also presented as a percent of the peak theoretical
+      F16 MFMA operations achievable on the specific accelerator.'
+    MFMA FLOPs (F32): 'The total number of 32-bit floating point MFMA operations executed
+      per second. Note: this does not include any 32-bit floating point operations
+      from VALU instructions. This is also presented as a percent of the peak theoretical
+      F32 MFMA operations achievable on the specific accelerator.'
+    MFMA FLOPs (F64): 'The total number of 64-bit floating point MFMA operations executed
+      per second. Note: this does not include any 64-bit floating point operations
+      from VALU instructions. This is also presented as a percent of the peak theoretical
+      F64 MFMA operations achievable on the specific accelerator.'
+    MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed
+      per second. Note: this does not include any 8-bit integer operations from VALU
+      instructions. This is also presented as a percent of the peak theoretical INT8
+      MFMA operations achievable on the specific accelerator.'
+    SALU Utilization: Indicates what percent of the kernel's duration the SALU was
+      busy executing instructions. Computed as the ratio of the total number of cycles
+      spent by the scheduler issuing SALU or SMEM instructions over the total CU cycles.
+    VALU Utilization: Indicates what percent of the kernel's duration the VALU was
+      busy executing instructions. Does not include VMEM operations. Computed as the
+      ratio of the total number of cycles spent by the scheduler issuing VALU instructions
+      over the total CU cycles.
+    MFMA Utilization: Indicates what percent of the kernel's duration the MFMA unit
+      was busy executing instructions. Computed as the ratio of the total number of
+      cycles the MFMA was busy over the total CU cycles.
+    VMEM Utilization: Indicates what percent of the kernel's duration the VMEM unit
+      was busy executing instructions, including both global/generic and spill/scratch
+      operations (see the VMEM instruction count metrics for more detail). Does not
+      include VALU operations. Computed as the ratio of the total number of cycles
+      spent by the scheduler issuing VMEM instructions over the total CU cycles.
+    Branch Utilization: Indicates what percent of the kernel's duration the branch
+      unit was busy executing instructions. Computed as the ratio of the total number
+      of cycles spent by the scheduler issuing branch instructions over the total
+      CU cycles.
+    IPC: The ratio of the total number of instructions executed on the CU over the
+      total active CU cycles. This is also presented as a percent of the peak theoretical
+      IPC achievable on the specific accelerator.
+  data source:
+  - metric_table:
+      id: 3301
+      title: Compute Throughput
+      header:
+        metric: Metric
+        value: Avg
+        unit: Unit
+        peak: Peak
+        pop: Pct of Peak
+      metric:
+        VALU FLOPs:
+          value: AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16) +
+            SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
+            + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
+            + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
+            + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp)))
+          unit: GFLOP/s
+          peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
+          pop: ((100 * AVG(((((64 * (((SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16)
+            + SQ_INSTS_VALU_TRANS_F16) + (2 * SQ_INSTS_VALU_FMA_F16))) + (64 * (((SQ_INSTS_VALU_ADD_F32
+            + SQ_INSTS_VALU_MUL_F32) + SQ_INSTS_VALU_TRANS_F32) + (2 * SQ_INSTS_VALU_FMA_F32))))
+            + (64 * (((SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64) + SQ_INSTS_VALU_TRANS_F64)
+            + (2 * SQ_INSTS_VALU_FMA_F64)))) / (End_Timestamp - Start_Timestamp))))
+            / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
+        VALU IOPs:
+          value: AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
+            - Start_Timestamp)))
+          unit: GIOP/s
+          peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
+          pop: ((100 * AVG(((64 * (SQ_INSTS_VALU_INT32 + SQ_INSTS_VALU_INT64)) / (End_Timestamp
+            - Start_Timestamp)))) / (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000))
+        MFMA FLOPs (F8):
+          value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
+          unit: GFLOP/s
+          peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
+          pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp -
+            Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
+        MFMA FLOPs (BF16):
+          value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
+          unit: GFLOP/s
+          peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
+          pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp
+            - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
+        MFMA FLOPs (F16):
+          value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
+          unit: GFLOP/s
+          peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
+          pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp -
+            Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
+        MFMA FLOPs (F32):
+          value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
+          unit: GFLOP/s
+          peak: ((($max_sclk * $cu_per_gpu) * 256) / 1000)
+          pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp -
+            Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 256) / 1000))
+        MFMA FLOPs (F64):
+          value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp - Start_Timestamp)))
+          unit: GFLOP/s
+          peak: ((($max_sclk * $cu_per_gpu) * 128) / 1000)
+          pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512) / (End_Timestamp -
+            Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 128) / 1000))
+        MFMA FLOPs (F6F4):  # peak factor 16384 = 2^14, i.e. twice the F8 factor (8192)
+          value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp - Start_Timestamp)))
+          unit: GFLOP/s
+          peak: ((($max_sclk * $cu_per_gpu) * 16384) / 1000)
+          pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F6F4 * 512) / (End_Timestamp
+            - Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 16384) / 1000))
+        MFMA IOPs (Int8):
+          value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
+          unit: GIOP/s
+          peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
+          pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp -
+            Start_Timestamp)))) / ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
+        SALU Utilization:
+          value: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
+          unit: pct
+          peak: 100
+          pop: AVG(((100 * SQ_ACTIVE_INST_SCA) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
+        VALU Utilization:
+          value: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
+          unit: pct
+          peak: 100
+          pop: AVG(((100 * SQ_ACTIVE_INST_VALU) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
+        MFMA Utilization:
+          value: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
+            * $cu_per_gpu) * 4)))
+          unit: pct
+          peak: 100
+          pop: AVG(((100 * SQ_VALU_MFMA_BUSY_CYCLES) / (($GRBM_GUI_ACTIVE_PER_XCD
+            * $cu_per_gpu) * 4)))
+        VMEM Utilization:
+          value: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
+            / $cu_per_gpu))
+          unit: pct
+          peak: 100
+          pop: AVG((((100 * (SQ_ACTIVE_INST_FLAT+SQ_ACTIVE_INST_VMEM)) / $GRBM_GUI_ACTIVE_PER_XCD)
+            / $cu_per_gpu))
+        Branch Utilization:
+          value: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
+          unit: pct
+          peak: 100
+          pop: AVG((((100 * SQ_ACTIVE_INST_MISC) / $GRBM_GUI_ACTIVE_PER_XCD) / $cu_per_gpu))
+        IPC:
+          value: AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))
+          unit: Instr/cycle
+          peak: 5
+          pop: ((100 * AVG((SQ_INSTS / SQ_BUSY_CU_CYCLES))) / 5)
@@ -0,0 +1,161 @@
+# TUI use only
+# NOTE: This is used as a TUI-only yaml file for the beta release of the new performance metric organization
+Panel Config:
+  id: 3400
+  title: Memory Throughput
+  metrics_description:
+    vL1D Cache BW: The number of bytes looked up in the vL1D cache as a result of
+      VMEM instructions per unit time. The number of bytes is calculated as the number
+      of cache lines requested multiplied by the cache line size. This value does
+      not consider partial requests, so e.g., if only a single value is requested
+      in a cache line, the data movement will still be counted as a full cache line.
+      This is also presented as a percent of the peak theoretical bandwidth achievable
+      on the specific accelerator.
+    vL1D Cache Utilization: Indicates how busy the vL1D Cache RAM was during the kernel execution.
+      The number of cycles where the vL1D Cache RAM is actively processing any request
+      divided by the number of cycles where the vL1D is active.
+    Theoretical LDS Bandwidth: Indicates the maximum amount of bytes that could have
+      been loaded from, stored to, or atomically updated in the LDS per unit time
+      (see LDS Bandwidth example for more detail). This is also presented as a percent
+      of the peak theoretical bandwidth achievable on the specific accelerator.
+    LDS Utilization: Indicates what percent of the kernel's duration the LDS was actively
+      executing instructions (including, but not limited to, load, store, atomic and
+      HIP's __shfl operations). Calculated as the ratio of the total number of cycles
+      LDS was active over the total CU cycles.
+    L2 Cache BW: The number of bytes looked up in the L2 cache per unit time. The
+      number of bytes is calculated as the number of cache lines requested multiplied
+      by the cache line size. This value does not consider partial requests, so e.g.,
+      if only a single value is requested in a cache line, the data movement will
+      still be counted as a full cache line. This is also presented as a percent of
+      the peak theoretical bandwidth achievable on the specific accelerator.
+    L2 Cache Utilization: The ratio of the number of cycles an L2 channel was active, summed
+      over all L2 channels on the accelerator over the total L2 cycles.
+    L2-Fabric Read BW: "The number of bytes read by the L2 over the Infinity Fabric\u2122\
+      \ interface per unit time. This is also presented as a percent of the peak theoretical\
+      \ bandwidth achievable on the specific accelerator."
+    L2-Fabric Write BW: The number of bytes sent by the L2 over the Infinity Fabric
+      interface by write and atomic operations per unit time. This is also presented
+      as a percent of the peak theoretical bandwidth achievable on the specific accelerator.
+    sL1D Cache BW: The number of bytes looked up in the sL1D cache per unit time.
+      This is also presented as a percent of the peak theoretical bandwidth achievable
+      on the specific accelerator.
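+    L1I Hit Rate: The percent of L1I requests that hit on a previously loaded line in the cache. Calculated as the ratio of the number of L1I requests that hit over the number of all L1I requests.
+    L1I BW: The number of bytes looked up in the L1I cache per unit time. This is also presented
+      as a percent of the peak theoretical bandwidth achievable on the specific accelerator.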
+    Address Processing Unit Busy: Percent of the total CU cycles the address processor
+      was busy.
+    Data-Return Busy: Percent of the total CU cycles the data-return unit was busy
+      processing or waiting on data to return to the CU.
+    L1I-L2 Bandwidth: Total number of bytes transferred across L1I - L2 interface
+      divided by total duration.
+    sL1D-L2 BW: "The total number of bytes read from, written to, or atomically updated\
+      \ across the sL1D\u2194L2 interface, divided by total duration. Note that sL1D\
+      \ writes and atomics are typically unused on current CDNA accelerators, so in\
+      \ the majority of cases this can be interpreted as an sL1D\u2192L2 read bandwidth."
+  data source:
+  - metric_table:
+      id: 3401
+      title: Memory Throughput
+      header:
+        metric: Metric
+        value: Avg
+        unit: Unit
+        peak: Peak
+        pop: Pct of Peak
+      metric:
+        vL1D Cache BW:
+          value: AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp - Start_Timestamp)))
+          unit: GB/s
+          peak: ((($max_sclk / 1000) * 128) * $cu_per_gpu)
+          pop: ((100 * AVG(((TCP_TOTAL_CACHE_ACCESSES_sum * 128) / (End_Timestamp
+            - Start_Timestamp)))) / ((($max_sclk / 1000) * 128) * $cu_per_gpu))
+        vL1D Cache Utilization:  # unit is a plain percentage; "Pct of Peak" is the pop column's header label
+          value: AVG((((TCP_GATE_EN2_sum * 100) / TCP_GATE_EN1_sum) if (TCP_GATE_EN1_sum
+            != 0) else None))
+          unit: pct
+          peak: 100
+          pop: None
+        Theoretical LDS Bandwidth:
+          value: AVG(((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
+            / (End_Timestamp - Start_Timestamp)))
+          unit: GB/s
+          peak: (($max_sclk * $cu_per_gpu) * 0.128)
+          pop: AVG((((((SQ_LDS_IDX_ACTIVE - SQ_LDS_BANK_CONFLICT) * 4) * TO_INT($lds_banks_per_cu))
+            / (End_Timestamp - Start_Timestamp)) / (($max_sclk * $cu_per_gpu) * 0.00128)))
+        LDS Utilization:  # unit is a plain percentage; "Pct of Peak" is the pop column's header label
+          value: AVG(((100 * SQ_LDS_IDX_ACTIVE) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
+          unit: pct
+          peak: 100
+          pop: None
+        L2 Cache Hit Rate:
+          value: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
+            + TCC_MISS_sum) != 0) else None))
+          unit: pct
+          peak: 100
+          pop: AVG((((100 * TCC_HIT_sum) / (TCC_HIT_sum + TCC_MISS_sum)) if ((TCC_HIT_sum
+            + TCC_MISS_sum) != 0) else None))
+        L2 Cache BW:
+          value: AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp)))
+          unit: GB/s
+          peak: ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan))
+          pop: ((100 * AVG(((TCC_REQ_sum * 128) / (End_Timestamp - Start_Timestamp))))
+            / ((($max_sclk / 1000) * 128) * TO_INT($total_l2_chan)))
+        L2 Cache Utilization:
+          value: AVG(((TCC_BUSY_sum * 100) / (TO_INT($total_l2_chan) * $GRBM_GUI_ACTIVE_PER_XCD)))
+          unit: pct
+          peak: 100
+          pop: None
+        L2-Fabric Read BW:  # value and pop are derived from the same byte-count expression
+          value: AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum - TCC_EA0_RDREQ_32B_sum)
+            + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp - Start_Timestamp))
+          unit: GB/s
+          peak: $hbmBandwidth
+          pop: ((100 * (AVG((128 * TCC_BUBBLE_sum + 64 * (TCC_EA0_RDREQ_sum - TCC_BUBBLE_sum
+            - TCC_EA0_RDREQ_32B_sum) + 32 * TCC_EA0_RDREQ_32B_sum) / (End_Timestamp
+            - Start_Timestamp)))) / $hbmBandwidth)
+        L2-Fabric Write BW:
+          value: AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum - TCC_EA0_WRREQ_64B_sum)
+            * 32)) / (End_Timestamp - Start_Timestamp)))
+          unit: GB/s
+          peak: $hbmBandwidth
+          pop: ((100 * AVG((((TCC_EA0_WRREQ_64B_sum * 64) + ((TCC_EA0_WRREQ_sum -
+            TCC_EA0_WRREQ_64B_sum) * 32)) / (End_Timestamp - Start_Timestamp)))) /
+            $hbmBandwidth)
+        sL1D Cache BW:
+          value: AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
+          unit: GB/s
+          peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
+          pop: ((100 * AVG(((SQC_DCACHE_REQ / (End_Timestamp - Start_Timestamp)) *
+            64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
+        L1I Hit Rate:
+          value: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
+          unit: pct
+          peak: 100
+          pop: AVG(((100 * SQC_ICACHE_HITS) / (SQC_ICACHE_HITS + SQC_ICACHE_MISSES)))
+        L1I BW:
+          value: AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) * 64))
+          unit: GB/s
+          peak: ((($max_sclk / 1000) * 64) * $sqc_per_gpu)
+          pop: ((100 * AVG(((SQC_ICACHE_REQ / (End_Timestamp - Start_Timestamp)) *
+            64))) / ((($max_sclk / 1000) * 64) * $sqc_per_gpu))
+        Address Processing Unit Busy:  # row keys mirror the table header (value/unit/peak/pop)
+          value: AVG(((100 * TA_TA_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
+          unit: pct
+          peak: 100
+          pop: N/A
+        Data-Return Busy:  # row keys mirror the table header (value/unit/peak/pop)
+          value: AVG(((100 * TD_TD_BUSY_sum) / ($GRBM_GUI_ACTIVE_PER_XCD * $cu_per_gpu)))
+          unit: pct
+          peak: 100
+          pop: N/A
+        L1I-L2 Bandwidth:  # row keys mirror the table header (value/unit/peak/pop)
+          value: AVG(((SQC_TC_INST_REQ * 64) / (End_Timestamp - Start_Timestamp)))
+          unit: GB/s
+          peak: N/A
+          pop: N/A
+        sL1D-L2 BW:
+          value: AVG(((((SQC_TC_DATA_READ_REQ + SQC_TC_DATA_WRITE_REQ + SQC_TC_DATA_ATOMIC_REQ)
+            * 64)) / (End_Timestamp - Start_Timestamp)))
+          unit: GB/s
+          peak: N/A
+          pop: N/A
@@ -7,8 +7,14 @@ sections:
    collapsed: true
    class: "sysinfo-section"
    subsections:
-      - title: "System Speed-of-Light"
-        data_path: ["2. System Speed-of-Light", "2.1 System Speed-of-Light"]
+      - title: "GPU Speed-of-Light"
+        data_path: ["32. GPU Speed-of-Light", "32.1 GPU Speed-of-Light"]
+        collapsed: true
+      - title: "Compute Throughput"
+        data_path: ["33. Compute Throughput", "33.1 Compute Throughput"]
+        collapsed: true
+      - title: "Memory Throughput"
+        data_path: ["34. Memory Throughput", "34.1 Memory Throughput"]
        collapsed: true
      - title: "Memory Chart"
        data_path: ["3. Memory Chart", "3.1 Memory Chart"]
@@ -17,14 +23,16 @@ sections:

  - title: "Detailed Block Analysis"
    collapsed: true
-    class: "kernels-section"
-    dynamic_sections: true
-    skip_sections:
-      - "0. Top Stats"
-      - "1. System Info"
-      - "2. System Speed-of-Light"
-      - "3. Memory Chart"
-      - "4. Roofline"
+    class: "block-section"
+    subsections:
+      - arch_config_data: true
+        exclude_keys:
+          - "0. Top Stats"
+          - "1. System Info"
+          - "2. System Speed-of-Light"
+          - "3. Memory Chart"
+          - "4. Roofline"
+        collapsed: true

  - title: "Source Level Analysis"
    collapsed: true
@@ -32,4 +40,4 @@ sections:
    subsections:
      - title: "PC Sampling"
        data_path: ["21. PC Sampling", "21.1 PC Sampling"]
-        collapsed: true
+        collapsed: true
@@ -99,6 +99,8 @@ def get_top_kernels_and_dispatch_ids(runs):
        top_kernel_df, dispatch_id_df, on="Kernel_Name", how="outer"
    ).sort_values("Pct", ascending=False)

+    # Remove unwanted columns
+    merged_df = merged_df.drop(columns=["Count", "GPU_ID"])
    return merged_df.to_dict("records")


@@ -55,22 +55,19 @@ class KernelView(Container):

    def __init__(self, config_path: Optional[str] = None):
        super().__init__(id="kernel-view")
-        self.status_label = None
-        self.dfs = {}
-        self.top_kernel = []
-
-        if rocprof_compute_home:
-            config_path = (
-                rocprof_compute_home
-                / "rocprof_compute_tui"
-                / "utils"
-                / "kernel_view_config.yaml"
-            )
-        self.config_path = config_path
-
-        self.keys = None
+        self.kernel_to_df_dict = {}
+        self.top_kernel_to_df_list = []
        self.current_selection = None

+        self.config_path = config_path or (
+            rocprof_compute_home
+            / "rocprof_compute_tui"
+            / "utils"
+            / "kernel_view_config.yaml"
+            if rocprof_compute_home
+            else None
+        )
+
    def compose(self):
        """
        Compose the split panel layout with two scrollable containers.
@@ -88,94 +85,85 @@ class KernelView(Container):
            # empty on init
            pass

-    def update_results(self, per_kernel_dfs, top_kernels) -> None:
-        self.dfs = per_kernel_dfs
-        self.top_kernel = top_kernels
+    def update_results(self, kernel_to_df_dict, top_kernel_to_df_list) -> None:
+        self.kernel_to_df_dict = kernel_to_df_dict
+        self.top_kernel_to_df_list = top_kernel_to_df_list

        top_container = self.query_one("#top-container", VerticalScroll)
        top_container.remove_children()

-        if self.top_kernel:
-            try:
-                header = self.build_header()
-                top_container.mount(header)
-                selector = self.build_selector()
-                top_container.mount(selector)
-            except Exception as e:
-                top_container.mount(
-                    Label(f"Error displaying kernel list: {str(e)}", classes="error")
-                )
-        else:
+        if not self.top_kernel_to_df_list:
            top_container.mount(Label("No kernels available", classes="placeholder"))
+            return

-        self.current_selection = self.top_kernel[0]["Kernel_Name"]
-        self._update_bottom_content()
+        # Build and mount components
+        self.new_perf_metric()
+        # build header section
+        keys = self.top_kernel_to_df_list[0].keys()
+        header_text = " | ".join(f"{key:25}" for key in keys)
+        top_container.mount(Label(header_text, classes="kernel-table-header"))
+
+        # build selector section
+        radio_buttons = []
+        for i, kernel in enumerate(self.top_kernel_to_df_list):
+            row_text = " | ".join(
+                f"{str(kernel.get(key, 'N/A'))[:18]:25}" for key in keys
+            )
+            button = RadioButton(row_text, id=f"kernel-{i}")
+            button.kernel_data = kernel
+            radio_buttons.append(button)
+        top_container.mount(RadioSet(*radio_buttons))
+
+        # build analysis section
+        self.current_selection = self.top_kernel_to_df_list[0]["Kernel_Name"]
+        self.update_bottom_content()

    def update_view(self, message: str, log_level: str) -> None:
-        """
-        Update the view with a status message.
-        """
-        if self.status_label is None:
-            self.status_label = Label(f"{message}", classes=log_level)
+        if not hasattr(self, "status_label") or self.status_label is None:
+            self.status_label = Label(message, classes=log_level)
            self.mount(self.status_label)
        else:
-            self.status_label.update(f"{message}")
+            self.status_label.update(message)
            self.status_label.set_classes(log_level)

-    def reload_config(self, config_path: str = None) -> None:
-        if config_path:
-            self.config_path = config_path
+    def new_perf_metric(self):
+        new_metrics = ["VGPRs", "Grid Size", "Workgroup Size"]
+        for new_metric in new_metrics:
+            for i, kernel in enumerate(self.top_kernel_to_df_list):
+                df_path = self.kernel_to_df_dict[kernel["Kernel_Name"]]["7. Wavefront"][
+                    "7.1 Wavefront Launch Stats"
+                ]["df"]
+                metric_avg = (
+                    df_path[df_path["Metric"] == new_metric]["Avg"].iloc[0].item()
+                )
+                self.top_kernel_to_df_list[i][new_metric] = metric_avg

-        if self.dfs and self.top_kernel:
-            self.update_results()
-
-    def build_header(self):
-        all_keys = set()
-
-        for kernel in self.top_kernel:
-            all_keys.update(kernel.keys())
-
-        self.keys = sorted(all_keys)
-
-        if "Kernel_Name" in self.keys:
-            self.keys.remove("Kernel_Name")
-            self.keys.insert(0, "Kernel_Name")
-
-        header_text = " | ".join(f"{key:25}" for key in self.keys)
-        header_label = Label(header_text, classes="kernel-table-header")
-
-        return header_label
-
-    def build_selector(self):
-        radio_buttons = []
-
-        for i, kernel in enumerate(self.top_kernel):
-            row_data = []
-            for key in self.keys:
-                value = str(kernel.get(key, "N/A"))
-                if len(value) > 18:
-                    value = value[:15] + "..."
-                row_data.append(f"{value:25}")
-
-            row_text = " | ".join(row_data)
-            radio_button = RadioButton(row_text, id=f"kernel-{i}")
-            radio_button.kernel_data = kernel
-            radio_buttons.append(radio_button)
-
-        selector = RadioSet(*radio_buttons)
-
-        return selector
+        """
+        header_order = [
+            "Dispatch_ID",
+            "Kernel_Name",
+            "Mean(ns)",
+            "Median(ns)",
+            "Sum(ns)",
+            "Compute Throughput",
+            "Memory Throughput",
+            "VGPRs",
+            "Grid Size",
+            "Workgroup Size",
+        ]
+        """

    @on(RadioSet.Changed)
    def on_radio_changed(self, event: RadioSet.Changed) -> None:
-        if event.pressed:
-            kernel_data = getattr(event.pressed, "kernel_data", None)
-            if kernel_data and "Kernel_Name" in kernel_data:
-                selected_kernel = kernel_data["Kernel_Name"]
-                self.current_selection = selected_kernel
-                self._update_bottom_content()
+        if not event.pressed:
+            return

-    def _update_bottom_content(self):
+        kernel_data = getattr(event.pressed, "kernel_data", None)
+        if kernel_data and "Kernel_Name" in kernel_data:
+            self.current_selection = kernel_data["Kernel_Name"]
+            self.update_bottom_content()
+
+    def update_bottom_content(self):
        bottom_container = self.query_one("#bottom-container", VerticalScroll)
        bottom_container.remove_children()

@@ -183,24 +171,28 @@ class KernelView(Container):
            Label("Toggle kernel selection to view detailed analysis.")
        )

-        if self.current_selection and self.current_selection in self.dfs:
-            bottom_container.mount(
-                Label(f"Current kernel selection: {self.current_selection}")
-            )
-            filtered_dfs = self.dfs[self.current_selection]
-
-            try:
-                sections = build_all_sections(filtered_dfs, self.config_path)
-                for section in sections:
-                    bottom_container.mount(section)
-            except Exception as e:
-                bottom_container.mount(
-                    Label(f"Error displaying results: {str(e)}", classes="error")
-                )
-        else:
+        if not (
+            self.current_selection and self.current_selection in self.kernel_to_df_dict
+        ):
            bottom_container.mount(
                Label(
                    f"No data available for kernel: {self.current_selection}",
                    classes="error",
                )
            )
+            return
+
+        bottom_container.mount(
+            Label(f"Current kernel selection: {self.current_selection}")
+        )
+
+        try:
+            sections = build_all_sections(
+                self.kernel_to_df_dict[self.current_selection], self.config_path
+            )
+            for section in sections:
+                bottom_container.mount(section)
+        except Exception as e:
+            bottom_container.mount(
+                Label(f"Error displaying results: {str(e)}", classes="error")
+            )
@@ -50,18 +50,12 @@ class MainView(Horizontal):
    """Main view layout for the application."""

    selected_path = reactive(None)
-    per_kernel_dfs = reactive({})
-    top_kernels = reactive([])
+    kernel_to_df_dict = reactive({})
+    top_kernel_to_df_list = reactive([])

    def __init__(self):
        super().__init__(id="main-container")
-        self.start_path = (
-            # NOTE: is cwd the best choice?
-            Path.cwd()
-            if DEFAULT_START_PATH is None
-            else Path(DEFAULT_START_PATH)
-        )
-
+        self.start_path = Path(DEFAULT_START_PATH) if DEFAULT_START_PATH else Path.cwd()
        self.logger = Logger()
        self.logger.info("MainView initialized", update_ui=False)

@@ -77,9 +71,7 @@ class MainView(Horizontal):
        with Horizontal(id="center-container"):
            with Vertical(id="activity-container"):
                # Center Panel - Analysis results display
-                center_panel = CenterPanel()
-                yield center_panel
-                self.center = center_panel
+                yield CenterPanel()

                # Bottom Panel - Output, terminal, and metric description
                tabs = TabsArea()
@@ -97,215 +89,91 @@ class MainView(Horizontal):

    @on(DataTable.CellSelected)
    def on_data_table_cell_selected(self, event: DataTable.CellSelected) -> None:
-        table = event.data_table
-        row_idx = event.coordinate.row
-
-        self.logger.info(f"Cell selected at row {row_idx}")
-
        try:
-            row_data = table.get_row_at(row_idx)
-            content = f"Selected Metric ID: {row_data[0]}\n"
-            content += f"Selected Metric: {row_data[1]}\n"
-            # content += f"Metric Description:\n\t{row_data[-1]}"
-
-            self.metric_description.text = content
-            self.logger.info(f"Row {row_idx} data displayed in metric_description")
-
+            row_data = event.data_table.get_row_at(event.coordinate.row)
+            self.metric_description.text = (
+                f"Selected Metric ID: {row_data[0]}\nSelected Metric: {row_data[1]}\n"
+            )
+            self.logger.info(f"Row {event.coordinate.row} data displayed")
        except Exception as e:
-            error_msg = f"Error displaying row {row_idx}: {str(e)}"
-            table.add_column("Error")
-            table.add_row(str(e))
+            error_msg = f"Error displaying row {event.coordinate.row}: {str(e)}"
            self.metric_description.text = error_msg
            self.logger.error(error_msg)

    @work(thread=True)
    def run_analysis(self) -> None:
-        self.per_kernel_dfs = {}
-        self.top_kernels = []
+        self.kernel_to_df_dict = {}
+        self.top_kernel_to_df_list = []

        if not self.selected_path:
-            error_msg = "No directory selected for analysis"
-            self._update_view(error_msg, LogLevel.ERROR)
-            self.logger.error(error_msg)
+            self.app.call_from_thread(
+                lambda: self.query_one("#kernel-view").update_view(
+                    "No directory selected for analysis", LogLevel.ERROR
+                )
+            )
            return

        try:
            self.logger.info(f"Starting analysis on: {self.selected_path}")
-            self._update_view(
-                f"Running analysis on: {self.selected_path}", LogLevel.SUCCESS
+
+            self.app.call_from_thread(
+                lambda: self.query_one("#kernel-view").update_view(
+                    f"Running analysis on: {self.selected_path}", LogLevel.SUCCESS
+                )
            )

-            # Step 1: Create analyzer
-            try:
-                self.logger.info("Step 1: Creating analyzer")
-                self.logger.info(f"Step 1: args {self.app.args}")
-                self.logger.info(f"Step 1: arch {self.app.supported_archs}")
-                self.logger.info("Step 1: Creating analyzer")
-                analyzer = tui_analysis(
-                    self.app.args, self.app.supported_archs, self.selected_path
-                )
-                self.logger.info("Step 1: Analyzer created successfully")
-            except Exception as e:
-                self.logger.error(f"Step 1 failed - Error creating analyzer: {str(e)}")
-                raise
+            # 1. Create and sanitize the TUI analyzer
+            analyzer = tui_analysis(
+                self.app.args, self.app.supported_archs, self.selected_path
+            )
+            analyzer.sanitize()

-            # Step 2: Sanitize analyzer
-            try:
-                self.logger.info("Step 2: Sanitizing analyzer")
-                analyzer.sanitize()
-                self.logger.info("Step 2: Analyzer sanitized successfully")
-            except Exception as e:
-                self.logger.error(
-                    f"Step 2 failed - Error sanitizing analyzer: {str(e)}"
-                )
-                raise
+            # 2. Load system info and configure SoC specs
+            sysinfo_path = Path(self.selected_path) / "sysinfo.csv"
+            if not sysinfo_path.exists():
+                raise FileNotFoundError(f"sysinfo.csv not found at {sysinfo_path}")
+            sys_info = file_io.load_sys_info(sysinfo_path).iloc[0].to_dict()
+            self.app.load_soc_specs(sys_info)

-            # Step 3: Load sys_info
-            try:
-                self.logger.info("Step 3: Loading sys_info")
-                sysinfo_path = Path(self.selected_path).joinpath("sysinfo.csv")
-                self.logger.info(f"Step 3: sysinfo_path = {sysinfo_path}")
+            # 3. Run analysis
+            analyzer.set_soc(self.app.soc)
+            analyzer.pre_processing()
+            self.kernel_to_df_dict = analyzer.run_kernel_analysis()
+            self.top_kernel_to_df_list = analyzer.run_top_kernel()

-                if not sysinfo_path.exists():
-                    raise FileNotFoundError(f"sysinfo.csv not found at {sysinfo_path}")
-
-                sys_info_df = file_io.load_sys_info(sysinfo_path)
-                self.logger.info(f"Step 3: sys_info_df type = {type(sys_info_df)}")
-                shape_info = (
-                    sys_info_df.shape
-                    if hasattr(sys_info_df, "shape")
-                    else "No shape attribute"
-                )
-                self.logger.info(f"Step 3: sys_info_df shape = {shape_info}")
-
-            except Exception as e:
-                self.logger.error(f"Step 3 failed - Error loading sys_info: {str(e)}")
-                raise
-
-            # Step 4: Convert sys_info to dict
-            try:
-                self.logger.info("Step 4: Converting sys_info to dict")
-
-                # Check if it's actually a DataFrame
-                if hasattr(sys_info_df, "iloc"):
-                    sys_info = sys_info_df.iloc[0].to_dict()
-                elif hasattr(sys_info_df, "to_dict"):
-                    # If it's already a Series
-                    sys_info = sys_info_df.to_dict()
-                elif isinstance(sys_info_df, dict):
-                    # If it's already a dict
-                    sys_info = sys_info_df
-                else:
-                    raise TypeError(
-                        f"Unexpected type for sys_info: {type(sys_info_df)}"
+            if not self.kernel_to_df_dict or not self.top_kernel_to_df_list:
+                self.app.call_from_thread(
+                    lambda: self.query_one("#kernel-view").update_view(
+                        "Analysis completed but not all data was returned",
+                        LogLevel.WARNING,
                    )
-
-                self.logger.info(f"Step 4: sys_info converted = {sys_info}")
-
-            except Exception as e:
-                self.logger.error(
-                    f"Step 4 failed - Error converting sys_info: {str(e)}"
                )
-                raise
-
-            # Step 5: Load SoC specs
-            try:
-                self.logger.info("Step 5: Loading SoC specs")
-                self.app.load_soc_specs(sys_info)
-                self.logger.info(f"Step 5: SoC loaded = {self.app.soc}")
-            except Exception as e:
-                self.logger.error(f"Step 5 failed - Error loading SoC specs: {str(e)}")
-                raise
-
-            # Step 6: Set SoC in analyzer
-            try:
-                self.logger.info("Step 6: Setting SoC in analyzer")
-                analyzer.set_soc(self.app.soc)
-                self.logger.info("Step 6: SoC set successfully")
-            except Exception as e:
-                self.logger.error(f"Step 6 failed - Error setting SoC: {str(e)}")
-                raise
-
-            # Step 7: Pre-processing
-            try:
-                self.logger.info("Step 7: Running pre-processing")
-                analyzer.pre_processing()
-                self.logger.info("Step 7: Pre-processing completed")
-            except Exception as e:
-                self.logger.error(f"Step 7 failed - Error in pre-processing: {str(e)}")
-                raise
-            # Step 8: Run analysis
-            try:
-                self.logger.info("Step 8: Running analysis")
-                self.per_kernel_dfs = analyzer.run_kernel_analysis()
-                self.top_kernels = analyzer.run_top_kernel()
-
-                # TODO: add per kernel Roofline support when available
-
-                if not self.per_kernel_dfs or not self.top_kernels:
-                    warning_msg = (
-                        "Step 8: Per Kernel Analysis completed but not all data "
-                        "was returned"
-                    )
-                    self._update_view(warning_msg, LogLevel.WARNING)
-                    self.logger.warning(warning_msg)
-                else:
-                    self.app.call_from_thread(self.refresh_results)
-                    self.logger.info("Step 8: Kernel Analysis completed successfully")
-                    # self.logger.info(f"{self.per_kernel_dfs}")
-            except Exception as e:
-                self.logger.error(f"Step 8 failed - Error running analysis: {str(e)}")
-                raise
+            else:
+                self.app.call_from_thread(self.refresh_results)
+                self.logger.info("Kernel Analysis completed successfully")
+                # self.logger.info(f"{self.kernel_to_df_dict}")

        except Exception as e:
            import traceback

-            error_msg = f"Unexpected error during analysis: {str(e)}"
-            self.logger.error(error_msg)
-            self.logger.error(f"Full traceback:\n{traceback.format_exc()}")
-            self._update_view(error_msg, LogLevel.ERROR)
-
-    def _update_view(self, message: str, log_level: LogLevel) -> None:
-        try:
-            self.app.call_from_thread(self._safe_update_view, message, log_level)
-        except Exception as e:
-            self.logger.error(f"View update scheduling error: {str(e)}")
-
-    def _safe_update_view(self, message: str, log_level: LogLevel) -> None:
-        try:
-            kernel_view = self.query_one("#kernel-view")
-            if kernel_view:
-                kernel_view.update_view(message, log_level)
-            else:
-                self.logger.warning("Analysis view not found when updating log")
-        except Exception as e:
-            self.logger.error(f"Log update error: {str(e)}")
+            error_msg = f"Analysis failed: {str(e)}"
+            self.logger.error(f"{error_msg}\n{traceback.format_exc()}")
+            self.app.call_from_thread(
+                lambda: self.query_one("#kernel-view").update_view(
+                    error_msg, LogLevel.ERROR
+                )
+            )

    def refresh_results(self) -> None:
-        try:
-            self.logger.info("Refreshing kernel results")
-            kernel_view = self.query_one("#kernel-view")
-            if not kernel_view:
-                self.logger.error("Kernel view not found")
-                return
-
-            if (
-                not hasattr(self, "per_kernel_dfs")
-                or self.per_kernel_dfs is None
-                or not hasattr(self, "top_kernels")
-                or self.top_kernels is None
-            ):
-                self.logger.error("No kernel analysis data available to display")
-                return
-
-            kernel_view.update_results(self.per_kernel_dfs, self.top_kernels)
+        kernel_view = self.query_one("#kernel-view")
+        if kernel_view:
+            kernel_view.update_results(self.kernel_to_df_dict, self.top_kernel_to_df_list)
            self.logger.success("Results displayed successfully.")
-        except Exception as e:
-            self.logger.error(f"Error refreshing results: {str(e)}")
+        else:
+            self.logger.error("Kernel view not found or no data available")

    def refresh_view(self) -> None:
-        self.logger.info("Refreshing view...")
-        if self.top_kernels:
+        if self.kernel_to_df_dict and self.top_kernel_to_df_list:
            self.refresh_results()
        else:
            self.logger.warning("No data available for refresh")
@@ -23,7 +23,8 @@

 ##############################################################################

-from typing import Any, Dict, List, Optional
+
+from typing import Any, Dict, List

 import pandas as pd
 import yaml
@@ -31,7 +32,6 @@ from textual.widgets import Collapsible, DataTable, Label

 from rocprof_compute_tui.widgets.charts import (
    MemoryChart,
-    RooflinePlot,
    SimpleBar,
    SimpleBox,
    SimpleMultiBar,
@@ -40,17 +40,38 @@ from rocprof_compute_tui.widgets.charts import (

 def create_table(df: pd.DataFrame) -> DataTable:
    table = DataTable(zebra_stripes=True)
-
    df = df.reset_index()
    df = df[~df.apply(lambda row: row.astype(str).str.strip().eq("").any(), axis=1)]
-
    str_columns = [str(col) for col in df.columns]
    table.add_columns(*str_columns)
    table.add_rows([tuple(str(x) for x in row) for row in df.itertuples(index=False)])
-
    return table


+def create_widget_from_data(df: pd.DataFrame, tui_style: str = None, context: str = ""):
+    if df is None or df.empty:
+        return Label(
+            f"Data not available{f' for {context}' if context else ''}", classes="warning"
+        )
+
+    match tui_style:
+        # TODO: implement tui_style == "roofline"
+        # case "roofline":
+        #     return Roofline(df)
+        case None:
+            return create_table(df)
+        case "mem_chart":
+            return MemoryChart(df)
+        case "simple_bar":
+            return SimpleBar(df)
+        case "simple_box":
+            return SimpleBox(df)
+        case "simple_multiple_bar":
+            return SimpleMultiBar(df)
+        case _:
+            return Label(f"Unknown display type: {tui_style}")
+
+
 def load_config(config_path) -> Dict[str, Any]:
    try:
        with open(config_path, "r") as file:
@@ -66,257 +87,60 @@ def load_config(config_path) -> Dict[str, Any]:
        raise ValueError(f"Error parsing YAML configuration: {e}")


-def get_data_from_path(dfs: Dict[str, Any], path: List[str]) -> Optional[pd.DataFrame]:
-    try:
-        current = dfs
-        for key in path:
-            current = current[key]
-        return current["df"]
-    except (KeyError, TypeError):
-        return None
-
-
-def get_tui_style_from_path(dfs: Dict[str, Any], path: List[str]) -> Optional[str]:
-    try:
-        current = dfs
-        for key in path:
-            current = current[key]
-        return current.get("tui_style")
-    except (KeyError, TypeError):
-        return None
-
-
-def create_widget_from_data(df: pd.DataFrame, tui_style: Optional[str] = None) -> Any:
-    if df is not None and not df.empty:
-        match tui_style:  # noqa
-            case None:
-                return create_table(df)
-
-            case "mem_chart":
-                return MemoryChart(df)
-
-            case "simple_bar":
-                return SimpleBar(df)
-
-            case "simple_box":
-                return SimpleBox(df)
-
-            case "simple_multiple_bar":
-                return SimpleMultiBar(df)
-
-            case _:
-                return Label(f"Unknown display type: {tui_style}")
-    else:
-        return Label(f"Data not available for display in {tui_style}.")
-
-
-def build_subsection(
-    subsection_config: Dict[str, Any], dfs: Dict[str, Any]
-) -> Collapsible:
-    title = subsection_config["title"]
-    collapsed = subsection_config.get("collapsed", True)
-    tui_style = subsection_config.get("tui_style")
-
-    # Handle data-driven widgets
-    if "data_path" in subsection_config:
-        data_path = subsection_config["data_path"]
-
-        if tui_style is None:
-            tui_style = (
-                get_tui_style_from_path(dfs, data_path) if dfs is not None else None
-            )
-
-        df = get_data_from_path(dfs, data_path)
-
-        if df is None and tui_style is None:
-            error_msg = (
-                f"{title} data not available: Path {' -> '.join(data_path)} not found"
-            )
-            return Collapsible(
-                Label(error_msg, classes="warning"), title=title, collapsed=collapsed
-            )
-
-        # Create main widget
-        widget = create_widget_from_data(df, tui_style)
-
-        # Add header label if specified
-        widgets = []
-        if "header_label" in subsection_config:
-            header_class = subsection_config.get("header_class", "")
-            widgets.append(
-                Label(subsection_config["header_label"], classes=header_class)
-            )
-
-        widgets.append(widget)
-
-        collapsible = Collapsible(*widgets, title=title, collapsed=collapsed)
-    elif tui_style == "roofline":
-        if dfs["4. Roofline"]:
-            widget = RooflinePlot(dfs)
-            collapsible = Collapsible(widget, title=title, collapsed=collapsed)
-        else:
-            return None
-    # Fallback for subsections without data or style
-    else:
-        collapsible = Collapsible(
-            Label(f"No data or style configuration for {title}"),
-            title=title,
-            collapsed=collapsed,
-        )
-
-    # Add ID if specified
-    if "widget_id" in subsection_config:
-        collapsible.id = subsection_config["widget_id"]
-
-    return collapsible
-
-
-def build_kernel_sections(
-    dfs: Dict[str, Any], skip_sections: List[str]
-) -> List[Collapsible]:
-    children = []
-
-    def add_warning(message: str):
-        children.append(Label(message, classes="warning"))
-
-    def validate_data_structure(data, name: str, parent_name: str = None) -> bool:
-        if data is None:
-            location = f"'{parent_name}' > '{name}'" if parent_name else f"'{name}'"
-            add_warning(f"Analysis result for {location} is not available")
-            return False
-
-        if not isinstance(data, dict):
-            location = f"'{parent_name}' > '{name}'" if parent_name else f"'{name}'"
-            add_warning(
-                f"Analysis result for {location} is not a dictionary type: {type(data)}"
-            )
-            return False
-
-        return True
-
-    def create_safe_widget(subsection_name: str, data: dict, section_name: str):
-        if not (isinstance(data, dict) and "df" in data):
-            add_warning(
-                (
-                    f"Invalid data structure for '{subsection_name}' "
-                    f"in section '{section_name}'"
-                )
-            )
-            return None
-
-        try:
-            if data["df"] is None or data["df"].empty:
-                return None
-            tui_style = data.get("tui_style")
-            widget = create_widget_from_data(data["df"], tui_style)
-
-            if widget is None:
-                add_warning(f"Widget creation returned None for '{subsection_name}'")
-                return None
-
-            return widget
-        except Exception as e:
-            add_warning(f"Failed to create widget for '{subsection_name}': {str(e)}")
-            return None
-
-    def create_safe_collapsible(widget, title):
-        try:
-            return Collapsible(widget, title=title, collapsed=True)
-        except Exception as e:
-            add_warning(f"Failed to create collapsible for '{title}': {str(e)}")
-            return None
-
-    try:
-        if not validate_data_structure(dfs, "analysis result"):
-            return children
-
-        for section_name, subsections in dfs.items():
-            if section_name in skip_sections:
-                continue
-
-            if not validate_data_structure(subsections, section_name):
-                continue
-
-            kernel_children = []
-            for subsection_name, data in subsections.items():
-                try:
-                    widget = create_safe_widget(subsection_name, data, section_name)
-                    if widget:
-                        collapsible = create_safe_collapsible(widget, subsection_name)
-                        if collapsible:
-                            kernel_children.append(collapsible)
-                except Exception as e:
-                    add_warning(
-                        (
-                            f"Error processing subsection '{subsection_name}' "
-                            f"in section '{section_name}': {str(e)}"
-                        )
-                    )
-
-            if kernel_children:
-                try:
-                    section_collapsible = Collapsible(
-                        *kernel_children, title=section_name, collapsed=True
-                    )
-                    children.append(section_collapsible)
-                except Exception as e:
-                    add_warning(
-                        (
-                            "Failed to create collapsible for section "
-                            f"'{section_name}': {str(e)}"
-                        )
-                    )
-
-    except Exception as e:
-        add_warning(f"Unexpected error in Kernel Section processing: {str(e)}")
-
-    return children
-
-
 def build_section_from_config(
-    section_config: Dict[str, Any], dfs: Dict[str, Any]
+    dfs: Dict[str, Any], section_config: Dict[str, Any]
 ) -> Collapsible:
    title = section_config["title"]
    collapsed = section_config.get("collapsed", True)
-    css_class = section_config.get("class")

-    # Handle under construction sections
-    if section_config.get("under_construction", False):
-        construction_label = section_config.get(
-            "construction_label", "Under Construction"
-        )
-        construction_class = section_config.get("construction_class", "")
-        children = [Label(construction_label, classes=construction_class)]
+    children = []
+    for subsection_config in section_config["subsections"]:
+        # Handle arch_config_data
+        if subsection_config.get("arch_config_data", False):
+            if isinstance(dfs, dict):
+                exclude_keys = subsection_config.get("exclude_keys", [])
+                for section_name, subsections in dfs.items():
+                    if section_name not in exclude_keys and isinstance(subsections, dict):
+                        kernel_children = []
+                        for subsection_name, data in subsections.items():
+                            if isinstance(data, dict) and "df" in data:
+                                widget = create_widget_from_data(
+                                    data["df"], data.get("tui_style"), subsection_name
+                                )
+                                kernel_children.append(
+                                    Collapsible(
+                                        widget, title=subsection_name, collapsed=True
+                                    )
+                                )

-    # Handle dynamic sections (like kernel sections)
-    elif section_config.get("dynamic_sections", False):
-        skip_sections = section_config.get("skip_sections", [])
-        children = build_kernel_sections(dfs, skip_sections)
+                        if kernel_children:
+                            children.append(
+                                Collapsible(
+                                    *kernel_children, title=section_name, collapsed=True
+                                )
+                            )
+        else:
+            # Handle data_path
+            tui_style = subsection_config.get("tui_style")
+            data_path = subsection_config["data_path"]

-    # Handle regular sections with subsections
-    elif "subsections" in section_config:
-        children = []
-        for subsection_config in section_config["subsections"]:
-            try:
-                subsection = build_subsection(subsection_config, dfs)
-                if subsection:
-                    children.append(subsection)
-            except Exception as e:
-                error_msg = (
-                    f"{subsection_config.get('title', 'Unknown')} error: {str(e)}"
+            df = dfs.get(data_path[0], {}).get(data_path[1], {})
+            df = df.get("df") if isinstance(df, dict) else None
+            if df is not None and isinstance(df, dict) and tui_style is None:
+                tui_style = df.get("tui_style")
+
+            widgets = [
+                create_widget_from_data(df, tui_style, f"path {' -> '.join(data_path)}")
+            ]
+
+            children.append(
+                Collapsible(
+                    *widgets,
+                    title=subsection_config.get("title", "Untitled"),
+                    collapsed=subsection_config.get("collapsed", True),
                )
-                children.append(Label(error_msg, classes="warning"))
-    else:
-        children = [Label("No configuration provided for this section")]
-
-    # Create the main collapsible
-    collapsible = Collapsible(*children, title=title, collapsed=collapsed)
-
-    # Add CSS class if specified
-    if css_class:
-        collapsible.add_class(css_class)
-
-    return collapsible
+            )
+    return Collapsible(*children, title=title, collapsed=collapsed)


 def build_all_sections(dfs: Dict[str, Any], config_path) -> List[Collapsible]:
@@ -324,17 +148,7 @@ def build_all_sections(dfs: Dict[str, Any], config_path) -> List[Collapsible]:
    sections = []

    for section_config in config["sections"]:
-        try:
-            section = build_section_from_config(section_config, dfs)
-            sections.append(section)
-        except Exception as e:
-            # Create error section if something goes wrong
-            error_title = section_config.get("title", "Unknown Section")
-            error_section = Collapsible(
-                Label(f"Error building section: {str(e)}", classes="error"),
-                title=f"❌ {error_title}",
-                collapsed=True,
-            )
-            sections.append(error_section)
+        section = build_section_from_config(dfs, section_config)
+        sections.append(section)

    return sections
@@ -63,24 +63,24 @@ def load_sys_info(f):
    return pd.read_csv(f)


-def load_panel_configs(dir):
+def load_panel_configs(dirs):
    """
    Load all panel configs from yaml file.
    """
    d = {}
-    for root, dirs, files in os.walk(dir):
-        for f in files:
-            if f.endswith(".yaml"):
-                with open(str(Path(root).joinpath(f))) as file:
-                    config = yaml.safe_load(file)
-                    # metric key can be None due to some metric tables
-                    # not having any metrics
-                    # metric key should be empty dict instead of None
-                    for data_source in config["Panel Config"]["data source"]:
-                        metric_table = data_source.get("metric_table")
-                        if metric_table and metric_table["metric"] is None:
-                            metric_table["metric"] = {}
-                    d[config["Panel Config"]["id"]] = config["Panel Config"]
+    for dir in dirs:
+        for root, _, files in os.walk(dir):
+            for f in files:
+                if f.endswith(".yaml"):
+                    with open(Path(root) / f) as file:
+                        config_yml = yaml.safe_load(file)
+                        # metric key can be None due to some metric tables not having any metrics
+                        # metric key should be empty dict instead of None
+                        for data_source in config_yml["Panel Config"]["data source"]:
+                            metric_table = data_source.get("metric_table")
+                            if metric_table and metric_table["metric"] is None:
+                                metric_table["metric"] = {}
+                        d[config_yml["Panel Config"]["id"]] = config_yml["Panel Config"]

    # TODO: sort metrics as the header order in case they-
    # are not defined in the same order
@@ -160,9 +160,9 @@ def create_df_kernel_top_stats(
        axis=1,
    )

-    grouped = time_stats.groupby(by=["Kernel_Name"]).agg({
-        "ExeTime": ["count", "sum", "mean", "median"]
-    })
+    grouped = time_stats.groupby(by=["Kernel_Name"]).agg(
+        {"ExeTime": ["count", "sum", "mean", "median"]}
+    )

    time_unit_str = "(" + time_unit + ")"
    grouped.columns = [