diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0400_roofline.yaml
index 66c656fb4c..9c80f75e80 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0400_roofline.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0400_roofline.yaml
@@ -3,9 +3,17 @@ Panel Config:
   id: 400
   title: Roofline
   metrics_description:
-    VALU FLOPs: 'The total floating-point operations executed per second on the VALU.
-      This is also presented as a percent of the peak theoretical FLOPs achievable
-      on the specific accelerator. Note: this does not include any floating-point
+    VALU FLOPs (F16): 'The total 16-bit floating-point operations executed per second
+      on the VALU. This is presented with the value of the peak empirical F16 FLOPs
+      achievable on the specific accelerator. Note: this does not include any F16
+      operations from MFMA instructions.'
+    VALU FLOPs (F32): 'The total 32-bit floating-point operations executed per second
+      on the VALU. This is presented with the value of the peak empirical F32 FLOPs
+      achievable on the specific accelerator. Note: this does not include any F32
+      operations from MFMA instructions.'
+    VALU FLOPs (F64): 'The total 64-bit floating-point operations executed per second
+      on the VALU. This is presented with the value of the peak empirical F64 FLOPs
+      achievable on the specific accelerator. Note: this does not include any F64
       operations from MFMA instructions.'
     MFMA FLOPs (F8): The total number of 8-bit brain floating point MFMA operations
       executed per second. This does not include any 16-bit brain floating point operations
@@ -28,6 +36,11 @@ Panel Config:
       per second. Note: this does not include any 64-bit floating point operations
       from VALU instructions. The peak empirically measured F64 MFMA operations achievable
       on the specific accelerator is displayed alongside for comparison.'
+    MFMA FLOPs (F6F4): 'The total number of 4-bit and 6-bit floating point MFMA operations
+      executed per second. Note: this does not include any floating point operations
+      from VALU instructions. The peak empirically measured F6F4 MFMA operations achievable
+      on the specific accelerator is displayed alongside for comparison. It is supported
+      on AMD Instinct MI350 series (gfx950) and later only.'
     MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed
       per second. Note: this does not include any 8-bit integer operations from VALU
       instructions. The peak empirically measured INT8 MFMA operations achievable
@@ -80,15 +93,24 @@ Panel Config:
         unit: Unit
         peak: Peak (Empirical)
       metric:
-        VALU FLOPs:
-          value: AVG((($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16
-            + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32
-            + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32)
-            + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64)
-            + SQ_INSTS_VALU_TRANS_F64) )) / ((End_Timestamp - Start_Timestamp) / 1e9))
-            / 1e9)
+        VALU FLOPs (F16):
+          value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16
+            + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16 )) / ((End_Timestamp
+            - Start_Timestamp) / 1e9)) / 1e9)
           unit: GFLOP/s
-          peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
+          peak: $FP16Flops_empirical_peak
+        VALU FLOPs (F32):
+          value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32
+            + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32 )) / ((End_Timestamp
+            - Start_Timestamp) / 1e9)) / 1e9)
+          unit: GFLOP/s
+          peak: $FP32Flops_empirical_peak
+        VALU FLOPs (F64):
+          value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64
+            + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64 )) / ((End_Timestamp
+            - Start_Timestamp) / 1e9)) / 1e9)
+          unit: GFLOP/s
+          peak: $FP64Flops_empirical_peak
         MFMA FLOPs (F64):
           value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp)
             / 1e9)) / 1e9)
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline.yaml
index 38af3367e9..64ed38e23e 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline.yaml
@@ -3,9 +3,17 @@ Panel Config:
   id: 400
   title: Roofline
   metrics_description:
-    VALU FLOPs: 'The total floating-point operations executed per second on the VALU.
-      This is also presented as a percent of the peak theoretical FLOPs achievable
-      on the specific accelerator. Note: this does not include any floating-point
+    VALU FLOPs (F16): 'The total 16-bit floating-point operations executed per second
+      on the VALU. This is presented with the value of the peak empirical F16 FLOPs
+      achievable on the specific accelerator. Note: this does not include any F16
+      operations from MFMA instructions.'
+    VALU FLOPs (F32): 'The total 32-bit floating-point operations executed per second
+      on the VALU. This is presented with the value of the peak empirical F32 FLOPs
+      achievable on the specific accelerator. Note: this does not include any F32
+      operations from MFMA instructions.'
+    VALU FLOPs (F64): 'The total 64-bit floating-point operations executed per second
+      on the VALU. This is presented with the value of the peak empirical F64 FLOPs
+      achievable on the specific accelerator. Note: this does not include any F64
       operations from MFMA instructions.'
     MFMA FLOPs (F8): The total number of 8-bit brain floating point MFMA operations
       executed per second. This does not include any 16-bit brain floating point operations
@@ -28,6 +36,11 @@ Panel Config:
       per second. Note: this does not include any 64-bit floating point operations
       from VALU instructions. The peak empirically measured F64 MFMA operations achievable
       on the specific accelerator is displayed alongside for comparison.'
+    MFMA FLOPs (F6F4): 'The total number of 4-bit and 6-bit floating point MFMA operations
+      executed per second. Note: this does not include any floating point operations
+      from VALU instructions. The peak empirically measured F6F4 MFMA operations achievable
+      on the specific accelerator is displayed alongside for comparison. It is supported
+      on AMD Instinct MI350 series (gfx950) and later only.'
     MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed
       per second. Note: this does not include any 8-bit integer operations from VALU
       instructions. The peak empirically measured INT8 MFMA operations achievable
@@ -80,15 +93,24 @@ Panel Config:
         unit: Unit
         peak: Peak (Empirical)
       metric:
-        VALU FLOPs:
-          value: AVG((($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16
-            + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32
-            + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32)
-            + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64)
-            + SQ_INSTS_VALU_TRANS_F64) )) / ((End_Timestamp - Start_Timestamp) / 1e9))
-            / 1e9)
+        VALU FLOPs (F16):
+          value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16
+            + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16 )) / ((End_Timestamp
+            - Start_Timestamp) / 1e9)) / 1e9)
           unit: GFLOP/s
-          peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
+          peak: $FP16Flops_empirical_peak
+        VALU FLOPs (F32):
+          value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32
+            + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32 )) / ((End_Timestamp
+            - Start_Timestamp) / 1e9)) / 1e9)
+          unit: GFLOP/s
+          peak: $FP32Flops_empirical_peak
+        VALU FLOPs (F64):
+          value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64
+            + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64 )) / ((End_Timestamp
+            - Start_Timestamp) / 1e9)) / 1e9)
+          unit: GFLOP/s
+          peak: $FP64Flops_empirical_peak
         MFMA FLOPs (F64):
           value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp)
             / 1e9)) / 1e9)
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline.yaml
index 839c04fd2e..b06371c850 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline.yaml
@@ -3,9 +3,17 @@ Panel Config:
   id: 400
   title: Roofline
   metrics_description:
-    VALU FLOPs: 'The total floating-point operations executed per second on the VALU.
-      This is also presented as a percent of the peak theoretical FLOPs achievable
-      on the specific accelerator. Note: this does not include any floating-point
+    VALU FLOPs (F16): 'The total 16-bit floating-point operations executed per second
+      on the VALU. This is presented with the value of the peak empirical F16 FLOPs
+      achievable on the specific accelerator. Note: this does not include any F16
+      operations from MFMA instructions.'
+    VALU FLOPs (F32): 'The total 32-bit floating-point operations executed per second
+      on the VALU. This is presented with the value of the peak empirical F32 FLOPs
+      achievable on the specific accelerator. Note: this does not include any F32
+      operations from MFMA instructions.'
+    VALU FLOPs (F64): 'The total 64-bit floating-point operations executed per second
+      on the VALU. This is presented with the value of the peak empirical F64 FLOPs
+      achievable on the specific accelerator. Note: this does not include any F64
       operations from MFMA instructions.'
     MFMA FLOPs (F8): The total number of 8-bit brain floating point MFMA operations
       executed per second. This does not include any 16-bit brain floating point operations
@@ -28,6 +36,11 @@ Panel Config:
       per second. Note: this does not include any 64-bit floating point operations
       from VALU instructions. The peak empirically measured F64 MFMA operations achievable
       on the specific accelerator is displayed alongside for comparison.'
+    MFMA FLOPs (F6F4): 'The total number of 4-bit and 6-bit floating point MFMA operations
+      executed per second. Note: this does not include any floating point operations
+      from VALU instructions. The peak empirically measured F6F4 MFMA operations achievable
+      on the specific accelerator is displayed alongside for comparison. It is supported
+      on AMD Instinct MI350 series (gfx950) and later only.'
     MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed
       per second. Note: this does not include any 8-bit integer operations from VALU
       instructions. The peak empirically measured INT8 MFMA operations achievable
@@ -80,15 +93,24 @@ Panel Config:
         unit: Unit
         peak: Peak (Empirical)
       metric:
-        VALU FLOPs:
-          value: AVG(($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16
-            + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32
-            + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32)
-            + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64)
-            + SQ_INSTS_VALU_TRANS_F64) ) / ((End_Timestamp - Start_Timestamp) / 1e9))
-            / 1e9)
+        VALU FLOPs (F16):
+          value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16
+            + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16 )) / ((End_Timestamp
+            - Start_Timestamp) / 1e9)) / 1e9)
           unit: GFLOP/s
-          peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
+          peak: $FP16Flops_empirical_peak
+        VALU FLOPs (F32):
+          value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32
+            + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32 )) / ((End_Timestamp
+            - Start_Timestamp) / 1e9)) / 1e9)
+          unit: GFLOP/s
+          peak: $FP32Flops_empirical_peak
+        VALU FLOPs (F64):
+          value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64
+            + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64 )) / ((End_Timestamp
+            - Start_Timestamp) / 1e9)) / 1e9)
+          unit: GFLOP/s
+          peak: $FP64Flops_empirical_peak
         MFMA FLOPs (F64):
           value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp)
             / 1e9)) / 1e9)
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline.yaml
index f9f4d7cc19..c847403bb1 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline.yaml
@@ -3,9 +3,17 @@ Panel Config:
   id: 400
   title: Roofline
   metrics_description:
-    VALU FLOPs: 'The total floating-point operations executed per second on the VALU.
-      This is also presented as a percent of the peak theoretical FLOPs achievable
-      on the specific accelerator. Note: this does not include any floating-point
+    VALU FLOPs (F16): 'The total 16-bit floating-point operations executed per second
+      on the VALU. This is presented with the value of the peak empirical F16 FLOPs
+      achievable on the specific accelerator. Note: this does not include any F16
+      operations from MFMA instructions.'
+    VALU FLOPs (F32): 'The total 32-bit floating-point operations executed per second
+      on the VALU. This is presented with the value of the peak empirical F32 FLOPs
+      achievable on the specific accelerator. Note: this does not include any F32
+      operations from MFMA instructions.'
+    VALU FLOPs (F64): 'The total 64-bit floating-point operations executed per second
+      on the VALU. This is presented with the value of the peak empirical F64 FLOPs
+      achievable on the specific accelerator. Note: this does not include any F64
       operations from MFMA instructions.'
     MFMA FLOPs (F8): The total number of 8-bit brain floating point MFMA operations
       executed per second. This does not include any 16-bit brain floating point operations
@@ -28,6 +36,11 @@ Panel Config:
       per second. Note: this does not include any 64-bit floating point operations
       from VALU instructions. The peak empirically measured F64 MFMA operations achievable
       on the specific accelerator is displayed alongside for comparison.'
+    MFMA FLOPs (F6F4): 'The total number of 4-bit and 6-bit floating point MFMA operations
+      executed per second. Note: this does not include any floating point operations
+      from VALU instructions. The peak empirically measured F6F4 MFMA operations achievable
+      on the specific accelerator is displayed alongside for comparison. It is supported
+      on AMD Instinct MI350 series (gfx950) and later only.'
     MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed
       per second. Note: this does not include any 8-bit integer operations from VALU
       instructions. The peak empirically measured INT8 MFMA operations achievable
@@ -80,15 +93,24 @@ Panel Config:
         unit: Unit
         peak: Peak (Empirical)
       metric:
-        VALU FLOPs:
-          value: AVG(($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16
-            + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32
-            + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32)
-            + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64)
-            + SQ_INSTS_VALU_TRANS_F64) ) / ((End_Timestamp - Start_Timestamp) / 1e9))
-            / 1e9)
+        VALU FLOPs (F16):
+          value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16
+            + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16 )) / ((End_Timestamp
+            - Start_Timestamp) / 1e9)) / 1e9)
           unit: GFLOP/s
-          peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
+          peak: $FP16Flops_empirical_peak
+        VALU FLOPs (F32):
+          value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32
+            + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32 )) / ((End_Timestamp
+            - Start_Timestamp) / 1e9)) / 1e9)
+          unit: GFLOP/s
+          peak: $FP32Flops_empirical_peak
+        VALU FLOPs (F64):
+          value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64
+            + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64 )) / ((End_Timestamp
+            - Start_Timestamp) / 1e9)) / 1e9)
+          unit: GFLOP/s
+          peak: $FP64Flops_empirical_peak
         MFMA FLOPs (F64):
           value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp)
             / 1e9)) / 1e9)
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline.yaml
index 9ba1e6f1fa..c951110895 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline.yaml
@@ -3,9 +3,17 @@ Panel Config:
   id: 400
   title: Roofline
   metrics_description:
-    VALU FLOPs: 'The total floating-point operations executed per second on the VALU.
-      This is also presented as a percent of the peak theoretical FLOPs achievable
-      on the specific accelerator. Note: this does not include any floating-point
+    VALU FLOPs (F16): 'The total 16-bit floating-point operations executed per second
+      on the VALU. This is presented with the value of the peak empirical F16 FLOPs
+      achievable on the specific accelerator. Note: this does not include any F16
+      operations from MFMA instructions.'
+    VALU FLOPs (F32): 'The total 32-bit floating-point operations executed per second
+      on the VALU. This is presented with the value of the peak empirical F32 FLOPs
+      achievable on the specific accelerator. Note: this does not include any F32
+      operations from MFMA instructions.'
+    VALU FLOPs (F64): 'The total 64-bit floating-point operations executed per second
+      on the VALU. This is presented with the value of the peak empirical F64 FLOPs
+      achievable on the specific accelerator. Note: this does not include any F64
       operations from MFMA instructions.'
     MFMA FLOPs (F8): The total number of 8-bit brain floating point MFMA operations
       executed per second. This does not include any 16-bit brain floating point operations
@@ -28,6 +36,11 @@ Panel Config:
       per second. Note: this does not include any 64-bit floating point operations
       from VALU instructions. The peak empirically measured F64 MFMA operations achievable
       on the specific accelerator is displayed alongside for comparison.'
+    MFMA FLOPs (F6F4): 'The total number of 4-bit and 6-bit floating point MFMA operations
+      executed per second. Note: this does not include any floating point operations
+      from VALU instructions. The peak empirically measured F6F4 MFMA operations achievable
+      on the specific accelerator is displayed alongside for comparison. It is supported
+      on AMD Instinct MI350 series (gfx950) and later only.'
     MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed
       per second. Note: this does not include any 8-bit integer operations from VALU
       instructions. The peak empirically measured INT8 MFMA operations achievable
@@ -80,15 +93,24 @@ Panel Config:
         unit: Unit
         peak: Peak (Empirical)
       metric:
-        VALU FLOPs:
-          value: AVG((($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16
-            + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32
-            + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32)
-            + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64)
-            + SQ_INSTS_VALU_TRANS_F64) )) / ((End_Timestamp - Start_Timestamp) / 1e9))
-            / 1e9)
+        VALU FLOPs (F16):
+          value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16
+            + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16 )) / ((End_Timestamp
+            - Start_Timestamp) / 1e9)) / 1e9)
           unit: GFLOP/s
-          peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
+          peak: $FP16Flops_empirical_peak
+        VALU FLOPs (F32):
+          value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32
+            + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32 )) / ((End_Timestamp
+            - Start_Timestamp) / 1e9)) / 1e9)
+          unit: GFLOP/s
+          peak: $FP32Flops_empirical_peak
+        VALU FLOPs (F64):
+          value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64
+            + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64 )) / ((End_Timestamp
+            - Start_Timestamp) / 1e9)) / 1e9)
+          unit: GFLOP/s
+          peak: $FP64Flops_empirical_peak
         MFMA FLOPs (F64):
           value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp)
             / 1e9)) / 1e9)
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml
index 500c7ff805..4064a4a84b 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml
@@ -3,9 +3,17 @@ Panel Config:
   id: 400
   title: Roofline
   metrics_description:
-    VALU FLOPs: 'The total floating-point operations executed per second on the VALU.
-      This is also presented as a percent of the peak theoretical FLOPs achievable
-      on the specific accelerator. Note: this does not include any floating-point
+    VALU FLOPs (F16): 'The total 16-bit floating-point operations executed per second
+      on the VALU. This is presented with the value of the peak empirical F16 FLOPs
+      achievable on the specific accelerator. Note: this does not include any F16
+      operations from MFMA instructions.'
+    VALU FLOPs (F32): 'The total 32-bit floating-point operations executed per second
+      on the VALU. This is presented with the value of the peak empirical F32 FLOPs
+      achievable on the specific accelerator. Note: this does not include any F32
+      operations from MFMA instructions.'
+    VALU FLOPs (F64): 'The total 64-bit floating-point operations executed per second
+      on the VALU. This is presented with the value of the peak empirical F64 FLOPs
+      achievable on the specific accelerator. Note: this does not include any F64
       operations from MFMA instructions.'
     MFMA FLOPs (F8): The total number of 8-bit brain floating point MFMA operations
       executed per second. This does not include any 16-bit brain floating point operations
@@ -28,6 +36,11 @@ Panel Config:
       per second. Note: this does not include any 64-bit floating point operations
       from VALU instructions. The peak empirically measured F64 MFMA operations achievable
       on the specific accelerator is displayed alongside for comparison.'
+    MFMA FLOPs (F6F4): 'The total number of 4-bit and 6-bit floating point MFMA operations
+      executed per second. Note: this does not include any floating point operations
+      from VALU instructions. The peak empirically measured F6F4 MFMA operations achievable
+      on the specific accelerator is displayed alongside for comparison. It is supported
+      on AMD Instinct MI350 series (gfx950) and later only.'
     MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed
       per second. Note: this does not include any 8-bit integer operations from VALU
       instructions. The peak empirically measured INT8 MFMA operations achievable
@@ -80,15 +93,24 @@ Panel Config:
         unit: Unit
         peak: Peak (Empirical)
       metric:
-        VALU FLOPs:
-          value: AVG((($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16
-            + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32
-            + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32)
-            + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64)
-            + SQ_INSTS_VALU_TRANS_F64) )) / ((End_Timestamp - Start_Timestamp) / 1e9))
-            / 1e9)
+        VALU FLOPs (F16):
+          value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16
+            + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16 )) / ((End_Timestamp
+            - Start_Timestamp) / 1e9)) / 1e9)
           unit: GFLOP/s
-          peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
+          peak: $FP16Flops_empirical_peak
+        VALU FLOPs (F32):
+          value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32
+            + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32 )) / ((End_Timestamp
+            - Start_Timestamp) / 1e9)) / 1e9)
+          unit: GFLOP/s
+          peak: $FP32Flops_empirical_peak
+        VALU FLOPs (F64):
+          value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64
+            + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64 )) / ((End_Timestamp
+            - Start_Timestamp) / 1e9)) / 1e9)
+          unit: GFLOP/s
+          peak: $FP64Flops_empirical_peak
         MFMA FLOPs (F64):
           value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp)
             / 1e9)) / 1e9)
diff --git a/projects/rocprofiler-compute/src/utils/parser.py b/projects/rocprofiler-compute/src/utils/parser.py
index 192d5db774..a2f625f206 100755
--- a/projects/rocprofiler-compute/src/utils/parser.py
+++ b/projects/rocprofiler-compute/src/utils/parser.py
@@ -771,7 +771,7 @@ def build_metric_value_string(dfs, dfs_type, normal_unit, profiling_config):
 
 
 def init_metric_evaluator(
-    raw_pmc_df: Union[pd.DataFrame, dict], ammolite_vars: dict
+    raw_pmc_df: Union[pd.DataFrame, dict], ammolite_vars: dict, empirical_peaks: dict
 ) -> None:
     if isinstance(raw_pmc_df, dict):
         raw_pmc_df_keys = set(raw_pmc_df.keys())
@@ -790,6 +790,7 @@ def init_metric_evaluator(
     # The process-local globals are used for performance optimization.
     globals().update(raw_pmc_df_items)
     globals().update(ammolite_vars)
+    globals().update(empirical_peaks)
 
 
 def run_metric_evaluator(row_expr: str) -> str:
@@ -821,6 +822,38 @@ def run_metric_evaluator(row_expr: str) -> str:
             console_error("analysis", str(ae))
 
 
+def create_empirical_peaks_dict(empirical_peaks_df):
+    """Build {ammolite__<col>_empirical_peak: value} from row 0; zeros if empty."""
+    empirical_peaks = {}
+
+    if not empirical_peaks_df.empty:
+        peak_data_row = empirical_peaks_df.iloc[0]
+        for col in empirical_peaks_df.columns:
+            empirical_peaks[f"ammolite__{col}_empirical_peak"] = peak_data_row[col]
+    else:
+        peak_names = [
+            "FP16Flops",
+            "FP32Flops",
+            "FP64Flops",
+            "MFMAF64Flops",
+            "MFMAF32Flops",
+            "MFMAF16Flops",
+            "MFMABF16Flops",
+            "MFMAF8Flops",
+            "MFMAI8Ops",
+            "HBMBw",
+            "L2Bw",
+            "L1Bw",
+            "LDSBw",
+            "MFMA_FLOPs_F6F4",
+        ]
+        # empty frame: no peak values to read, so expose every listed peak as 0
+        for peak_name in peak_names:
+            empirical_peaks[f"ammolite__{peak_name}_empirical_peak"] = 0
+
+    return empirical_peaks
+
+
 @demarcate
 def eval_metric(dfs, dfs_type, sys_info, empirical_peaks_df, raw_pmc_df, debug, config):
     """
@@ -927,32 +960,10 @@ def eval_metric(dfs, dfs_type, sys_info, empirical_peaks_df, raw_pmc_df, debug,
             "wave_size is not available in sysinfo.csv, please provide the correct "
             "value using --specs-correction"
         )
-    if not empirical_peaks_df.empty:
-        peak_data_row = empirical_peaks_df.iloc[0]
-        for metric_name in empirical_peaks_df.columns:
-            var_name = f"ammolite__{metric_name}_empirical_peak"
-            locals()[var_name] = peak_data_row[metric_name]
-    else:
-        default_peaks = [
-            "MFMAF64Flops",
-            "MFMAF32Flops",
-            "MFMAF16Flops",
-            "MFMABF16Flops",
-            "MFMAF8Flops",
-            "MFMAI8Ops",
-            "HBMBw",
-            "L2Bw",
-            "L1Bw",
-            "LDSBw",
-            "MFMA_FLOPs_F6F4",
-        ]
-        # set values to 0 if no no empirical peaks from roofline.csv are provided
-        for peak_name in default_peaks:
-            var_name = f"ammolite__{peak_name}_empirical_peak"
-            exec(f"{var_name} = 0", globals(), locals())
+
+    empirical_peaks = create_empirical_peaks_dict(empirical_peaks_df)
 
     # TODO: fix all $normUnit in Unit column or title
-
     # build and eval all derived build-in global variables
     ammolite__build_in = {}
 
@@ -966,6 +977,8 @@ def eval_metric(dfs, dfs_type, sys_info, empirical_peaks_df, raw_pmc_df, debug,
             ammolite__build_in[key] = eval(compile(s, "<string>", "eval"))
         except TypeError:
             ammolite__build_in[key] = None
+        except NameError:
+            ammolite__build_in[key] = None
         except KeyError:
             ammolite__build_in[key] = None
         except AttributeError as ae:
@@ -1022,12 +1035,32 @@ def eval_metric(dfs, dfs_type, sys_info, empirical_peaks_df, raw_pmc_df, debug,
                                     )
                                     if matched_vars:
                                         for v in matched_vars:
-                                            print(
-                                                "Var ",
-                                                v,
-                                                ":",
-                                                eval(compile(v, "<string>", "eval")),
-                                            )
+                                            try:
+                                                value = eval(
+                                                    compile(v, "<string>", "eval")
+                                                )
+                                                print("Var ", v, ":", value)
+                                            except NameError:
+                                                if "_empirical_peak" in v:
+                                                    if v in empirical_peaks:
+                                                        print(
+                                                            "Var ",
+                                                            v,
+                                                            ":",
+                                                            empirical_peaks[v],
+                                                        )
+                                                    else:
+                                                        print(
+                                                            "Var ",
+                                                            v,
+                                                            ": [empirical peak not found]",  # noqa
+                                                        )
+                                                else:
+                                                    print(
+                                                        "Var ",
+                                                        v,
+                                                        ": [not available in main thread]",  # noqa
+                                                    )
                                     matched_cols = re.findall(
                                         r"raw_pmc_df\['\w+'\]\['\w+'\]", row[expr]
                                     )
@@ -1063,6 +1096,21 @@ def eval_metric(dfs, dfs_type, sys_info, empirical_peaks_df, raw_pmc_df, debug,
                                             eval(compile(row[expr], "<string>", "eval"))
                                         )
                                         print("~" * 40)
+                                    except NameError as ne:
+                                        if "empirical_peak" in str(ne):
+                                            console_warning(
+                                                "Skipping debug evaluation. Empirical peak variables "  # noqa
+                                                "not available in main thread: {}".format(  # noqa
+                                                    str(ne)
+                                                )
+                                            )
+                                        else:
+                                            console_warning(
+                                                "Skipping debug evaluation. Variable not available: {}".format(  # noqa
+                                                    str(ne)
+                                                )
+                                            )
+                                        print("~" * 40)
                                     except TypeError:
                                         console_warning(
                                             "Skipping entry. Encountered a missing "
@@ -1100,7 +1148,6 @@ def eval_metric(dfs, dfs_type, sys_info, empirical_peaks_df, raw_pmc_df, debug,
     ammolite_vars = {
         key: val for key, val in locals().items() if key.startswith("ammolite__")
     }
-
     # Empirically, 16 is about as much as we need.
     processes = min(16, multiprocessing.cpu_count() // 2)
 
@@ -1108,7 +1155,7 @@ def eval_metric(dfs, dfs_type, sys_info, empirical_peaks_df, raw_pmc_df, debug,
     with multiprocessing.Pool(
         processes=processes,
         initializer=init_metric_evaluator,
-        initargs=(raw_pmc_df, ammolite_vars),
+        initargs=(raw_pmc_df, ammolite_vars, empirical_peaks),
     ) as pool:
         outs = pool.map(run_metric_evaluator, row_exprs)
 
diff --git a/projects/rocprofiler-compute/utils/autogen_hash.yaml b/projects/rocprofiler-compute/utils/autogen_hash.yaml
index 7079981108..7d4fc1150f 100644
--- a/projects/rocprofiler-compute/utils/autogen_hash.yaml
+++ b/projects/rocprofiler-compute/utils/autogen_hash.yaml
@@ -23,12 +23,12 @@ src/rocprof_compute_soc/analysis_configs/gfx940/0300_memory_chart.yaml: 249e9ae0
 src/rocprof_compute_soc/analysis_configs/gfx941/0300_memory_chart.yaml: 249e9ae0445de0927827ec14d20f946a07d50d92fd56e1993bbe0c17eb65bd51
 src/rocprof_compute_soc/analysis_configs/gfx942/0300_memory_chart.yaml: 249e9ae0445de0927827ec14d20f946a07d50d92fd56e1993bbe0c17eb65bd51
 src/rocprof_compute_soc/analysis_configs/gfx950/0300_memory_chart.yaml: 643b31ffa43bc3613d6f90b0c23d95093d0d0aa5bc8e72d9a0fbc1b739a08b67
-src/rocprof_compute_soc/analysis_configs/gfx908/0400_roofline.yaml: 6406ce67cd55064f0d2db2a3511c6536cc1625314ddb31366900fbf3c60ed523
-src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline.yaml: 100d555cf9e70b892e22f92ddd9c0a5d1f914d07077c4a8d35941e8ad62b5b30
-src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline.yaml: f8bf66f43c9afede4fd1f17c279050cc27cc6fbc1cdb53a71ae8ceb0eb84dc37
-src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline.yaml: 6fae04dcf4bcabe4a71f5d9eefc379a38d30cdf05fbb14e2c276e1c272fdb3f6
-src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline.yaml: c8dfe7df24f94dfa229ffa2035b802c6833ce98f7710e0889bc5710f2167d4c0
-src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml: 734fdfa818bfd8a87e01a0dd795c502a567c72158ca9b7bfe01e99451e8aa537
+src/rocprof_compute_soc/analysis_configs/gfx908/0400_roofline.yaml: f346786a63056303a0c6137f82bcea0baa3bb5bf63bfbef792df531725f64d1d
+src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline.yaml: 92aced41d421faf00ed542da38f47ad22c2fbf1f0383c550e255ed9bab95b0d8
+src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline.yaml: 867a31d84eeb5b5aa64606fcdb67732347acc43e88121fcc70af79169f44fd83
+src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline.yaml: 3f7bef2487df1ece7302de3f413a14032ba35d44fcf4e5ddee5186e2fc223797
+src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline.yaml: 58272f5d1136489255a7bf9c6ade720a0567b17ab58dc58ae796597ae4d73ce5
+src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml: 4362a8120d70cc0e62abd4367a09207897bbc3be44805092dabaefda41803391
 src/rocprof_compute_soc/analysis_configs/gfx908/0500_command_processor_cpc_cpf.yaml: da1c2997d42d66da2aa069caa741cf9eade124357c56e4290333de2f3e0412bb
 src/rocprof_compute_soc/analysis_configs/gfx90a/0500_command_processor_cpc_cpf.yaml: da1c2997d42d66da2aa069caa741cf9eade124357c56e4290333de2f3e0412bb
 src/rocprof_compute_soc/analysis_configs/gfx940/0500_command_processor_cpc_cpf.yaml: da1c2997d42d66da2aa069caa741cf9eade124357c56e4290333de2f3e0412bb
diff --git a/projects/rocprofiler-compute/utils/unified_config.yaml b/projects/rocprofiler-compute/utils/unified_config.yaml
index 531afa847b..0b8e23f1ed 100644
--- a/projects/rocprofiler-compute/utils/unified_config.yaml
+++ b/projects/rocprofiler-compute/utils/unified_config.yaml
@@ -2802,14 +2802,24 @@ panels:
         peak: Peak (Empirical)
       metric:
         gfx90a:
-          VALU FLOPs:
+          VALU FLOPs (F16):
             value: AVG((($wave_size * (
-              (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
-              (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
-              (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
+              SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16
               )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
             unit: GFLOP/s
-            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
+            peak: $FP16Flops_empirical_peak
+          VALU FLOPs (F32):
+            value: AVG((($wave_size * (
+              SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32
+              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
+            unit: GFLOP/s
+            peak: $FP32Flops_empirical_peak
+          VALU FLOPs (F64):
+            value: AVG((($wave_size * (
+              SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64
+              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
+            unit: GFLOP/s
+            peak: $FP64Flops_empirical_peak
           MFMA FLOPs (F64):
             value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
             unit: GFLOP/s
@@ -2855,14 +2865,24 @@ panels:
             unit: GB/s
             peak: $LDSBw_empirical_peak
         gfx908:
-          VALU FLOPs:
+          VALU FLOPs (F16):
             value: AVG((($wave_size * (
-              (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
-              (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
-              (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
+              SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16
               )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
             unit: GFLOP/s
-            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
+            peak: $FP16Flops_empirical_peak
+          VALU FLOPs (F32):
+            value: AVG((($wave_size * (
+              SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32
+              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
+            unit: GFLOP/s
+            peak: $FP32Flops_empirical_peak
+          VALU FLOPs (F64):
+            value: AVG((($wave_size * (
+              SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64
+              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
+            unit: GFLOP/s
+            peak: $FP64Flops_empirical_peak
           MFMA FLOPs (F64):
             value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
             unit: GFLOP/s
@@ -2908,14 +2928,24 @@ panels:
             unit: GB/s
             peak: $LDSBw_empirical_peak
         gfx940:
-          VALU FLOPs:
-            value: AVG(($wave_size * (
-              (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
-              (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
-              (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
-              ) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
+          VALU FLOPs (F16):
+            value: AVG((($wave_size * (
+              SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16
+              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
             unit: GFLOP/s
-            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
+            peak: $FP16Flops_empirical_peak
+          VALU FLOPs (F32):
+            value: AVG((($wave_size * (
+              SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32
+              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
+            unit: GFLOP/s
+            peak: $FP32Flops_empirical_peak
+          VALU FLOPs (F64):
+            value: AVG((($wave_size * (
+              SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64
+              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
+            unit: GFLOP/s
+            peak: $FP64Flops_empirical_peak
           MFMA FLOPs (F64):
             value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
             unit: GFLOP/s
@@ -2965,14 +2995,24 @@ panels:
             unit: GB/s
             peak: $LDSBw_empirical_peak
         gfx941:
-          VALU FLOPs:
-            value: AVG(($wave_size * (
-              (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
-              (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
-              (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
-              ) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
+          VALU FLOPs (F16):
+            value: AVG((($wave_size * (
+              SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16
+              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
             unit: GFLOP/s
-            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
+            peak: $FP16Flops_empirical_peak
+          VALU FLOPs (F32):
+            value: AVG((($wave_size * (
+              SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32
+              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
+            unit: GFLOP/s
+            peak: $FP32Flops_empirical_peak
+          VALU FLOPs (F64):
+            value: AVG((($wave_size * (
+              SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64
+              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
+            unit: GFLOP/s
+            peak: $FP64Flops_empirical_peak
           MFMA FLOPs (F64):
             value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
             unit: GFLOP/s
@@ -3022,14 +3062,24 @@ panels:
             unit: GB/s
             peak: $LDSBw_empirical_peak
         gfx942:
-          VALU FLOPs:
+          VALU FLOPs (F16):
             value: AVG((($wave_size * (
-              (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
-              (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
-              (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
+              SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16
               )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
             unit: GFLOP/s
-            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
+            peak: $FP16Flops_empirical_peak
+          VALU FLOPs (F32):
+            value: AVG((($wave_size * (
+              SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32
+              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
+            unit: GFLOP/s
+            peak: $FP32Flops_empirical_peak
+          VALU FLOPs (F64):
+            value: AVG((($wave_size * (
+              SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64
+              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
+            unit: GFLOP/s
+            peak: $FP64Flops_empirical_peak
           MFMA FLOPs (F64):
             value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
             unit: GFLOP/s
@@ -3079,14 +3129,24 @@ panels:
             unit: GB/s
             peak: $LDSBw_empirical_peak
         gfx950:
-          VALU FLOPs:
+          VALU FLOPs (F16):
             value: AVG((($wave_size * (
-              (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) +
-              (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) +
-              (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64)
+              SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16
               )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
             unit: GFLOP/s
-            peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000)
+            peak: $FP16Flops_empirical_peak
+          VALU FLOPs (F32):
+            value: AVG((($wave_size * (
+              SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32
+              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
+            unit: GFLOP/s
+            peak: $FP32Flops_empirical_peak
+          VALU FLOPs (F64):
+            value: AVG((($wave_size * (
+              SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64
+              )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
+            unit: GFLOP/s
+            peak: $FP64Flops_empirical_peak
           MFMA FLOPs (F64):
             value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9)
             unit: GFLOP/s
@@ -3578,15 +3638,35 @@ panels:
               ) / 1e9
             unit: GFLOP/s
   metrics_description:
-      VALU FLOPs:
-        plain: 'The total floating-point operations executed per second on the VALU.
-          This is also presented as a percent of the peak theoretical FLOPs achievable
-          on the specific accelerator. Note: this does not include any floating-point
-          operations from MFMA instructions.'
-        rst: 'The total floating-point operations executed per second on the :ref:`VALU
-          <desc-valu>`. This is also presented as a percent of the peak theoretical
-          FLOPs achievable on the specific accelerator. Note: this does not include
-          any floating-point operations from :ref:`MFMA <desc-mfma>` instructions.'
+      VALU FLOPs (F16):
+        plain: 'The total 16-bit floating-point operations executed per second on the VALU.
+          This is presented with the value of the peak empirical F16 FLOPs achievable
+          on the specific accelerator. Note: this does not include any F16 operations
+          from MFMA instructions.'
+        rst: 'The total 16-bit floating-point operations executed per second on the :ref:`VALU
+          <desc-valu>`. This is presented with the value of the peak empirical F16 FLOPs achievable
+          on the specific accelerator. Note: this does not include any F16 operations
+          from :ref:`MFMA <desc-mfma>` instructions.'
+        unit: GFLOPs
+      VALU FLOPs (F32):
+        plain: 'The total 32-bit floating-point operations executed per second on the VALU.
+          This is presented with the value of the peak empirical F32 FLOPs achievable
+          on the specific accelerator. Note: this does not include any F32 operations
+          from MFMA instructions.'
+        rst: 'The total 32-bit floating-point operations executed per second on the :ref:`VALU
+          <desc-valu>`. This is presented with the value of the peak empirical F32 FLOPs achievable
+          on the specific accelerator. Note: this does not include any F32 operations
+          from :ref:`MFMA <desc-mfma>` instructions.'
+        unit: GFLOPs
+      VALU FLOPs (F64):
+        plain: 'The total 64-bit floating-point operations executed per second on the VALU.
+          This is presented with the value of the peak empirical F64 FLOPs achievable
+          on the specific accelerator. Note: this does not include any F64 operations
+          from MFMA instructions.'
+        rst: 'The total 64-bit floating-point operations executed per second on the :ref:`VALU
+          <desc-valu>`. This is presented with the value of the peak empirical F64 FLOPs achievable
+          on the specific accelerator. Note: this does not include any F64 operations
+          from :ref:`MFMA <desc-mfma>` instructions.'
         unit: GFLOPs
       MFMA FLOPs (F8):
         plain: The total number of 8-bit brain floating point MFMA operations executed
@@ -3646,6 +3726,19 @@ panels:
           measured F64 MFMA operations achievable on the specific accelerator is
           displayed alongside for comparison.'
         unit: GFLOPs
+      MFMA FLOPs (F6F4):
+        plain: 'The total number of 4-bit and 6-bit floating point MFMA operations executed
+          per second. Note: this does not include any floating point operations from
+          VALU instructions. The peak empirically measured F6F4 MFMA operations
+          achievable on the specific accelerator is displayed alongside for comparison.
+          It is supported on AMD Instinct MI350 series (gfx950) and later only.'
+        rst: 'The total number of 4-bit and 6-bit floating point :ref:`MFMA <desc-mfma>`
+          operations executed per second. Note: this does not include any floating point
+          operations from :ref:`VALU <desc-valu>` instructions. The peak empirically
+          measured F6F4 MFMA operations achievable on the specific accelerator is
+          displayed alongside for comparison. It is supported on AMD Instinct MI350
+          series (gfx950) and later only.'
+        unit: GFLOPs
       MFMA IOPs (Int8):
         plain: 'The total number of 8-bit integer MFMA operations executed per second.
           Note: this does not include any 8-bit integer operations from VALU instructions.