Add FP4 and FP6 to roofline for MI350 series (#709)

Add roofline bins with the FP4 and FP6 datatypes enabled for the gfx950 architecture. --------- Signed-off-by: Carrie Fallows <Carrie.Fallows@amd.com> [ROCm/rocprofiler-compute commit: cb2d928ecf]
2025-05-26 18:36:31 -04:00
commit 689746e2cd
@@ -20,7 +20,7 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.
  * Default is FP32, but user can specify as many types as desired to overlay on the same plot output

 * Additional datatypes for roofline profiling
-  * Now supports FP8, FP16, BF16, FP32, FP64, I8, I32, I64 (dependent on gpu architecture)
+  * Now supports FP4, FP6, FP8, FP16, BF16, FP32, FP64, I8, I32, I64 (dependent on gpu architecture)

 * Support host-trap PC Sampling on CLI (beta version)

@@ -390,12 +390,12 @@ Examples:
        "-R",
        "--roofline-data-type",
        required=False,
-        choices=["FP8", "FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"],
+        choices=["FP4", "FP6", "FP8", "FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"],
        metavar="",
        nargs="+",
        type=str,
        default=["FP32"],
-        help="\t\t\tChoose datatypes to view roofline PDFs for: (DEFAULT: FP32)\n\t\t\t   FP8\n\t\t\t   FP16\n\t\t\t   BF16\n\t\t\t   FP32\n\t\t\t   FP64\n\t\t\t   I8\n\t\t\t  I32\n\t\t\t I64\n\t\t\t ",
+        help="Choose datatypes to view roofline PDFs for: (DEFAULT: FP32)\n\t\t\t   FP4\n\t\t\t FP6\n\t\t\t  FP8\n\t\t\t   FP16\n\t\t\t   BF16\n\t\t\t   FP32\n\t\t\t   FP64\n\t\t\t   I8\n\t\t\t  I32\n\t\t\t I64\n\t\t\t ",
    )

    # roofline_group.add_argument('-w', '--workgroups', required=False, default=-1, type=int, help="\t\t\tNumber of kernel workgroups (DEFAULT: 1024)")
@@ -605,12 +605,12 @@ Examples:
        "-R",
        "--roofline-data-type",
        required=False,
-        choices=["FP8", "FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"],
+        choices=["FP4", "FP6", "FP8", "FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"],
        metavar="",
        nargs="+",
        type=str,
        default=["FP32"],
-        help="\t\t\tChoose datatypes to view roofline PDFs for: (DEFAULT: FP32)\n\t\t\t   FP8\n\t\t\t   FP16\n\t\t\t   BF16\n\t\t\t   FP32\n\t\t\t   FP64\n\t\t\t   I8\n\t\t\t  I32\n\t\t\t I64\n\t\t\t ",
+        help="\t\t\tChoose datatypes to view roofline PDFs for: (DEFAULT: FP32)\n\t\t\t   FP4\n\t\t\t   FP6\n\t\t\t FP8\n\t\t\t   FP16\n\t\t\t   BF16\n\t\t\t   FP32\n\t\t\t   FP64\n\t\t\t   I8\n\t\t\t  I32\n\t\t\t I64\n\t\t\t ",
    )

    analyze_advanced_group.add_argument(
@@ -337,48 +337,69 @@ class Roofline:
        # Plot Application AI
        #######################
        # Plot the arithmetic intensity points for each cache level
-        if ops_flops == "FLOP":
-            fig.add_trace(
-                go.Scatter(
-                    x=self.__ai_data["ai_l1"][0],
-                    y=self.__ai_data["ai_l1"][1],
-                    name=dtype + "_ai_l1",
-                    mode="markers",
-                    marker_symbol=(
-                        SYMBOLS if self.__run_parameters["include_kernel_names"] else None
-                    ),
-                )
-            )
-            fig.add_trace(
-                go.Scatter(
-                    x=self.__ai_data["ai_l2"][0],
-                    y=self.__ai_data["ai_l2"][1],
-                    name=dtype + "_ai_l2",
-                    mode="markers",
-                    marker_symbol=(
-                        SYMBOLS if self.__run_parameters["include_kernel_names"] else None
-                    ),
-                )
-            )
-            fig.add_trace(
-                go.Scatter(
-                    x=self.__ai_data["ai_hbm"][0],
-                    y=self.__ai_data["ai_hbm"][1],
-                    name=dtype + "_ai_hbm",
-                    mode="markers",
-                    marker_symbol=(
-                        SYMBOLS if self.__run_parameters["include_kernel_names"] else None
-                    ),
-                )
-            )

-            # Set layout
-            fig.update_layout(
-                xaxis_title="Arithmetic Intensity (FLOPs/Byte)",
-                yaxis_title="Performance (GFLOP/sec)",
-                hovermode="x unified",
-                margin=dict(l=50, r=50, b=50, t=50, pad=4),
+        # FP4 and FP6 MFMA ops are both captured through the single F6F4 perfmon event; when both are requested, plot the AI points only once (skip them for FP6)
+        skipAI = False
+        if dtype == "FP4" or dtype == "FP6":
+            if (dtype == "FP6") and (
+                "FP4" in self.__run_parameters["roofline_data_type"]
+            ):
+                skipAI = True
+            console_debug(
+                "roofline",
+                "Datatype {} is captured through the F6F4 perfmon event".format(dtype),
            )
+            dtype = "F6F4"
+
+        if ops_flops == "FLOP":
+            if not skipAI:
+                fig.add_trace(
+                    go.Scatter(
+                        x=self.__ai_data["ai_l1"][0],
+                        y=self.__ai_data["ai_l1"][1],
+                        name=dtype + "_ai_l1",
+                        mode="markers",
+                        marker_symbol=(
+                            SYMBOLS
+                            if self.__run_parameters["include_kernel_names"]
+                            else None
+                        ),
+                    )
+                )
+                fig.add_trace(
+                    go.Scatter(
+                        x=self.__ai_data["ai_l2"][0],
+                        y=self.__ai_data["ai_l2"][1],
+                        name=dtype + "_ai_l2",
+                        mode="markers",
+                        marker_symbol=(
+                            SYMBOLS
+                            if self.__run_parameters["include_kernel_names"]
+                            else None
+                        ),
+                    )
+                )
+                fig.add_trace(
+                    go.Scatter(
+                        x=self.__ai_data["ai_hbm"][0],
+                        y=self.__ai_data["ai_hbm"][1],
+                        name=dtype + "_ai_hbm",
+                        mode="markers",
+                        marker_symbol=(
+                            SYMBOLS
+                            if self.__run_parameters["include_kernel_names"]
+                            else None
+                        ),
+                    )
+                )
+
+                # Set layout
+                fig.update_layout(
+                    xaxis_title="Arithmetic Intensity (FLOPs/Byte)",
+                    yaxis_title="Performance (GFLOP/sec)",
+                    hovermode="x unified",
+                    margin=dict(l=50, r=50, b=50, t=50, pad=4),
+                )
        else:
            # Set layout
            fig.update_layout(
@@ -44,15 +44,61 @@ FONT_WEIGHT = "bold"
 # SUPPORTED_DATATYPES table is based on datatype support in rocm-amdgpu-bench repository
 # Indicates which datatypes per gpu arch can be generated by the roofline binary
 SUPPORTED_DATATYPES = {
-    "gfx90a": ["FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"],  # Unsupported: F8
-    "gfx940": ["FP8", "FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"],  # Unsupported:
-    "gfx941": ["FP8", "FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"],  # Unsupported:
-    "gfx942": ["FP8", "FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"],  # Unsupported:
-    "gfx950": ["FP8", "FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"],  # Unsupported:
+    "gfx90a": [
+        "FP16",
+        "BF16",
+        "FP32",
+        "FP64",
+        "I8",
+        "I32",
+        "I64",
+    ],  # Unsupported: F4, F6, F8
+    "gfx940": [
+        "FP8",
+        "FP16",
+        "BF16",
+        "FP32",
+        "FP64",
+        "I8",
+        "I32",
+        "I64",
+    ],  # Unsupported: F4, F6
+    "gfx941": [
+        "FP8",
+        "FP16",
+        "BF16",
+        "FP32",
+        "FP64",
+        "I8",
+        "I32",
+        "I64",
+    ],  # Unsupported: F4, F6
+    "gfx942": [
+        "FP8",
+        "FP16",
+        "BF16",
+        "FP32",
+        "FP64",
+        "I8",
+        "I32",
+        "I64",
+    ],  # Unsupported: F4, F6
+    "gfx950": [
+        "FP4",
+        "FP6",
+        "FP8",
+        "FP16",
+        "BF16",
+        "FP32",
+        "FP64",
+        "I8",
+        "I32",
+        "I64",
+    ],  # Unsupported:
 }

 PEAK_OPS_DATATYPES = ["FP8", "FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"]
-MFMA_DATATYPES = ["FP8", "FP16", "BF16", "FP32", "FP64", "I8"]
+MFMA_DATATYPES = ["FP4", "FP6", "FP8", "FP16", "BF16", "FP32", "FP64", "I8"]

 TOP_N = 10

@@ -67,6 +113,7 @@ class AI_Data:

    total_flops: float
    valu_flops: float
+    mfma_flops_f6f4: float
    mfma_flops_f8: float
    mfma_flops_f16: float
    mfma_flops_bf16: float
@@ -212,11 +259,11 @@ def calc_ai(mspec, sort_type, ret_df):
    df = df.sort_values(by=["Kernel_Name"])
    df = df.reset_index(drop=True)

-    total_flops = valu_flops = mfma_flops_f8 = mfma_flops_bf16 = mfma_flops_f16 = (
-        mfma_iops_i8
-    ) = mfma_flops_f32 = mfma_flops_f64 = lds_data = L1cache_data = L2cache_data = (
-        hbm_data
-    ) = calls = totalDuration = avgDuration = 0.0
+    total_flops = valu_flops = mfma_flops_f6f4 = mfma_flops_f8 = mfma_flops_bf16 = (
+        mfma_flops_f16
+    ) = mfma_iops_i8 = mfma_flops_f32 = mfma_flops_f64 = lds_data = L1cache_data = (
+        L2cache_data
+    ) = hbm_data = calls = totalDuration = avgDuration = 0.0

    kernelName = ""

@@ -273,6 +320,8 @@ def calc_ai(mspec, sort_type, ret_df):
            )
            if "FP8" in supported_dt:
                total_flops += df["SQ_INSTS_VALU_MFMA_MOPS_F8"][idx] * 512
+            if ("FP4" in supported_dt) or ("FP6" in supported_dt):
+                total_flops += df["SQ_INSTS_VALU_MFMA_MOPS_F6F4"][idx] * 512
        except KeyError:
            console_debug(
                "roofline",
@@ -313,6 +362,8 @@ def calc_ai(mspec, sort_type, ret_df):
        try:
            if "FP8" in supported_dt:
                mfma_flops_f8 += df["SQ_INSTS_VALU_MFMA_MOPS_F8"][idx] * 512
+            if ("FP4" in supported_dt) or ("FP6" in supported_dt):
+                mfma_flops_f6f4 += df["SQ_INSTS_VALU_MFMA_MOPS_F6F4"][idx] * 512
            mfma_flops_f16 += df["SQ_INSTS_VALU_MFMA_MOPS_F16"][idx] * 512
            mfma_flops_bf16 += df["SQ_INSTS_VALU_MFMA_MOPS_BF16"][idx] * 512
            mfma_flops_f32 += df["SQ_INSTS_VALU_MFMA_MOPS_F32"][idx] * 512
@@ -413,6 +464,7 @@ def calc_ai(mspec, sort_type, ret_df):
                    calls,
                    total_flops / calls,
                    valu_flops / calls,
+                    mfma_flops_f6f4 / calls,
                    mfma_flops_f8 / calls,
                    mfma_flops_f16 / calls,
                    mfma_flops_bf16 / calls,
@@ -432,11 +484,13 @@ def calc_ai(mspec, sort_type, ret_df):
                    kernelName, idx, calls
                )
            )
-            total_flops = valu_flops = mfma_flops_f8 = mfma_flops_bf16 = (
-                mfma_flops_f16
-            ) = mfma_iops_i8 = mfma_flops_f32 = mfma_flops_f64 = lds_data = (
-                L1cache_data
-            ) = L2cache_data = hbm_data = calls = totalDuration = avgDuration = 0.0
+            total_flops = valu_flops = mfma_flops_f6f4 = mfma_flops_f8 = (
+                mfma_flops_bf16
+            ) = mfma_flops_f16 = mfma_iops_i8 = mfma_flops_f32 = mfma_flops_f64 = (
+                lds_data
+            ) = L1cache_data = L2cache_data = hbm_data = calls = totalDuration = (
+                avgDuration
+            ) = 0.0

        if sort_type == "dispatches":
            myList.append(
@@ -445,6 +499,7 @@ def calc_ai(mspec, sort_type, ret_df):
                    calls,
                    total_flops,
                    valu_flops,
+                    mfma_flops_f6f4,
                    mfma_flops_f8,
                    mfma_flops_f16,
                    mfma_flops_bf16,
@@ -459,11 +514,13 @@ def calc_ai(mspec, sort_type, ret_df):
                    avgDuration,
                )
            )
-            total_flops = valu_flops = mfma_flops_f8 = mfma_flops_bf16 = (
-                mfma_flops_f16
-            ) = mfma_iops_i8 = mfma_flops_f32 = mfma_flops_f64 = lds_data = (
-                L1cache_data
-            ) = L2cache_data = hbm_data = calls = totalDuration = avgDuration = 0.0
+            total_flops = valu_flops = mfma_flops_f6f4 = mfma_flops_f8 = (
+                mfma_flops_bf16
+            ) = mfma_flops_f16 = mfma_iops_i8 = mfma_flops_f32 = mfma_flops_f64 = (
+                lds_data
+            ) = L1cache_data = L2cache_data = hbm_data = calls = totalDuration = (
+                avgDuration
+            ) = 0.0

    myList.sort(key=lambda x: x.totalDuration, reverse=True)