Merge pull request #99 from AMDResearch/dev

Updates for v1.0.8-PR1
2023-03-13 15:35:45 -05:00
@@ -1,12 +1,12 @@
 ## How to fork from us

-To keep our development fast and conflict free, we recommend you to [fork](https://github.com/AMDResearch/omniperf/fork) our repository and start your work from our `develop` branch in your private repository.
+To keep our development fast and conflict-free, we recommend that you [fork](https://github.com/AMDResearch/omniperf/fork) our repository and start your work from our `dev` branch in your private repository.

 Afterwards, git clone your repository to your local machine. But that is not all! To keep track of the original development repository, add it as another remote.

 ```
 git remote add mainline https://github.com/AMDResearch/omniperf.git
-git checkout develop
+git checkout dev
 ```

 As always in git, start a new branch with
@@ -31,7 +31,7 @@ and apply your changes there.

 - Ensure the PR description clearly describes the problem and solution. If there is an existing GitHub issue open describing this bug, please include it in the description so we can close it.

- Ensure the PR is based on the `develop` branch of the Omniperf GitHub repository.
+- Ensure the PR is based on the `dev` branch of the Omniperf GitHub repository.

 - Omniperf requires new commits to include a "Signed-off-by" token in the commit message (typically enabled via the `git commit -s` option), indicating your agreement to the project's [Developer's Certificate of Origin](https://developercertificate.org/) and compatibility with the project [LICENSE](https://github.com/AMDResearch/omniperf/blob/main/LICENSE):

@@ -715,7 +715,7 @@ def main():
            # Set up prerequisites for roofline
            roof_setup(args, my_parser, VER)
            # Generate roofline
-            roofline_only(args.path, args.device, args.sort, args.mem_level, args.verbose)
+            roofline_only(args.path, args.device, args.sort, args.mem_level, args.kernel_names, args.verbose)

        # Profile only
        else:
@@ -212,7 +212,7 @@ def run_cli(args, runs):
        )


-def roofline_only(path_to_dir, dev_id, sort_type, mem_level, verbose):
+def roofline_only(path_to_dir, dev_id, sort_type, mem_level, kernel_names, verbose):
    import pandas as pd
    from collections import OrderedDict

@@ -235,6 +235,7 @@ def roofline_only(path_to_dir, dev_id, sort_type, mem_level, verbose):
        dev_id,  # [Optional] Specify device id to collect roofline info from
        sort_type,  # [Optional] Sort AI by top kernels or dispatches
        mem_level,  # [Optional] Toggle particular level(s) of memory hierarchy
+        kernel_names,  # [Optional] Toggle overlay of kernel names in plot
        True,  # [Optional] Generate a standalone roofline analysis
    )

@@ -104,7 +104,7 @@ def get_header(raw_pmc, input_filters, kernel_names):
                                            dbc.DropdownMenuItem("Cache", header=True),
                                            dbc.DropdownMenuItem(
                                                "Local Data Share (LDS)",
-                                                href="#local_data_sharelds",
+                                                href="#local_data_share_lds",
                                                external_link=True,
                                            ),
                                            dbc.DropdownMenuItem(
@@ -25,6 +25,7 @@
 from omniperf_analyze.utils import roofline_calc

 import time
+import sys
 import numpy as np
 from dash import html, dash_table

@@ -32,6 +33,9 @@ from dash import dcc
 import plotly.graph_objects as go


+SYMBOLS = [0, 1, 2, 3, 4, 5, 13, 17, 18, 20]
+
+
 def to_int(a):
    if str(type(a)) == "<class 'NoneType'>":
        return np.nan
@@ -39,7 +43,9 @@ def to_int(a):
        return int(a)


-def generate_plots(roof_info, ai_data, mem_level, is_standalone, verbose, fig=None):
+def generate_plots(
+    roof_info, ai_data, mem_level, is_standalone, kernel_names, verbose, fig=None
+):
    if fig is None:
        fig = go.Figure()
    plotMode = "lines+text" if is_standalone else "lines"
@@ -120,6 +126,8 @@ def generate_plots(roof_info, ai_data, mem_level, is_standalone, verbose, fig=No
                y=ai_data["ai_l1"][1],
                name="ai_l1",
                mode="markers",
+                marker={"color": "#00CC96"},
+                marker_symbol=SYMBOLS if kernel_names else None,
            )
        )
        fig.add_trace(
@@ -128,6 +136,8 @@ def generate_plots(roof_info, ai_data, mem_level, is_standalone, verbose, fig=No
                y=ai_data["ai_l2"][1],
                name="ai_l2",
                mode="markers",
+                marker={"color": "#EF553B"},
+                marker_symbol=SYMBOLS if kernel_names else None,
            )
        )
        fig.add_trace(
@@ -136,6 +146,8 @@ def generate_plots(roof_info, ai_data, mem_level, is_standalone, verbose, fig=No
                y=ai_data["ai_hbm"][1],
                name="ai_hbm",
                mode="markers",
+                marker={"color": "#636EFA"},
+                marker_symbol=SYMBOLS if kernel_names else None,
            )
        )

@@ -158,8 +170,13 @@ def get_roofline(
    dev_id=None,
    sort_type="kernels",
    mem_level="ALL",
+    kernel_names=False,
    is_standalone=False,
 ):
+    if kernel_names and (not is_standalone):
+        print("ERROR: --roof-only is required for --kernel-names")
+        sys.exit(1)
+
    # Roofline settings
    fp32_details = {
        "path": path_to_dir,
@@ -185,11 +202,33 @@ def get_roofline(
            print(i, "->", ai_data[i])
        print("\n")

-    fp32_fig = generate_plots(fp32_details, ai_data, mem_level, is_standalone, verbose)
-    fp16_fig = generate_plots(fp16_details, ai_data, mem_level, is_standalone, verbose)
-    ml_combo_fig = generate_plots(
-        int8_details, ai_data, mem_level, is_standalone, verbose, fp16_fig
+    fp32_fig = generate_plots(
+        fp32_details, ai_data, mem_level, is_standalone, kernel_names, verbose
    )
+    fp16_fig = generate_plots(
+        fp16_details, ai_data, mem_level, is_standalone, kernel_names, verbose
+    )
+    ml_combo_fig = generate_plots(
+        int8_details, ai_data, mem_level, is_standalone, kernel_names, verbose, fp16_fig
+    )
+    legend = go.Figure(
+        go.Scatter(
+            mode="markers",
+            x=[0] * 10,
+            y=ai_data["kernelNames"],
+            marker_symbol=SYMBOLS,
+            marker_size=15,
+        )
+    )
+    legend.update_layout(
+        title="Kernel Names and Markers",
+        margin=dict(b=0, r=0),
+        xaxis_range=[-1, 1],
+        xaxis_side="top",
+        height=400,
+        width=1000,
+    )
+    legend.update_xaxes(dtick=1)

    if is_standalone:
        dev_id = "ALL" if dev_id == -1 else str(dev_id)
@@ -198,12 +237,17 @@ def get_roofline(
        ml_combo_fig.write_image(
            path_to_dir + "/empirRoof_gpu-{}_fp8_fp16.pdf".format(dev_id)
        )
+        if kernel_names:
+            # only save a legend if kernel_names option is toggled
+            legend.write_image(path_to_dir + "/kernelName_legend.pdf")
        time.sleep(1)
        # Re-save to remove loading MathJax pop up
        fp32_fig.write_image(path_to_dir + "/empirRoof_gpu-{}_fp32.pdf".format(dev_id))
        ml_combo_fig.write_image(
            path_to_dir + "/empirRoof_gpu-{}_fp8_fp16.pdf".format(dev_id)
        )
+        if kernel_names:
+            legend.write_image(path_to_dir + "/kernelName_legend.pdf")
        print("Empirical Roofline PDFs saved!")
    else:
        return html.Section(
@@ -27,7 +27,6 @@ import sys
 import astunparse
 import re
 import os
-from matplotlib.pyplot import axis
 import pandas as pd
 import numpy as np
 from tabulate import tabulate
@@ -44,6 +44,8 @@ FONT_WEIGHT = "bold"

 SUPPORTED_SOC = ["mi200"]

+TOP_N = 10
+

 ################################################
 # Helper funcs
@@ -208,17 +210,146 @@ def plot_application(sortType, ret_df, verbose):
    kernelName = ""

    myList = []
-    for index, row in df.iterrows():
+    at_end = False
+    next_kernelName = ""
+
+    for idx in df.index:
        # CASE: Top kernels
        # Calculate + append AI data if
        # a) current KernelName is different than previous OR
        # b) We've reached the end of list
-        if sortType == "kernels" and (
-            (row["KernelName"] != kernelName and kernelName != "")
-            or index == df.shape[0] - 1
-        ):
-            if df.shape[0] - 1 == index:
-                calls += 1
+        if idx + 1 == df.shape[0]:
+            at_end = True
+        else:
+            next_kernelName = df["KernelName"][idx + 1]
+
+        kernelName = df["KernelName"][idx]
+        try:
+            total_flops += (
+                (
+                    64
+                    * (
+                        df["SQ_INSTS_VALU_ADD_F16"][idx]
+                        + df["SQ_INSTS_VALU_MUL_F16"][idx]
+                        + (2 * df["SQ_INSTS_VALU_FMA_F16"][idx])
+                        + df["SQ_INSTS_VALU_TRANS_F16"][idx]
+                    )
+                )
+                + (
+                    64
+                    * (
+                        df["SQ_INSTS_VALU_ADD_F32"][idx]
+                        + df["SQ_INSTS_VALU_MUL_F32"][idx]
+                        + (2 * df["SQ_INSTS_VALU_FMA_F32"][idx])
+                        + df["SQ_INSTS_VALU_TRANS_F32"][idx]
+                    )
+                )
+                + (
+                    64
+                    * (
+                        df["SQ_INSTS_VALU_ADD_F64"][idx]
+                        + df["SQ_INSTS_VALU_MUL_F64"][idx]
+                        + (2 * df["SQ_INSTS_VALU_FMA_F64"][idx])
+                        + df["SQ_INSTS_VALU_TRANS_F64"][idx]
+                    )
+                )
+                + (df["SQ_INSTS_VALU_MFMA_MOPS_F16"][idx] * 512)
+                + (df["SQ_INSTS_VALU_MFMA_MOPS_BF16"][idx] * 512)
+                + (df["SQ_INSTS_VALU_MFMA_MOPS_F32"][idx] * 512)
+                + (df["SQ_INSTS_VALU_MFMA_MOPS_F64"][idx] * 512)
+            )
+        except KeyError:
+            if verbose >= 3:
+                print("{}: Skipped total_flops at index {}".format(kernelName[:35], idx))
+            pass
+        try:
+            valu_flops += (
+                64
+                * (
+                    df["SQ_INSTS_VALU_ADD_F16"][idx]
+                    + df["SQ_INSTS_VALU_MUL_F16"][idx]
+                    + (2 * df["SQ_INSTS_VALU_FMA_F16"][idx])
+                    + df["SQ_INSTS_VALU_TRANS_F16"][idx]
+                )
+                + 64
+                * (
+                    df["SQ_INSTS_VALU_ADD_F32"][idx]
+                    + df["SQ_INSTS_VALU_MUL_F32"][idx]
+                    + (2 * df["SQ_INSTS_VALU_FMA_F32"][idx])
+                    + df["SQ_INSTS_VALU_TRANS_F32"][idx]
+                )
+                + 64
+                * (
+                    df["SQ_INSTS_VALU_ADD_F64"][idx]
+                    + df["SQ_INSTS_VALU_MUL_F64"][idx]
+                    + (2 * df["SQ_INSTS_VALU_FMA_F64"][idx])
+                    + df["SQ_INSTS_VALU_TRANS_F64"][idx]
+                )
+            )
+        except KeyError:
+            if verbose >= 3:
+                print("{}: Skipped valu_flops at index {}".format(kernelName[:35], idx))
+            pass
+
+        try:
+            mfma_flops_f16 += df["SQ_INSTS_VALU_MFMA_MOPS_F16"][idx] * 512
+            mfma_flops_bf16 += df["SQ_INSTS_VALU_MFMA_MOPS_BF16"][idx] * 512
+            mfma_flops_f32 += df["SQ_INSTS_VALU_MFMA_MOPS_F32"][idx] * 512
+            mfma_flops_f64 += df["SQ_INSTS_VALU_MFMA_MOPS_F64"][idx] * 512
+            mfma_iops_i8 += df["SQ_INSTS_VALU_MFMA_MOPS_I8"][idx] * 512
+        except KeyError:
+            if verbose >= 3:
+                print("{}: Skipped mfma ops at index {}".format(kernelName[:35], idx))
+            pass
+
+        try:
+            lds_data += (
+                (df["SQ_LDS_IDX_ACTIVE"][idx] - df["SQ_LDS_BANK_CONFLICT"][idx])
+                * 4
+                * L2_BANKS
+            )  # L2_BANKS = 32 (since assuming mi200)
+        except KeyError:
+            if verbose >= 3:
+                print("{}: Skipped lds_data at index {}".format(kernelName[:35], idx))
+            pass
+
+        try:
+            L1cache_data += df["TCP_TOTAL_CACHE_ACCESSES_sum"][idx] * 64
+        except KeyError:
+            if verbose >= 3:
+                print("{}: Skipped L1cache_data at index {}".format(kernelName[:35], idx))
+            pass
+
+        try:
+            L2cache_data += (
+                df["TCP_TCC_WRITE_REQ_sum"][idx] * 64
+                + df["TCP_TCC_ATOMIC_WITH_RET_REQ_sum"][idx] * 64
+                + df["TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum"][idx] * 64
+                + df["TCP_TCC_READ_REQ_sum"][idx] * 64
+            )
+        except KeyError:
+            if verbose >= 3:
+                print("{}: Skipped L2cache_data at index {}".format(kernelName[:35], idx))
+            pass
+        try:
+            hbm_data += (
+                (df["TCC_EA_RDREQ_32B_sum"][idx] * 32)
+                + ((df["TCC_EA_RDREQ_sum"][idx] - df["TCC_EA_RDREQ_32B_sum"][idx]) * 64)
+                + (df["TCC_EA_WRREQ_64B_sum"][idx] * 64)
+                + ((df["TCC_EA_WRREQ_sum"][idx] - df["TCC_EA_WRREQ_64B_sum"][idx]) * 32)
+            )
+        except KeyError:
+            if verbose >= 3:
+                print("{}: Skipped hbm_data at index {}".format(kernelName[:35], idx))
+            pass
+
+        totalDuration += df["EndNs"][idx] - df["BeginNs"][idx]
+
+        avgDuration += df["EndNs"][idx] - df["BeginNs"][idx]
+
+        calls += 1
+
+        if sortType == "kernels" and (at_end == True or (kernelName != next_kernelName)):
            myList.append(
                AI_Data(
                    kernelName,
@@ -241,7 +372,7 @@ def plot_application(sortType, ret_df, verbose):
            if verbose >= 2:
                print(
                    "Just added {} to AI_Data at index {}. # of calls: {}".format(
-                        kernelName, index, calls
+                        kernelName, idx, calls
                    )
                )
            total_flops = (
@@ -262,129 +393,6 @@ def plot_application(sortType, ret_df, verbose):
                L1cache_data
            ) = L2cache_data = hbm_data = calls = totalDuration = avgDuration = 0.0

-        kernelName = row["KernelName"]
-        try:
-            total_flops += (
-                (
-                    64
-                    * (
-                        row["SQ_INSTS_VALU_ADD_F16"]
-                        + row["SQ_INSTS_VALU_MUL_F16"]
-                        + (2 * row["SQ_INSTS_VALU_FMA_F16"])
-                        + row["SQ_INSTS_VALU_TRANS_F16"]
-                    )
-                )
-                + (
-                    64
-                    * (
-                        row["SQ_INSTS_VALU_ADD_F32"]
-                        + row["SQ_INSTS_VALU_MUL_F32"]
-                        + (2 * row["SQ_INSTS_VALU_FMA_F32"])
-                        + row["SQ_INSTS_VALU_TRANS_F32"]
-                    )
-                )
-                + (
-                    64
-                    * (
-                        row["SQ_INSTS_VALU_ADD_F64"]
-                        + row["SQ_INSTS_VALU_MUL_F64"]
-                        + (2 * row["SQ_INSTS_VALU_FMA_F64"])
-                        + row["SQ_INSTS_VALU_TRANS_F64"]
-                    )
-                )
-                + (row["SQ_INSTS_VALU_MFMA_MOPS_F16"] * 512)
-                + (row["SQ_INSTS_VALU_MFMA_MOPS_BF16"] * 512)
-                + (row["SQ_INSTS_VALU_MFMA_MOPS_F32"] * 512)
-                + (row["SQ_INSTS_VALU_MFMA_MOPS_F64"] * 512)
-            )
-        except KeyError:
-            if verbose >= 2:
-                print("Skipped total_flops at index {}".format(index))
-            pass
-        try:
-            valu_flops += (
-                64
-                * (
-                    row["SQ_INSTS_VALU_ADD_F16"]
-                    + row["SQ_INSTS_VALU_MUL_F16"]
-                    + (2 * row["SQ_INSTS_VALU_FMA_F16"])
-                    + row["SQ_INSTS_VALU_TRANS_F16"]
-                )
-                + 64
-                * (
-                    row["SQ_INSTS_VALU_ADD_F32"]
-                    + row["SQ_INSTS_VALU_MUL_F32"]
-                    + (2 * row["SQ_INSTS_VALU_FMA_F32"])
-                    + row["SQ_INSTS_VALU_TRANS_F32"]
-                )
-                + 64
-                * (
-                    row["SQ_INSTS_VALU_ADD_F64"]
-                    + row["SQ_INSTS_VALU_MUL_F64"]
-                    + (2 * row["SQ_INSTS_VALU_FMA_F64"])
-                    + row["SQ_INSTS_VALU_TRANS_F64"]
-                )
-            )
-        except KeyError:
-            if verbose >= 2:
-                print("Skipped valu_flops at index {}".format(index))
-            pass
-
-        try:
-            mfma_flops_f16 += row["SQ_INSTS_VALU_MFMA_MOPS_F16"] * 512
-            mfma_flops_bf16 += row["SQ_INSTS_VALU_MFMA_MOPS_BF16"] * 512
-            mfma_flops_f32 += row["SQ_INSTS_VALU_MFMA_MOPS_F32"] * 512
-            mfma_flops_f64 += row["SQ_INSTS_VALU_MFMA_MOPS_F64"] * 512
-            mfma_iops_i8 += row["SQ_INSTS_VALU_MFMA_MOPS_I8"] * 512
-        except KeyError:
-            if verbose >= 2:
-                print("Skipped mfma ops at index {}".format(index))
-            pass
-
-        try:
-            lds_data += (
-                (row["SQ_LDS_IDX_ACTIVE"] - row["SQ_LDS_BANK_CONFLICT"]) * 4 * L2_BANKS
-            )  # L2_BANKS = 32 (since assuming mi200)
-        except KeyError:
-            if verbose >= 2:
-                print("Skipped lds_data at index {}".format(index))
-            pass
-
-        try:
-            L1cache_data += row["TCP_TOTAL_CACHE_ACCESSES_sum"] * 64
-        except KeyError:
-            if verbose >= 2:
-                print("Skipped L1cache_data at index {}".format(index))
-            pass
-
-        try:
-            L2cache_data += (
-                row["TCP_TCC_WRITE_REQ_sum"] * 64
-                + row["TCP_TCC_ATOMIC_WITH_RET_REQ_sum"] * 64
-                + row["TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum"] * 64
-                + row["TCP_TCC_READ_REQ_sum"] * 64
-            )
-        except KeyError:
-            if verbose >= 2:
-                print("Skipped L2cache_data at index {}".format(index))
-            pass
-        try:
-            hbm_data += (
-                (row["TCC_EA_RDREQ_32B_sum"] * 32)
-                + ((row["TCC_EA_RDREQ_sum"] - row["TCC_EA_RDREQ_32B_sum"]) * 64)
-                + (row["TCC_EA_WRREQ_64B_sum"] * 64)
-                + ((row["TCC_EA_WRREQ_sum"] - row["TCC_EA_WRREQ_64B_sum"]) * 32)
-            )
-        except KeyError:
-            if verbose >= 2:
-                print("Skipped hbm_data at index {}".format(index))
-            pass
-
-        totalDuration += row["EndNs"] - row["BeginNs"]
-
-        avgDuration += row["EndNs"] - row["BeginNs"]
-
-        calls += 1
        if sortType == "dispatches":
            myList.append(
                AI_Data(
@@ -428,9 +436,11 @@ def plot_application(sortType, ret_df, verbose):
    # print("Top 5 intensities ('{}')...".format(roof_details["sort"]))
    intensities = {"ai_l1": [], "ai_l2": [], "ai_hbm": []}
    curr_perf = []
+    kernelNames = []
    i = 0
    # Create list of top 10 intensities
-    while i <= 9 and i != len(myList):
+    while i < TOP_N and i != len(myList):
+        kernelNames.append(myList[i].KernelName)
        intensities["ai_l1"].append(
            myList[i].total_flops / myList[i].L1cache_data
        ) if myList[i].L1cache_data else intensities["ai_l1"].append(0)
@@ -470,6 +480,9 @@ def plot_application(sortType, ret_df, verbose):
        intensityPoints[i].append(x)
        intensityPoints[i].append(y)

+    # Add an entry for kernel names
+    intensityPoints["kernelNames"] = kernelNames
+
    return intensityPoints


@@ -234,6 +234,13 @@ def parse(my_parser):
        type=int,
        help="\t\t\tGPU device ID. (DEFAULT: ALL)",
    )
+    roofline_group.add_argument(
+        "--kernel-names",
+        required=False,
+        default=False,
+        action="store_true",
+        help="\t\t\tInclude kernel names in roofline plot.",
+    )
    # roofline_group.add_argument('-w', '--workgroups', required=False, default=-1, type=int, help="\t\t\tNumber of kernel workgroups (DEFAULT: 1024)")
    # roofline_group.add_argument('--wsize', required=False, default=-1, type=int, help="\t\t\tWorkgroup size (DEFAULT: 256)")
    # roofline_group.add_argument('--dataset', required=False, default = -1, type=int, help="\t\t\tDataset size (DEFAULT: 536M)")