Re-enable standalone roof options. Everything working.

Signed-off-by: coleramos425 <colramos@amd.com>
2023-01-26 14:07:10 -06:00
@@ -649,7 +649,7 @@ def main():
            # Setup prerequisits for roofline
            roof_setup(args, my_parser)
            # Generate roofline
-            roofline_only(args.path, args.verbose, args.device)
+            roofline_only(args.path, args.device, args.sort, args.mem_level, args.verbose)

        # Profile only
        else:
@@ -210,10 +210,15 @@ def run_cli(args, runs):
        )


-def roofline_only(path_to_dir, verbose, dev_id):
+def roofline_only(path_to_dir, dev_id, sort_type, mem_level, verbose):
    import pandas as pd
    from collections import OrderedDict

+    # Change vL1D to an interpretable str, if required
+    if "vL1D" in mem_level:
+        mem_level.remove("vL1D")
+        mem_level.append("L1")
+
    app_path = path_to_dir + "/pmc_perf.csv"
    roofline_exists = os.path.isfile(app_path)
    if not roofline_exists:
@@ -221,7 +226,15 @@ def roofline_only(path_to_dir, verbose, dev_id):
        sys.exit(0)
    t_df = OrderedDict()
    t_df["pmc_perf"] = pd.read_csv(app_path)
-    get_roofline(path_to_dir, t_df, dev_id, verbose, True)
+    get_roofline(
+        path_to_dir,
+        t_df,
+        verbose,
+        dev_id,  # [Optional] Specify device id to collect roofline info from
+        sort_type,  # [Optional] Sort AI by top kernels or dispatches
+        mem_level,  # [Optional] Toggle particular level(s) of memory hierarchy
+        True,  # [Optional] Generate a standalone roofline analysis
+    )


 def analyze(args):
@@ -22,6 +22,7 @@

 from omniperf_analyze.utils import roofline_calc

+import time
 import numpy as np
 from dash import html, dash_table

@@ -35,72 +36,40 @@ def to_int(a):
    else:
        return int(a)

-def generate_plots(roof_info, ai_data, isStandalone, verbose, fig=None):
+
+def generate_plots(roof_info, ai_data, mem_level, is_standalone, verbose, fig=None):
    if fig is None:
        fig = go.Figure()
-    plotMode = "lines+text" if isStandalone else "lines"
-    line_data = roofline_calc.empirical_roof(roof_info, verbose)
+    plotMode = "lines+text" if is_standalone else "lines"
+    line_data = roofline_calc.empirical_roof(roof_info, mem_level, verbose)
    print("Line data:\n", line_data)

    #######################
    # Plot BW Lines
    #######################
-    fig.add_trace(
-        go.Scatter(
-            x=line_data["hbm"][0],
-            y=line_data["hbm"][1],
-            name="HBM-{}".format(roof_info["dtype"]),
-            mode=plotMode,
-            hovertemplate="<b>%{text}</b>",
-            text=[
-                "{} GB/s".format(to_int(line_data["hbm"][2])),
-                None if isStandalone else "{} GB/s".format(to_int(line_data["hbm"][2]))
-            ],
-            textposition="top right",
+    if mem_level == "ALL":
+        cacheHierarchy = ["HBM", "L2", "L1", "LDS"]
+    else:
+        cacheHierarchy = mem_level
+
+    for cacheLevel in cacheHierarchy:
+        fig.add_trace(
+            go.Scatter(
+                x=line_data[cacheLevel.lower()][0],
+                y=line_data[cacheLevel.lower()][1],
+                name="{}-{}".format(cacheLevel, roof_info["dtype"]),
+                mode=plotMode,
+                hovertemplate="<b>%{text}</b>",
+                text=[
+                    "{} GB/s".format(to_int(line_data[cacheLevel.lower()][2])),
+                    None
+                    if is_standalone
+                    else "{} GB/s".format(to_int(line_data[cacheLevel.lower()][2])),
+                ],
+                textposition="top right",
+            )
        )
-    )
-    fig.add_trace(
-        go.Scatter(
-            x=line_data["l2"][0],
-            y=line_data["l2"][1],
-            name="L2-{}".format(roof_info["dtype"]),
-            mode=plotMode,
-            hovertemplate="<b>%{text}</b>",
-            text=[
-                "{} GB/s".format(to_int(line_data["l2"][2])),
-                None if isStandalone else "{} GB/s".format(to_int(line_data["l2"][2]))
-            ],
-            textposition="top right",
-        )
-    )
-    fig.add_trace(
-        go.Scatter(
-            x=line_data["l1"][0],
-            y=line_data["l1"][1],
-            name="L1-{}".format(roof_info["dtype"]),
-            mode=plotMode,
-            hovertemplate="<b>%{text}</b>",
-            text=[
-                "{} GB/s".format(to_int(line_data["l1"][2])),
-                None if isStandalone else "{} GB/s".format(to_int(line_data["l1"][2]))
-            ],
-            textposition="top right",
-        )
-    )
-    fig.add_trace(
-        go.Scatter(
-            x=line_data["lds"][0],
-            y=line_data["lds"][1],
-            name="LDS-{}".format(roof_info["dtype"]),
-            mode=plotMode,
-            hovertemplate="<b>%{text}</b>",
-            text=[
-                "{} GB/s".format(to_int(line_data["lds"][2])),
-                None if isStandalone else "{} GB/s".format(to_int(line_data["lds"][2]))
-            ],
-            textposition="top right",
-        )
-    )
+
    if roof_info["dtype"] != "FP16" and roof_info["dtype"] != "I8":
        fig.add_trace(
            go.Scatter(
@@ -110,7 +79,9 @@ def generate_plots(roof_info, ai_data, isStandalone, verbose, fig=None):
                mode=plotMode,
                hovertemplate="<b>%{text}</b>",
                text=[
-                    None if isStandalone else "{} GFLOP/s".format(to_int(line_data["valu"][2])),
+                    None
+                    if is_standalone
+                    else "{} GFLOP/s".format(to_int(line_data["valu"][2])),
                    "{} GFLOP/s".format(to_int(line_data["valu"][2])),
                ],
                textposition="top left",
@@ -129,7 +100,9 @@ def generate_plots(roof_info, ai_data, isStandalone, verbose, fig=None):
            mode=plotMode,
            hovertemplate="<b>%{text}</b>",
            text=[
-                None if isStandalone else "{} GFLOP/s".format(to_int(line_data["mfma"][2])),
+                None
+                if is_standalone
+                else "{} GFLOP/s".format(to_int(line_data["mfma"][2])),
                "{} GFLOP/s".format(to_int(line_data["mfma"][2])),
            ],
            textposition=pos,
@@ -176,26 +149,33 @@ def generate_plots(roof_info, ai_data, isStandalone, verbose, fig=None):
    return fig


-def get_roofline(path_to_dir, ret_df, verbose, dev_id=None, isStandalone=False):
+def get_roofline(
+    path_to_dir,
+    ret_df,
+    verbose,
+    dev_id=None,
+    sort_type="kernels",
+    mem_level="ALL",
+    is_standalone=False,
+):
    # Roofline settings
-    # TODO: Make "sort" attribute dynamic so user can select desired sort
    fp32_details = {
        "path": path_to_dir,
-        "sort": "kernels",
+        "sort": sort_type,
        "device": 0,
        "dtype": "FP32",
    }
    fp16_details = {
        "path": path_to_dir,
-        "sort": "kernels",
+        "sort": sort_type,
        "device": 0,
        "dtype": "FP16",
    }
-    int8_details = {"path": path_to_dir, "sort": "kernels", "device": 0, "dtype": "I8"}
+    int8_details = {"path": path_to_dir, "sort": sort_type, "device": 0, "dtype": "I8"}

    # Generate roofline plots
    print("Path: ", path_to_dir)
-    ai_data = roofline_calc.plot_application("kernels", ret_df, verbose)
+    ai_data = roofline_calc.plot_application(sort_type, ret_df, verbose)
    if verbose >= 1:
        # print AI data for each mem level
        print("AI at each mem level")
@@ -203,17 +183,26 @@ def get_roofline(path_to_dir, ret_df, verbose, dev_id=None, isStandalone=False):
            print(i, "->", ai_data[i])
        print("\n")

-    fp32_fig = generate_plots(fp32_details, ai_data, isStandalone, verbose)
-    fp16_fig = generate_plots(fp16_details, ai_data, isStandalone, verbose)
-    ml_combo_fig = generate_plots(int8_details, ai_data, isStandalone, verbose, fp16_fig)
+    fp32_fig = generate_plots(fp32_details, ai_data, mem_level, is_standalone, verbose)
+    fp16_fig = generate_plots(fp16_details, ai_data, mem_level, is_standalone, verbose)
+    ml_combo_fig = generate_plots(
+        int8_details, ai_data, mem_level, is_standalone, verbose, fp16_fig
+    )

-    if isStandalone:
+    if is_standalone:
        dev_id = "ALL" if dev_id == -1 else str(dev_id)

        fp32_fig.write_image(path_to_dir + "/empirRoof_gpu-{}_fp32.pdf".format(dev_id))
        ml_combo_fig.write_image(
            path_to_dir + "/empirRoof_gpu-{}_fp8_fp16.pdf".format(dev_id)
        )
+        time.sleep(1)
+        # Re-save to remove loading MathJax pop up
+        fp32_fig.write_image(path_to_dir + "/empirRoof_gpu-{}_fp32.pdf".format(dev_id))
+        ml_combo_fig.write_image(
+            path_to_dir + "/empirRoof_gpu-{}_fp8_fp16.pdf".format(dev_id)
+        )
+        print("Empirical Roofline PDFs saved!")
    else:
        return html.Section(
            id="roofline",
@@ -56,6 +56,7 @@ class AI_Data:
    mfma_flops_bf16: float
    mfma_flops_f32: float
    mfma_flops_f64: float
+    mfma_iops_i8: float
    lds_data: float
    L1cache_data: float
    L2cache_data: float
@@ -88,11 +89,14 @@ def get_color(catagory):
 # -------------------------------------------------------------------------------------
 #                           Plot BW at each cache level
 # -------------------------------------------------------------------------------------
-def plot_roof(roof_details, roof_data, verbose):
+def plot_roof(roof_details, roof_data, mem_level, verbose):
    # TODO: This is where filtering by memory level will need to occur for standalone
    graphPoints = {"hbm": [], "l2": [], "l1": [], "lds": [], "valu": [], "mfma": []}

-    cacheHierarchy = ["HBM", "L2", "L1", "LDS"]
+    if mem_level == "ALL":
+        cacheHierarchy = ["HBM", "L2", "L1", "LDS"]
+    else:
+        cacheHierarchy = mem_level

    x1 = y1 = x2 = y2 = -1
    x1_mfma = y1_mfma = x2_mfma = y2_mfma = -1
@@ -223,6 +227,7 @@ def plot_application(sortType, ret_df, verbose):
                    mfma_flops_bf16 / calls,
                    mfma_flops_f32 / calls,
                    mfma_flops_f64 / calls,
+                    mfma_iops_i8 / calls,
                    lds_data / calls,
                    L1cache_data / calls,
                    L2cache_data / calls,
@@ -466,10 +471,7 @@ def plot_application(sortType, ret_df, verbose):
    return intensityPoints


-def empirical_roof(roof_info, verbose):
-
-    if roof_info["sort"] != "kernels" and roof_info["sort"] != "dispatches":
-        sys.exit("Invalid sort. Must be either 'kernels' or 'dispatches'")
+def empirical_roof(roof_info, mem_level, verbose):

    roofPath = roof_info["path"] + "/roofline.csv"
    # -----------------------------------------------------
@@ -509,7 +511,7 @@ def empirical_roof(roof_info, verbose):
    # ------------------
    #  Generate Roofline
    # ------------------
-    results = plot_roof(roof_info, roof_data, verbose)
+    results = plot_roof(roof_info, roof_data, mem_level, verbose)
    # for key in results:
    #     print(key, "->", results[key])

@@ -211,6 +211,7 @@ def parse(my_parser):
        metavar="",
        type=str,
        default="kernels",
+        choices=["kernels", "dispatches"],
        help="\t\t\tOverlay top kernels or top dispatches: (DEFAULT: kernels)\n\t\t\t   kernels\n\t\t\t   dispatches",
    )
    roofline_group.add_argument(
@@ -219,19 +220,11 @@ def parse(my_parser):
        required=False,
        choices=["HBM", "L2", "vL1D", "LDS"],
        metavar="",
+        nargs="+",
        type=str,
        default="ALL",
        help="\t\t\tFilter by memory level: (DEFAULT: ALL)\n\t\t\t   HBM\n\t\t\t   L2\n\t\t\t   vL1D\n\t\t\t   LDS",
    )
-    roofline_group.add_argument(
-        "--axes",
-        default=None,
-        type=float,
-        required=False,
-        nargs="+",
-        metavar="",
-        help="\t\t\tDesired axis values for graph. As follows:\n\t\t\t   xmin xmax ymin ymax",
-    )
    roofline_group.add_argument(
        "--device",
        metavar="",
@@ -1,672 +0,0 @@
-################################################################################
-# Copyright (c) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-################################################################################
-
-from linecache import cache
-import os
-import sys
-from pathlib import Path
-
-import numpy
-import matplotlib
-
-try:
-
-    import matplotlib.pyplot as plt
-except ImportError:
-    # other non-interactive options:
-    #   cairo, pdf, pgf, ps, svg, template
-    matplotlib.use("agg", force=True)
-    import matplotlib.pyplot as plt
-
-from matplotlib.pyplot import get, text
-from math import log, pi, sqrt
-import pandas as pd
-import pylab
-
-from dataclasses import dataclass
-import csv
-
-
-################################################
-# Global vars
-################################################
-
-IMGNAME = "empirRoof"
-
-L2_BANKS = 32  # default assuming mi200
-
-XMIN = 0.01
-XMAX = 1000
-
-FONT_SIZE = 16
-FONT_COLOR = "black"
-FONT_WEIGHT = "bold"
-
-SUPPORTED_SOC = ["mi200"]
-
-################################################
-# Helper funcs
-################################################
-@dataclass
-class AI_Data:
-    KernelName: str
-    numCalls: float
-
-    total_flops: float
-    valu_flops: float
-    mfma_flops_f16: float
-    mfma_flops_bf16: float
-    mfma_flops_f32: float
-    mfma_flops_f64: float
-    lds_data: float
-    L1cache_data: float
-    L2cache_data: float
-    hbm_data: float
-
-    totalDuration: float
-    avgDuration: float
-
-
-def get_font():
-    return {
-        "size": FONT_SIZE,
-        "color": FONT_COLOR,
-        "weight": FONT_WEIGHT,
-        "family": "serif",
-    }
-
-
-def get_color(catagory):
-    if catagory == "curr_ai_l1":
-        return "green"
-    elif catagory == "curr_ai_l2":
-        return "blue"
-    elif catagory == "curr_ai_hbm":
-        return "red"
-    else:
-        raise RuntimeError("Invalid catagory passed to get_color()")
-
-
-# -------------------------------------------------------------------------------------
-#                           Plot BW at each cache level
-# -------------------------------------------------------------------------------------
-def plot_roof(inputs, roof_data):
-    cacheHierarchy = []
-    if inputs["mem"] == "ALL":
-        cacheHierarchy += ["HBM", "L2", "L1", "LDS"]
-    else:
-        cacheHierarchy.append(inputs["mem"])
-    targ_dtype = (
-        "FP32"
-        if float(roof_data["FP32Flops"][0]) > float(roof_data["FP64Flops"][0])
-        else "FP64"
-    )
-    print("Dtype: ", targ_dtype)
-    print(inputs["mem"])
-    x1 = y1 = x2 = y2 = -1
-    x1_mfma = y1_mfma = x2_mfma = y2_mfma = -1
-    target_precision = targ_dtype[2:]
-
-    peakOps = float(roof_data[targ_dtype + "Flops"][0])
-    for i in range(0, len(cacheHierarchy)):
-        # Plot BW line
-        # print("Current cache level: {}".format(cacheHierarchy[i]))
-        curr_bw = cacheHierarchy[i] + "Bw"
-        peakBw = float(roof_data[curr_bw][0])
-
-        peakMFMA = float(roof_data["MFMAF{}Flops".format(target_precision)][0])
-
-        x1 = float(XMIN)
-        y1 = float(XMIN) * peakBw
-
-        x2 = peakOps / peakBw
-        y2 = peakOps
-
-        plt.plot([x1, x2], [y1, y2], color="magenta")
-        # print("Mem Points: [{}, {}], [{}, {}]".format(x1, x2, y1, y2))
-
-        # Plot MFMA lines (NOTE: Assuming MI200 soc)
-        x1_mfma = peakOps / peakBw
-        y1_mfma = peakOps
-
-        x2_mfma = peakMFMA / peakBw
-        y2_mfma = peakMFMA
-
-        plt.plot([x1_mfma, x2_mfma], [y1_mfma, y2_mfma], color="blue")
-        # print("Extend BW Points: [{}, {}], [{}, {}]".format(x1_mfma, x2_mfma, y1_mfma, y2_mfma))
-
-        # These are the points to use:
-        # print("x = [{}, {}]".format(x1,x2_mfma))
-        # print("y = [{}, {}]".format(y1, y2_mfma))
-
-        # Plot BW label
-        x1log = log(x1) / log(10)
-        x2log = log(x2) / log(10)
-        y1log = log(y1) / log(10)
-        y2log = log(y2) / log(10)
-        x_text = 10 ** ((x1log + x2log) / 2)
-        y_text = 10 ** ((y1log + y2log) / 2)
-
-        fig = plt.gcf()
-        size = fig.get_size_inches() * fig.dpi
-        fig_x, fig_y = size
-
-        # dx = log(x2) - log(x1)
-        # dy = log(y2) - log(y1)
-        # x_min, x_max = plt.xlim()
-        # y_min, y_max = plt.ylim()
-        # Dx = dx * fig_x / (log(x_max) - log(x_min))
-        # Dy = dy * fig_y / (log(y_max) - log(y_min))
-        # #fdiv = 0.7 #TODO: improve accuracy of text angle (tilt)
-        # angle = (180.0 / pi) * numpy.arctan(Dy / Dx )#/fdiv)
-
-        dx = abs(log(x2) - log(x1))
-        dy = abs(log(y2) - log(y1))
-        angle = (180.0 / pi) * numpy.arctan(dy / dx)
-        # If user isn't zooming in, print bw labels normally
-        if not inputs["axes"]:
-            text(
-                x_text,
-                y_text,
-                "{} vL1D GB/s".format(int(peakBw))
-                if cacheHierarchy[i].upper() == "L1"
-                else "{} {} GB/s".format(int(peakBw), cacheHierarchy[i].upper()),
-                rotation=angle,
-                rotation_mode="anchor",
-                **get_font(),
-            )
-        else:
-            # if bw line isn't being cut out then plot bw
-            print("if {} < {}".format(inputs["axes"][0], 10**x2log))
-            if inputs["axes"][0] < 10**x2log:
-                text(
-                    10**x2log,
-                    10**y2log,
-                    "{} {} GB/s".format(int(peakBw), cacheHierarchy[i].upper()),
-                    rotation=angle,
-                    rotation_mode="anchor",
-                    **get_font(),
-                )
-
-    # -------------------------------------------------------------------------------------
-    #                                     Plot computing roof
-    # -------------------------------------------------------------------------------------
-    # Plot FMA roof
-    x0 = XMAX
-    if x2 < x0:
-        x0 = x2
-
-    temp_label = "{} VALU GFLOP/sec".format(int(peakOps))
-    plt.plot([x0, XMAX], [peakOps, peakOps], color="magenta")
-    # print("FMA Points: [{}, {}], [{},{}]".format(x0, XMAX, peakOps, peakOps))
-    text(
-        XMAX if not inputs["axes"] else inputs["axes"][1],
-        peakOps - 4000,  # should i keep this fixed at 4000?
-        temp_label,
-        horizontalalignment="right",
-        **get_font(),
-    )
-
-    # Plot MFMA roof
-    if x1_mfma != -1:  # assert that mfma has been assigned
-        x0_mfma = XMAX
-        if x2_mfma < x0_mfma:
-            x0_mfma = x2_mfma
-
-        peakMFMA = float(roof_data["MFMAF{}Flops".format(target_precision)][0])
-        temp_label = "{} MFMA GFLOP/sec".format(int(peakMFMA))
-        plt.plot([x0_mfma, XMAX], [peakMFMA, peakMFMA], color="blue")
-        # print("MFMA Points: [{}, {}], [{},{}]".format(x0_mfma, XMAX, peakMFMA, peakMFMA))
-        text(
-            XMAX if not inputs["axes"] else inputs["axes"][1],
-            peakMFMA + 1000,
-            temp_label,
-            horizontalalignment="right",
-            **get_font(),
-        )
-
-    return targ_dtype
-
-
-# -------------------------------------------------------------------------------------
-#                              Overlay application performance
-# -------------------------------------------------------------------------------------
-# Calculate relevent metrics for ai calculation
-def plot_application(inputs, verbose):
-
-    df = pd.read_csv(inputs["path"] + "/pmc_perf.csv")
-    # Sort by top kernels or top dispatches?
-    df = df.sort_values(by=["KernelName"])
-    df = df.reset_index(drop=True)
-
-    total_flops = (
-        valu_flops
-    ) = (
-        mfma_flops_bf16
-    ) = (
-        mfma_flops_f16
-    ) = (
-        mfma_iops_i8
-    ) = (
-        mfma_flops_f32
-    ) = (
-        mfma_flops_f64
-    ) = (
-        lds_data
-    ) = L1cache_data = L2cache_data = hbm_data = calls = totalDuration = avgDuration = 0.0
-    kernelName = ""
-
-    myList = []
-    for index, row in df.iterrows():
-        # CASE: Top kernels
-        if inputs["sort"] == "kernels" and (
-            (row["KernelName"] != kernelName and kernelName != "")
-            or index == df.shape[0] - 1
-        ):
-            if df.shape[0] - 1 == index:
-                calls += 1
-            myList.append(
-                AI_Data(
-                    kernelName,
-                    calls,
-                    total_flops / calls,
-                    valu_flops / calls,
-                    mfma_flops_f16 / calls,
-                    mfma_flops_bf16 / calls,
-                    mfma_flops_f32 / calls,
-                    mfma_flops_f64 / calls,
-                    lds_data / calls,
-                    L1cache_data / calls,
-                    L2cache_data / calls,
-                    hbm_data / calls,
-                    totalDuration,
-                    avgDuration / calls,
-                )
-            )
-            if verbose >= 2:
-                print(
-                    "Just added {} to AI_Data at index {}. # of calls: {}".format(
-                        kernelName, index, calls
-                    )
-                )
-            total_flops = (
-                valu_flops
-            ) = (
-                mfma_flops_bf16
-            ) = (
-                mfma_flops_f16
-            ) = (
-                mfma_iops_i8
-            ) = (
-                mfma_flops_f32
-            ) = (
-                mfma_flops_f64
-            ) = (
-                lds_data
-            ) = (
-                L1cache_data
-            ) = L2cache_data = hbm_data = calls = totalDuration = avgDuration = 0.0
-
-        kernelName = row["KernelName"]
-        try:
-            total_flops += (
-                (
-                    64
-                    * (
-                        row["SQ_INSTS_VALU_ADD_F16"]
-                        + row["SQ_INSTS_VALU_MUL_F16"]
-                        + (2 * row["SQ_INSTS_VALU_FMA_F16"])
-                        + row["SQ_INSTS_VALU_TRANS_F16"]
-                    )
-                )
-                + (
-                    64
-                    * (
-                        row["SQ_INSTS_VALU_ADD_F32"]
-                        + row["SQ_INSTS_VALU_MUL_F32"]
-                        + (2 * row["SQ_INSTS_VALU_FMA_F32"])
-                        + row["SQ_INSTS_VALU_TRANS_F32"]
-                    )
-                )
-                + (
-                    64
-                    * (
-                        row["SQ_INSTS_VALU_ADD_F64"]
-                        + row["SQ_INSTS_VALU_MUL_F64"]
-                        + (2 * row["SQ_INSTS_VALU_FMA_F64"])
-                        + row["SQ_INSTS_VALU_TRANS_F64"]
-                    )
-                )
-                + (row["SQ_INSTS_VALU_MFMA_MOPS_F16"] * 512)
-                + (row["SQ_INSTS_VALU_MFMA_MOPS_BF16"] * 512)
-                + (row["SQ_INSTS_VALU_MFMA_MOPS_F32"] * 512)
-                + (row["SQ_INSTS_VALU_MFMA_MOPS_F64"] * 512)
-            )
-        except KeyError:
-            if verbose >= 2:
-                print("Skipped total_flops at index {}".format(index))
-            pass
-        try:
-            valu_flops += (
-                64
-                * (
-                    row["SQ_INSTS_VALU_ADD_F16"]
-                    + row["SQ_INSTS_VALU_MUL_F16"]
-                    + (2 * row["SQ_INSTS_VALU_FMA_F16"])
-                    + row["SQ_INSTS_VALU_TRANS_F16"]
-                )
-                + 64
-                * (
-                    row["SQ_INSTS_VALU_ADD_F32"]
-                    + row["SQ_INSTS_VALU_MUL_F32"]
-                    + (2 * row["SQ_INSTS_VALU_FMA_F32"])
-                    + row["SQ_INSTS_VALU_TRANS_F32"]
-                )
-                + 64
-                * (
-                    row["SQ_INSTS_VALU_ADD_F64"]
-                    + row["SQ_INSTS_VALU_MUL_F64"]
-                    + (2 * row["SQ_INSTS_VALU_FMA_F64"])
-                    + row["SQ_INSTS_VALU_TRANS_F64"]
-                )
-            )
-        except KeyError:
-            if verbose >= 2:
-                print("Skipped valu_flops at index {}".format(index))
-            pass
-
-        try:
-            mfma_flops_f16 += row["SQ_INSTS_VALU_MFMA_MOPS_F16"] * 512
-            mfma_flops_bf16 += row["SQ_INSTS_VALU_MFMA_MOPS_BF16"] * 512
-            mfma_flops_f32 += row["SQ_INSTS_VALU_MFMA_MOPS_F32"] * 512
-            mfma_flops_f64 += row["SQ_INSTS_VALU_MFMA_MOPS_F64"] * 512
-            mfma_iops_i8 += row["SQ_INSTS_VALU_MFMA_MOPS_I8"] * 512
-        except KeyError:
-            if verbose >= 2:
-                print("Skipped mfma ops at index {}".format(index))
-            pass
-
-        try:
-            lds_data += (
-                (row["SQ_LDS_IDX_ACTIVE"] - row["SQ_LDS_BANK_CONFLICT"]) * 4 * L2_BANKS
-            )  # L2_BANKS = 32 (since assuming mi200)
-        except KeyError:
-            if verbose >= 2:
-                print("Skipped lds_data at index {}".format(index))
-            pass
-
-        try:
-            L1cache_data += row["TCP_TOTAL_CACHE_ACCESSES_sum"] * 64
-        except KeyError:
-            if verbose >= 2:
-                print("Skipped L1cache_data at index {}".format(index))
-            pass
-
-        try:
-            L2cache_data += (
-                row["TCP_TCC_WRITE_REQ_sum"] * 64
-                + row["TCP_TCC_ATOMIC_WITH_RET_REQ_sum"] * 64
-                + row["TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum"] * 64
-                + row["TCP_TCC_READ_REQ_sum"] * 64
-            )
-        except KeyError:
-            if verbose >= 2:
-                print("Skipped L2cache_data at index {}".format(index))
-            pass
-        try:
-            hbm_data += (
-                (row["TCC_EA_RDREQ_32B_sum"] * 32)
-                + ((row["TCC_EA_RDREQ_sum"] - row["TCC_EA_RDREQ_32B_sum"]) * 64)
-                + (row["TCC_EA_WRREQ_64B_sum"] * 64)
-                + ((row["TCC_EA_WRREQ_sum"] - row["TCC_EA_WRREQ_64B_sum"]) * 32)
-            )
-        except KeyError:
-            if verbose >= 2:
-                print("Skipped hbm_data at index {}".format(index))
-            pass
-
-        totalDuration += row["EndNs"] - row["BeginNs"]
-
-        avgDuration += row["EndNs"] - row["BeginNs"]
-
-        calls += 1
-        if inputs["sort"] == "dispatches":
-            myList.append(
-                AI_Data(
-                    kernelName,
-                    calls,
-                    total_flops,
-                    valu_flops,
-                    mfma_flops_f16,
-                    mfma_flops_bf16,
-                    mfma_flops_f32,
-                    mfma_flops_f64,
-                    mfma_iops_i8,
-                    lds_data,
-                    L1cache_data,
-                    L2cache_data,
-                    hbm_data,
-                    totalDuration,
-                    avgDuration,
-                )
-            )
-            total_flops = (
-                valu_flops
-            ) = (
-                mfma_flops_bf16
-            ) = (
-                mfma_flops_f16
-            ) = (
-                mfma_iops_i8
-            ) = (
-                mfma_flops_f32
-            ) = (
-                mfma_flops_f64
-            ) = (
-                lds_data
-            ) = (
-                L1cache_data
-            ) = L2cache_data = hbm_data = calls = totalDuration = avgDuration = 0.0
-
-    myList.sort(key=lambda x: x.totalDuration, reverse=True)
-
-    print("Top 10 intensities ('{}')...".format(inputs["sort"]))
-    intensities = {"curr_ai_l1": [], "curr_ai_l2": [], "curr_ai_hbm": []}
-    curr_perf = []
-    i = 0
-    # Create list of top 5 intensities
-    while i <= 9 and i != len(myList):
-        intensities["curr_ai_l1"].append(
-            myList[i].total_flops / myList[i].L1cache_data
-        ) if myList[i].L1cache_data else intensities["curr_ai_l1"].append(0)
-        # print("cur_ai_L1", myList[i].total_flops/myList[i].L1cache_data) if myList[i].L1cache_data else print("null")
-        # print()
-        intensities["curr_ai_l2"].append(
-            myList[i].total_flops / myList[i].L2cache_data
-        ) if myList[i].L2cache_data else intensities["curr_ai_l2"].append(0)
-        # print("cur_ai_L2", myList[i].total_flops/myList[i].L2cache_data) if myList[i].L2cache_data else print("null")
-        # print()
-        intensities["curr_ai_hbm"].append(
-            myList[i].total_flops / myList[i].hbm_data
-        ) if myList[i].hbm_data else intensities["curr_ai_hbm"].append(0)
-        # print("cur_ai_hbm", myList[i].total_flops/myList[i].hbm_data) if myList[i].hbm_data else print("null")
-        # print()
-        curr_perf.append(myList[i].total_flops / myList[i].avgDuration) if myList[
-            i
-        ].avgDuration else curr_perf.append(0)
-        # print("cur_perf", myList[i].total_flops/myList[i].avgDuration) if myList[i].avgDuration else print("null")
-
-        i += 1
-
-    print(intensities)
-
-    plotted_spots = []
-    labels = []
-    for i in intensities:
-        values = intensities[i]
-        color = get_color(i)
-        x = []
-        y = []
-        for entryIndx in range(0, len(values)):
-            x.append(values[entryIndx])
-            y.append(curr_perf[entryIndx])
-        myScatter = plt.scatter(x, y, c=color, marker="o")
-        plotted_spots.append(myScatter)
-        label = i
-        labels.append(label)
-
-    try:
-        pylab.legend(
-            plotted_spots,
-            labels,
-            prop={"size": (FONT_SIZE - 2)},
-            bbox_to_anchor=(1.04, 1),
-            loc="upper left",
-            title="Top {}".format(inputs["sort"]),
-            title_fontsize=FONT_SIZE,
-        )
-    except Exception as e:
-        sys.stderr.write(f"{e}\n")
-        pylab.legend(
-            plotted_spots,
-            labels,
-            prop={"size": (FONT_SIZE - 2)},
-        )
-
-
-def empirical_roof(args):
-    soc = args.target
-    inputs = {
-        "path": str,
-        "cmd": str,
-        "sort": str,
-        "mem": str,
-        "axes": list,
-        "device": int,
-        # "workgroups": int,
-        # "wsize": int,
-        # "dataset": int,
-        # "experiments": int,
-        # "iter": int
-    }
-
-    inputs["sort"] = args.sort.lower()
-    inputs["mem"] = args.mem_level.upper()
-
-    if inputs["sort"] != "kernels" and inputs["sort"] != "dispatches":
-        sys.exit("Invalid sort. Must be either 'kernels' or 'dispatches'")
-    if (
-        inputs["mem"] != "HBM"
-        and inputs["mem"] != "VL1D"
-        and inputs["mem"] != "L2"
-        and inputs["mem"] != "LDS"
-        and inputs["mem"] != "ALL"
-    ):
-        sys.exit(
-            "Invalid mem-level. Must be one of these option 'LDS', 'L2', 'vL1D', or 'HBM'"
-        )
-    if inputs["mem"] == "VL1D":
-        inputs["mem"] = "L1"
-
-    inputs["device"] = int(args.device)
-    # inputs["workgroups"] = int(args.workgroups)
-    # inputs["wsize"] = int(args.wsize)
-    # inputs["dataset"] = int(args.dataset)
-    # inputs["experiments"] = int(args.experiments)
-    # inputs["iter"] = int(args.iter)
-    inputs["path"] = args.path
-    inputs["cmd"] = args.remaining
-    inputs["axes"] = args.axes
-
-    # device_list = [int(item) for item in args.device.split(',')]
-
-    if soc not in SUPPORTED_SOC:
-        sys.exit("SoC not yet supported for Roofline Analysis")
-
-    # Basic Info
-    print("Path: ", inputs["path"])
-    print("Target: ", soc)
-    print("Memory Level: ", inputs["mem"])
-
-    roofPath = inputs["path"] + "/roofline.csv"
-    # -----------------------------------------------------
-    # Initialize roofline data dictionary from roofline.csv
-    # -----------------------------------------------------
-    roof_data = (
-        {}
-    )  # TODO: consider changing this to an ordered dict for consistency over py versions
-    headers = []
-    with open(roofPath, "r") as csvfile:
-        csvReader = csv.reader(csvfile, delimiter=",")
-        rowCount = 0
-        for row in csvReader:
-            row.pop(0)  # remove devID
-            if rowCount == 0:
-                headers = row
-                for i in headers:
-                    roof_data[i] = []
-            else:
-                for i, key in enumerate(headers):
-                    roof_data[key].append(row[i])
-
-            rowCount += 1
-    csvfile.close()
-
-    # Initalize plot
-    f = plt.figure(figsize=(1600 / 100, 1200 / 100), dpi=100)
-    f.add_subplot(111)
-
-    _title_font = get_font()
-    _title_font["size"] += 8
-
-    plt.title("Empirical Roofline", **_title_font)
-    plt.xlabel("Arithmetic Intensity (FLOP/Byte)", **get_font())
-    plt.ylabel("Performance (GFLOP/sec)", **get_font())
-    plt.grid(True, which="major", ls="--", lw=1)
-    plt.grid(True, which="minor", ls="--", lw=0.5)
-    plt.yscale("log")
-    plt.xscale("log")
-    # Adjust axes if instructed
-    if inputs["axes"]:
-        plt.xlim(inputs["axes"][0], inputs["axes"][1])
-        plt.ylim(inputs["axes"][2], inputs["axes"][3])
-
-    # ------------------
-    #  Generate Roofline
-    # ------------------
-    dtype = plot_roof(inputs, roof_data)  # Also returns chosen dtype
-    plot_application(inputs, args.verbose)
-
-    if inputs["device"] == -1:
-        dev_id = "ALL"
-    else:
-        dev_id = str(inputs["device"])
-
-    filename = IMGNAME + "_gpu-" + dev_id + "_{}".format(dtype) + ".pdf"
-
-    full_path = os.path.abspath(inputs["path"])
-    path_to_output = full_path + "/" + filename
-
-    print('Saving plot: "{}"...'.format(filename))
-    plt.savefig(path_to_output, bbox_inches="tight", format="pdf")
-    print('File saved to: "{}"'.format(path_to_output))
-    plt.close()