From b6d6c05e7ba4cd00ea62b0537591f2d260564a8a Mon Sep 17 00:00:00 2001 From: coleramos425 Date: Thu, 26 Jan 2023 14:07:10 -0600 Subject: [PATCH] Re-enable standalone roof options. Everything working. Signed-off-by: coleramos425 --- src/omniperf | 2 +- src/omniperf_analyze/omniperf_analyze.py | 17 +- .../utils/gui_components/roofline.py | 129 ++-- src/omniperf_analyze/utils/roofline_calc.py | 16 +- src/parser.py | 11 +- src/utils/plot_roofline.py | 672 ------------------ 6 files changed, 86 insertions(+), 761 deletions(-) delete mode 100644 src/utils/plot_roofline.py diff --git a/src/omniperf b/src/omniperf index 3697409386..24e4cc94d5 100755 --- a/src/omniperf +++ b/src/omniperf @@ -649,7 +649,7 @@ def main(): # Setup prerequisits for roofline roof_setup(args, my_parser) # Generate roofline - roofline_only(args.path, args.verbose, args.device) + roofline_only(args.path, args.device, args.sort, args.mem_level, args.verbose) # Profile only else: diff --git a/src/omniperf_analyze/omniperf_analyze.py b/src/omniperf_analyze/omniperf_analyze.py index d15dec4c92..5c090bc121 100644 --- a/src/omniperf_analyze/omniperf_analyze.py +++ b/src/omniperf_analyze/omniperf_analyze.py @@ -210,10 +210,15 @@ def run_cli(args, runs): ) -def roofline_only(path_to_dir, verbose, dev_id): +def roofline_only(path_to_dir, dev_id, sort_type, mem_level, verbose): import pandas as pd from collections import OrderedDict + # Change vL1D to an interpretable str, if required + if "vL1D" in mem_level: + mem_level.remove("vL1D") + mem_level.append("L1") + app_path = path_to_dir + "/pmc_perf.csv" roofline_exists = os.path.isfile(app_path) if not roofline_exists: @@ -221,7 +226,15 @@ def roofline_only(path_to_dir, verbose, dev_id): sys.exit(0) t_df = OrderedDict() t_df["pmc_perf"] = pd.read_csv(app_path) - get_roofline(path_to_dir, t_df, dev_id, verbose, True) + get_roofline( + path_to_dir, + t_df, + verbose, + dev_id, # [Optional] Specify device id to collect roofline info from + sort_type, # [Optional] 
Sort AI by top kernels or dispatches + mem_level, # [Optional] Toggle particular level(s) of memory hierarchy + True, # [Optional] Generate a standalone roofline analysis + ) def analyze(args): diff --git a/src/omniperf_analyze/utils/gui_components/roofline.py b/src/omniperf_analyze/utils/gui_components/roofline.py index c3917064d9..89e19af132 100644 --- a/src/omniperf_analyze/utils/gui_components/roofline.py +++ b/src/omniperf_analyze/utils/gui_components/roofline.py @@ -22,6 +22,7 @@ from omniperf_analyze.utils import roofline_calc +import time import numpy as np from dash import html, dash_table @@ -35,72 +36,40 @@ def to_int(a): else: return int(a) -def generate_plots(roof_info, ai_data, isStandalone, verbose, fig=None): + +def generate_plots(roof_info, ai_data, mem_level, is_standalone, verbose, fig=None): if fig is None: fig = go.Figure() - plotMode = "lines+text" if isStandalone else "lines" - line_data = roofline_calc.empirical_roof(roof_info, verbose) + plotMode = "lines+text" if is_standalone else "lines" + line_data = roofline_calc.empirical_roof(roof_info, mem_level, verbose) print("Line data:\n", line_data) ####################### # Plot BW Lines ####################### - fig.add_trace( - go.Scatter( - x=line_data["hbm"][0], - y=line_data["hbm"][1], - name="HBM-{}".format(roof_info["dtype"]), - mode=plotMode, - hovertemplate="%{text}", - text=[ - "{} GB/s".format(to_int(line_data["hbm"][2])), - None if isStandalone else "{} GB/s".format(to_int(line_data["hbm"][2])) - ], - textposition="top right", + if mem_level == "ALL": + cacheHierarchy = ["HBM", "L2", "L1", "LDS"] + else: + cacheHierarchy = mem_level + + for cacheLevel in cacheHierarchy: + fig.add_trace( + go.Scatter( + x=line_data[cacheLevel.lower()][0], + y=line_data[cacheLevel.lower()][1], + name="{}-{}".format(cacheLevel, roof_info["dtype"]), + mode=plotMode, + hovertemplate="%{text}", + text=[ + "{} GB/s".format(to_int(line_data[cacheLevel.lower()][2])), + None + if is_standalone + else "{} 
GB/s".format(to_int(line_data[cacheLevel.lower()][2])), + ], + textposition="top right", + ) ) - ) - fig.add_trace( - go.Scatter( - x=line_data["l2"][0], - y=line_data["l2"][1], - name="L2-{}".format(roof_info["dtype"]), - mode=plotMode, - hovertemplate="%{text}", - text=[ - "{} GB/s".format(to_int(line_data["l2"][2])), - None if isStandalone else "{} GB/s".format(to_int(line_data["l2"][2])) - ], - textposition="top right", - ) - ) - fig.add_trace( - go.Scatter( - x=line_data["l1"][0], - y=line_data["l1"][1], - name="L1-{}".format(roof_info["dtype"]), - mode=plotMode, - hovertemplate="%{text}", - text=[ - "{} GB/s".format(to_int(line_data["l1"][2])), - None if isStandalone else "{} GB/s".format(to_int(line_data["l1"][2])) - ], - textposition="top right", - ) - ) - fig.add_trace( - go.Scatter( - x=line_data["lds"][0], - y=line_data["lds"][1], - name="LDS-{}".format(roof_info["dtype"]), - mode=plotMode, - hovertemplate="%{text}", - text=[ - "{} GB/s".format(to_int(line_data["lds"][2])), - None if isStandalone else "{} GB/s".format(to_int(line_data["lds"][2])) - ], - textposition="top right", - ) - ) + if roof_info["dtype"] != "FP16" and roof_info["dtype"] != "I8": fig.add_trace( go.Scatter( @@ -110,7 +79,9 @@ def generate_plots(roof_info, ai_data, isStandalone, verbose, fig=None): mode=plotMode, hovertemplate="%{text}", text=[ - None if isStandalone else "{} GFLOP/s".format(to_int(line_data["valu"][2])), + None + if is_standalone + else "{} GFLOP/s".format(to_int(line_data["valu"][2])), "{} GFLOP/s".format(to_int(line_data["valu"][2])), ], textposition="top left", @@ -129,7 +100,9 @@ def generate_plots(roof_info, ai_data, isStandalone, verbose, fig=None): mode=plotMode, hovertemplate="%{text}", text=[ - None if isStandalone else "{} GFLOP/s".format(to_int(line_data["mfma"][2])), + None + if is_standalone + else "{} GFLOP/s".format(to_int(line_data["mfma"][2])), "{} GFLOP/s".format(to_int(line_data["mfma"][2])), ], textposition=pos, @@ -176,26 +149,33 @@ def 
generate_plots(roof_info, ai_data, isStandalone, verbose, fig=None): return fig -def get_roofline(path_to_dir, ret_df, verbose, dev_id=None, isStandalone=False): +def get_roofline( + path_to_dir, + ret_df, + verbose, + dev_id=None, + sort_type="kernels", + mem_level="ALL", + is_standalone=False, +): # Roofline settings - # TODO: Make "sort" attribute dynamic so user can select desired sort fp32_details = { "path": path_to_dir, - "sort": "kernels", + "sort": sort_type, "device": 0, "dtype": "FP32", } fp16_details = { "path": path_to_dir, - "sort": "kernels", + "sort": sort_type, "device": 0, "dtype": "FP16", } - int8_details = {"path": path_to_dir, "sort": "kernels", "device": 0, "dtype": "I8"} + int8_details = {"path": path_to_dir, "sort": sort_type, "device": 0, "dtype": "I8"} # Generate roofline plots print("Path: ", path_to_dir) - ai_data = roofline_calc.plot_application("kernels", ret_df, verbose) + ai_data = roofline_calc.plot_application(sort_type, ret_df, verbose) if verbose >= 1: # print AI data for each mem level print("AI at each mem level") @@ -203,17 +183,26 @@ def get_roofline(path_to_dir, ret_df, verbose, dev_id=None, isStandalone=False): print(i, "->", ai_data[i]) print("\n") - fp32_fig = generate_plots(fp32_details, ai_data, isStandalone, verbose) - fp16_fig = generate_plots(fp16_details, ai_data, isStandalone, verbose) - ml_combo_fig = generate_plots(int8_details, ai_data, isStandalone, verbose, fp16_fig) + fp32_fig = generate_plots(fp32_details, ai_data, mem_level, is_standalone, verbose) + fp16_fig = generate_plots(fp16_details, ai_data, mem_level, is_standalone, verbose) + ml_combo_fig = generate_plots( + int8_details, ai_data, mem_level, is_standalone, verbose, fp16_fig + ) - if isStandalone: + if is_standalone: dev_id = "ALL" if dev_id == -1 else str(dev_id) fp32_fig.write_image(path_to_dir + "/empirRoof_gpu-{}_fp32.pdf".format(dev_id)) ml_combo_fig.write_image( path_to_dir + "/empirRoof_gpu-{}_fp8_fp16.pdf".format(dev_id) ) + time.sleep(1) + 
# Re-save to remove loading MathJax pop up + fp32_fig.write_image(path_to_dir + "/empirRoof_gpu-{}_fp32.pdf".format(dev_id)) + ml_combo_fig.write_image( + path_to_dir + "/empirRoof_gpu-{}_fp8_fp16.pdf".format(dev_id) + ) + print("Empirical Roofline PDFs saved!") else: return html.Section( id="roofline", diff --git a/src/omniperf_analyze/utils/roofline_calc.py b/src/omniperf_analyze/utils/roofline_calc.py index ca8b5022a5..158a2cb28b 100644 --- a/src/omniperf_analyze/utils/roofline_calc.py +++ b/src/omniperf_analyze/utils/roofline_calc.py @@ -56,6 +56,7 @@ class AI_Data: mfma_flops_bf16: float mfma_flops_f32: float mfma_flops_f64: float + mfma_iops_i8: float lds_data: float L1cache_data: float L2cache_data: float @@ -88,11 +89,14 @@ def get_color(catagory): # ------------------------------------------------------------------------------------- # Plot BW at each cache level # ------------------------------------------------------------------------------------- -def plot_roof(roof_details, roof_data, verbose): +def plot_roof(roof_details, roof_data, mem_level, verbose): # TODO: This is where filtering by memory level will need to occur for standalone graphPoints = {"hbm": [], "l2": [], "l1": [], "lds": [], "valu": [], "mfma": []} - cacheHierarchy = ["HBM", "L2", "L1", "LDS"] + if mem_level == "ALL": + cacheHierarchy = ["HBM", "L2", "L1", "LDS"] + else: + cacheHierarchy = mem_level x1 = y1 = x2 = y2 = -1 x1_mfma = y1_mfma = x2_mfma = y2_mfma = -1 @@ -223,6 +227,7 @@ def plot_application(sortType, ret_df, verbose): mfma_flops_bf16 / calls, mfma_flops_f32 / calls, mfma_flops_f64 / calls, + mfma_iops_i8 / calls, lds_data / calls, L1cache_data / calls, L2cache_data / calls, @@ -466,10 +471,7 @@ def plot_application(sortType, ret_df, verbose): return intensityPoints -def empirical_roof(roof_info, verbose): - - if roof_info["sort"] != "kernels" and roof_info["sort"] != "dispatches": - sys.exit("Invalid sort. 
Must be either 'kernels' or 'dispatches'") +def empirical_roof(roof_info, mem_level, verbose): roofPath = roof_info["path"] + "/roofline.csv" # ----------------------------------------------------- @@ -509,7 +511,7 @@ def empirical_roof(roof_info, verbose): # ------------------ # Generate Roofline # ------------------ - results = plot_roof(roof_info, roof_data, verbose) + results = plot_roof(roof_info, roof_data, mem_level, verbose) # for key in results: # print(key, "->", results[key]) diff --git a/src/parser.py b/src/parser.py index 149c3f8f86..5f847d092d 100644 --- a/src/parser.py +++ b/src/parser.py @@ -211,6 +211,7 @@ def parse(my_parser): metavar="", type=str, default="kernels", + choices=["kernels", "dispatches"], help="\t\t\tOverlay top kernels or top dispatches: (DEFAULT: kernels)\n\t\t\t kernels\n\t\t\t dispatches", ) roofline_group.add_argument( @@ -219,19 +220,11 @@ def parse(my_parser): required=False, choices=["HBM", "L2", "vL1D", "LDS"], metavar="", + nargs="+", type=str, default="ALL", help="\t\t\tFilter by memory level: (DEFAULT: ALL)\n\t\t\t HBM\n\t\t\t L2\n\t\t\t vL1D\n\t\t\t LDS", ) - roofline_group.add_argument( - "--axes", - default=None, - type=float, - required=False, - nargs="+", - metavar="", - help="\t\t\tDesired axis values for graph. As follows:\n\t\t\t xmin xmax ymin ymax", - ) roofline_group.add_argument( "--device", metavar="", diff --git a/src/utils/plot_roofline.py b/src/utils/plot_roofline.py deleted file mode 100644 index f421dc972e..0000000000 --- a/src/utils/plot_roofline.py +++ /dev/null @@ -1,672 +0,0 @@ -################################################################################ -# Copyright (c) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
-# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. 
-################################################################################ - -from linecache import cache -import os -import sys -from pathlib import Path - -import numpy -import matplotlib - -try: - - import matplotlib.pyplot as plt -except ImportError: - # other non-interactive options: - # cairo, pdf, pgf, ps, svg, template - matplotlib.use("agg", force=True) - import matplotlib.pyplot as plt - -from matplotlib.pyplot import get, text -from math import log, pi, sqrt -import pandas as pd -import pylab - -from dataclasses import dataclass -import csv - - -################################################ -# Global vars -################################################ - -IMGNAME = "empirRoof" - -L2_BANKS = 32 # default assuming mi200 - -XMIN = 0.01 -XMAX = 1000 - -FONT_SIZE = 16 -FONT_COLOR = "black" -FONT_WEIGHT = "bold" - -SUPPORTED_SOC = ["mi200"] - -################################################ -# Helper funcs -################################################ -@dataclass -class AI_Data: - KernelName: str - numCalls: float - - total_flops: float - valu_flops: float - mfma_flops_f16: float - mfma_flops_bf16: float - mfma_flops_f32: float - mfma_flops_f64: float - lds_data: float - L1cache_data: float - L2cache_data: float - hbm_data: float - - totalDuration: float - avgDuration: float - - -def get_font(): - return { - "size": FONT_SIZE, - "color": FONT_COLOR, - "weight": FONT_WEIGHT, - "family": "serif", - } - - -def get_color(catagory): - if catagory == "curr_ai_l1": - return "green" - elif catagory == "curr_ai_l2": - return "blue" - elif catagory == "curr_ai_hbm": - return "red" - else: - raise RuntimeError("Invalid catagory passed to get_color()") - - -# ------------------------------------------------------------------------------------- -# Plot BW at each cache level -# ------------------------------------------------------------------------------------- -def plot_roof(inputs, roof_data): - cacheHierarchy = [] - if inputs["mem"] == "ALL": - 
cacheHierarchy += ["HBM", "L2", "L1", "LDS"] - else: - cacheHierarchy.append(inputs["mem"]) - targ_dtype = ( - "FP32" - if float(roof_data["FP32Flops"][0]) > float(roof_data["FP64Flops"][0]) - else "FP64" - ) - print("Dtype: ", targ_dtype) - print(inputs["mem"]) - x1 = y1 = x2 = y2 = -1 - x1_mfma = y1_mfma = x2_mfma = y2_mfma = -1 - target_precision = targ_dtype[2:] - - peakOps = float(roof_data[targ_dtype + "Flops"][0]) - for i in range(0, len(cacheHierarchy)): - # Plot BW line - # print("Current cache level: {}".format(cacheHierarchy[i])) - curr_bw = cacheHierarchy[i] + "Bw" - peakBw = float(roof_data[curr_bw][0]) - - peakMFMA = float(roof_data["MFMAF{}Flops".format(target_precision)][0]) - - x1 = float(XMIN) - y1 = float(XMIN) * peakBw - - x2 = peakOps / peakBw - y2 = peakOps - - plt.plot([x1, x2], [y1, y2], color="magenta") - # print("Mem Points: [{}, {}], [{}, {}]".format(x1, x2, y1, y2)) - - # Plot MFMA lines (NOTE: Assuming MI200 soc) - x1_mfma = peakOps / peakBw - y1_mfma = peakOps - - x2_mfma = peakMFMA / peakBw - y2_mfma = peakMFMA - - plt.plot([x1_mfma, x2_mfma], [y1_mfma, y2_mfma], color="blue") - # print("Extend BW Points: [{}, {}], [{}, {}]".format(x1_mfma, x2_mfma, y1_mfma, y2_mfma)) - - # These are the points to use: - # print("x = [{}, {}]".format(x1,x2_mfma)) - # print("y = [{}, {}]".format(y1, y2_mfma)) - - # Plot BW label - x1log = log(x1) / log(10) - x2log = log(x2) / log(10) - y1log = log(y1) / log(10) - y2log = log(y2) / log(10) - x_text = 10 ** ((x1log + x2log) / 2) - y_text = 10 ** ((y1log + y2log) / 2) - - fig = plt.gcf() - size = fig.get_size_inches() * fig.dpi - fig_x, fig_y = size - - # dx = log(x2) - log(x1) - # dy = log(y2) - log(y1) - # x_min, x_max = plt.xlim() - # y_min, y_max = plt.ylim() - # Dx = dx * fig_x / (log(x_max) - log(x_min)) - # Dy = dy * fig_y / (log(y_max) - log(y_min)) - # #fdiv = 0.7 #TODO: improve accuracy of text angle (tilt) - # angle = (180.0 / pi) * numpy.arctan(Dy / Dx )#/fdiv) - - dx = abs(log(x2) - log(x1)) 
- dy = abs(log(y2) - log(y1)) - angle = (180.0 / pi) * numpy.arctan(dy / dx) - # If user isn't zooming in, print bw labels normally - if not inputs["axes"]: - text( - x_text, - y_text, - "{} vL1D GB/s".format(int(peakBw)) - if cacheHierarchy[i].upper() == "L1" - else "{} {} GB/s".format(int(peakBw), cacheHierarchy[i].upper()), - rotation=angle, - rotation_mode="anchor", - **get_font(), - ) - else: - # if bw line isn't being cut out then plot bw - print("if {} < {}".format(inputs["axes"][0], 10**x2log)) - if inputs["axes"][0] < 10**x2log: - text( - 10**x2log, - 10**y2log, - "{} {} GB/s".format(int(peakBw), cacheHierarchy[i].upper()), - rotation=angle, - rotation_mode="anchor", - **get_font(), - ) - - # ------------------------------------------------------------------------------------- - # Plot computing roof - # ------------------------------------------------------------------------------------- - # Plot FMA roof - x0 = XMAX - if x2 < x0: - x0 = x2 - - temp_label = "{} VALU GFLOP/sec".format(int(peakOps)) - plt.plot([x0, XMAX], [peakOps, peakOps], color="magenta") - # print("FMA Points: [{}, {}], [{},{}]".format(x0, XMAX, peakOps, peakOps)) - text( - XMAX if not inputs["axes"] else inputs["axes"][1], - peakOps - 4000, # should i keep this fixed at 4000? 
- temp_label, - horizontalalignment="right", - **get_font(), - ) - - # Plot MFMA roof - if x1_mfma != -1: # assert that mfma has been assigned - x0_mfma = XMAX - if x2_mfma < x0_mfma: - x0_mfma = x2_mfma - - peakMFMA = float(roof_data["MFMAF{}Flops".format(target_precision)][0]) - temp_label = "{} MFMA GFLOP/sec".format(int(peakMFMA)) - plt.plot([x0_mfma, XMAX], [peakMFMA, peakMFMA], color="blue") - # print("MFMA Points: [{}, {}], [{},{}]".format(x0_mfma, XMAX, peakMFMA, peakMFMA)) - text( - XMAX if not inputs["axes"] else inputs["axes"][1], - peakMFMA + 1000, - temp_label, - horizontalalignment="right", - **get_font(), - ) - - return targ_dtype - - -# ------------------------------------------------------------------------------------- -# Overlay application performance -# ------------------------------------------------------------------------------------- -# Calculate relevent metrics for ai calculation -def plot_application(inputs, verbose): - - df = pd.read_csv(inputs["path"] + "/pmc_perf.csv") - # Sort by top kernels or top dispatches? 
- df = df.sort_values(by=["KernelName"]) - df = df.reset_index(drop=True) - - total_flops = ( - valu_flops - ) = ( - mfma_flops_bf16 - ) = ( - mfma_flops_f16 - ) = ( - mfma_iops_i8 - ) = ( - mfma_flops_f32 - ) = ( - mfma_flops_f64 - ) = ( - lds_data - ) = L1cache_data = L2cache_data = hbm_data = calls = totalDuration = avgDuration = 0.0 - kernelName = "" - - myList = [] - for index, row in df.iterrows(): - # CASE: Top kernels - if inputs["sort"] == "kernels" and ( - (row["KernelName"] != kernelName and kernelName != "") - or index == df.shape[0] - 1 - ): - if df.shape[0] - 1 == index: - calls += 1 - myList.append( - AI_Data( - kernelName, - calls, - total_flops / calls, - valu_flops / calls, - mfma_flops_f16 / calls, - mfma_flops_bf16 / calls, - mfma_flops_f32 / calls, - mfma_flops_f64 / calls, - lds_data / calls, - L1cache_data / calls, - L2cache_data / calls, - hbm_data / calls, - totalDuration, - avgDuration / calls, - ) - ) - if verbose >= 2: - print( - "Just added {} to AI_Data at index {}. 
# of calls: {}".format( - kernelName, index, calls - ) - ) - total_flops = ( - valu_flops - ) = ( - mfma_flops_bf16 - ) = ( - mfma_flops_f16 - ) = ( - mfma_iops_i8 - ) = ( - mfma_flops_f32 - ) = ( - mfma_flops_f64 - ) = ( - lds_data - ) = ( - L1cache_data - ) = L2cache_data = hbm_data = calls = totalDuration = avgDuration = 0.0 - - kernelName = row["KernelName"] - try: - total_flops += ( - ( - 64 - * ( - row["SQ_INSTS_VALU_ADD_F16"] - + row["SQ_INSTS_VALU_MUL_F16"] - + (2 * row["SQ_INSTS_VALU_FMA_F16"]) - + row["SQ_INSTS_VALU_TRANS_F16"] - ) - ) - + ( - 64 - * ( - row["SQ_INSTS_VALU_ADD_F32"] - + row["SQ_INSTS_VALU_MUL_F32"] - + (2 * row["SQ_INSTS_VALU_FMA_F32"]) - + row["SQ_INSTS_VALU_TRANS_F32"] - ) - ) - + ( - 64 - * ( - row["SQ_INSTS_VALU_ADD_F64"] - + row["SQ_INSTS_VALU_MUL_F64"] - + (2 * row["SQ_INSTS_VALU_FMA_F64"]) - + row["SQ_INSTS_VALU_TRANS_F64"] - ) - ) - + (row["SQ_INSTS_VALU_MFMA_MOPS_F16"] * 512) - + (row["SQ_INSTS_VALU_MFMA_MOPS_BF16"] * 512) - + (row["SQ_INSTS_VALU_MFMA_MOPS_F32"] * 512) - + (row["SQ_INSTS_VALU_MFMA_MOPS_F64"] * 512) - ) - except KeyError: - if verbose >= 2: - print("Skipped total_flops at index {}".format(index)) - pass - try: - valu_flops += ( - 64 - * ( - row["SQ_INSTS_VALU_ADD_F16"] - + row["SQ_INSTS_VALU_MUL_F16"] - + (2 * row["SQ_INSTS_VALU_FMA_F16"]) - + row["SQ_INSTS_VALU_TRANS_F16"] - ) - + 64 - * ( - row["SQ_INSTS_VALU_ADD_F32"] - + row["SQ_INSTS_VALU_MUL_F32"] - + (2 * row["SQ_INSTS_VALU_FMA_F32"]) - + row["SQ_INSTS_VALU_TRANS_F32"] - ) - + 64 - * ( - row["SQ_INSTS_VALU_ADD_F64"] - + row["SQ_INSTS_VALU_MUL_F64"] - + (2 * row["SQ_INSTS_VALU_FMA_F64"]) - + row["SQ_INSTS_VALU_TRANS_F64"] - ) - ) - except KeyError: - if verbose >= 2: - print("Skipped valu_flops at index {}".format(index)) - pass - - try: - mfma_flops_f16 += row["SQ_INSTS_VALU_MFMA_MOPS_F16"] * 512 - mfma_flops_bf16 += row["SQ_INSTS_VALU_MFMA_MOPS_BF16"] * 512 - mfma_flops_f32 += row["SQ_INSTS_VALU_MFMA_MOPS_F32"] * 512 - mfma_flops_f64 += 
row["SQ_INSTS_VALU_MFMA_MOPS_F64"] * 512 - mfma_iops_i8 += row["SQ_INSTS_VALU_MFMA_MOPS_I8"] * 512 - except KeyError: - if verbose >= 2: - print("Skipped mfma ops at index {}".format(index)) - pass - - try: - lds_data += ( - (row["SQ_LDS_IDX_ACTIVE"] - row["SQ_LDS_BANK_CONFLICT"]) * 4 * L2_BANKS - ) # L2_BANKS = 32 (since assuming mi200) - except KeyError: - if verbose >= 2: - print("Skipped lds_data at index {}".format(index)) - pass - - try: - L1cache_data += row["TCP_TOTAL_CACHE_ACCESSES_sum"] * 64 - except KeyError: - if verbose >= 2: - print("Skipped L1cache_data at index {}".format(index)) - pass - - try: - L2cache_data += ( - row["TCP_TCC_WRITE_REQ_sum"] * 64 - + row["TCP_TCC_ATOMIC_WITH_RET_REQ_sum"] * 64 - + row["TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum"] * 64 - + row["TCP_TCC_READ_REQ_sum"] * 64 - ) - except KeyError: - if verbose >= 2: - print("Skipped L2cache_data at index {}".format(index)) - pass - try: - hbm_data += ( - (row["TCC_EA_RDREQ_32B_sum"] * 32) - + ((row["TCC_EA_RDREQ_sum"] - row["TCC_EA_RDREQ_32B_sum"]) * 64) - + (row["TCC_EA_WRREQ_64B_sum"] * 64) - + ((row["TCC_EA_WRREQ_sum"] - row["TCC_EA_WRREQ_64B_sum"]) * 32) - ) - except KeyError: - if verbose >= 2: - print("Skipped hbm_data at index {}".format(index)) - pass - - totalDuration += row["EndNs"] - row["BeginNs"] - - avgDuration += row["EndNs"] - row["BeginNs"] - - calls += 1 - if inputs["sort"] == "dispatches": - myList.append( - AI_Data( - kernelName, - calls, - total_flops, - valu_flops, - mfma_flops_f16, - mfma_flops_bf16, - mfma_flops_f32, - mfma_flops_f64, - mfma_iops_i8, - lds_data, - L1cache_data, - L2cache_data, - hbm_data, - totalDuration, - avgDuration, - ) - ) - total_flops = ( - valu_flops - ) = ( - mfma_flops_bf16 - ) = ( - mfma_flops_f16 - ) = ( - mfma_iops_i8 - ) = ( - mfma_flops_f32 - ) = ( - mfma_flops_f64 - ) = ( - lds_data - ) = ( - L1cache_data - ) = L2cache_data = hbm_data = calls = totalDuration = avgDuration = 0.0 - - myList.sort(key=lambda x: x.totalDuration, 
reverse=True) - - print("Top 10 intensities ('{}')...".format(inputs["sort"])) - intensities = {"curr_ai_l1": [], "curr_ai_l2": [], "curr_ai_hbm": []} - curr_perf = [] - i = 0 - # Create list of top 5 intensities - while i <= 9 and i != len(myList): - intensities["curr_ai_l1"].append( - myList[i].total_flops / myList[i].L1cache_data - ) if myList[i].L1cache_data else intensities["curr_ai_l1"].append(0) - # print("cur_ai_L1", myList[i].total_flops/myList[i].L1cache_data) if myList[i].L1cache_data else print("null") - # print() - intensities["curr_ai_l2"].append( - myList[i].total_flops / myList[i].L2cache_data - ) if myList[i].L2cache_data else intensities["curr_ai_l2"].append(0) - # print("cur_ai_L2", myList[i].total_flops/myList[i].L2cache_data) if myList[i].L2cache_data else print("null") - # print() - intensities["curr_ai_hbm"].append( - myList[i].total_flops / myList[i].hbm_data - ) if myList[i].hbm_data else intensities["curr_ai_hbm"].append(0) - # print("cur_ai_hbm", myList[i].total_flops/myList[i].hbm_data) if myList[i].hbm_data else print("null") - # print() - curr_perf.append(myList[i].total_flops / myList[i].avgDuration) if myList[ - i - ].avgDuration else curr_perf.append(0) - # print("cur_perf", myList[i].total_flops/myList[i].avgDuration) if myList[i].avgDuration else print("null") - - i += 1 - - print(intensities) - - plotted_spots = [] - labels = [] - for i in intensities: - values = intensities[i] - color = get_color(i) - x = [] - y = [] - for entryIndx in range(0, len(values)): - x.append(values[entryIndx]) - y.append(curr_perf[entryIndx]) - myScatter = plt.scatter(x, y, c=color, marker="o") - plotted_spots.append(myScatter) - label = i - labels.append(label) - - try: - pylab.legend( - plotted_spots, - labels, - prop={"size": (FONT_SIZE - 2)}, - bbox_to_anchor=(1.04, 1), - loc="upper left", - title="Top {}".format(inputs["sort"]), - title_fontsize=FONT_SIZE, - ) - except Exception as e: - sys.stderr.write(f"{e}\n") - pylab.legend( - plotted_spots, 
- labels, - prop={"size": (FONT_SIZE - 2)}, - ) - - -def empirical_roof(args): - soc = args.target - inputs = { - "path": str, - "cmd": str, - "sort": str, - "mem": str, - "axes": list, - "device": int, - # "workgroups": int, - # "wsize": int, - # "dataset": int, - # "experiments": int, - # "iter": int - } - - inputs["sort"] = args.sort.lower() - inputs["mem"] = args.mem_level.upper() - - if inputs["sort"] != "kernels" and inputs["sort"] != "dispatches": - sys.exit("Invalid sort. Must be either 'kernels' or 'dispatches'") - if ( - inputs["mem"] != "HBM" - and inputs["mem"] != "VL1D" - and inputs["mem"] != "L2" - and inputs["mem"] != "LDS" - and inputs["mem"] != "ALL" - ): - sys.exit( - "Invalid mem-level. Must be one of these option 'LDS', 'L2', 'vL1D', or 'HBM'" - ) - if inputs["mem"] == "VL1D": - inputs["mem"] = "L1" - - inputs["device"] = int(args.device) - # inputs["workgroups"] = int(args.workgroups) - # inputs["wsize"] = int(args.wsize) - # inputs["dataset"] = int(args.dataset) - # inputs["experiments"] = int(args.experiments) - # inputs["iter"] = int(args.iter) - inputs["path"] = args.path - inputs["cmd"] = args.remaining - inputs["axes"] = args.axes - - # device_list = [int(item) for item in args.device.split(',')] - - if soc not in SUPPORTED_SOC: - sys.exit("SoC not yet supported for Roofline Analysis") - - # Basic Info - print("Path: ", inputs["path"]) - print("Target: ", soc) - print("Memory Level: ", inputs["mem"]) - - roofPath = inputs["path"] + "/roofline.csv" - # ----------------------------------------------------- - # Initialize roofline data dictionary from roofline.csv - # ----------------------------------------------------- - roof_data = ( - {} - ) # TODO: consider changing this to an ordered dict for consistency over py versions - headers = [] - with open(roofPath, "r") as csvfile: - csvReader = csv.reader(csvfile, delimiter=",") - rowCount = 0 - for row in csvReader: - row.pop(0) # remove devID - if rowCount == 0: - headers = row - for i in 
headers: - roof_data[i] = [] - else: - for i, key in enumerate(headers): - roof_data[key].append(row[i]) - - rowCount += 1 - csvfile.close() - - # Initalize plot - f = plt.figure(figsize=(1600 / 100, 1200 / 100), dpi=100) - f.add_subplot(111) - - _title_font = get_font() - _title_font["size"] += 8 - - plt.title("Empirical Roofline", **_title_font) - plt.xlabel("Arithmetic Intensity (FLOP/Byte)", **get_font()) - plt.ylabel("Performance (GFLOP/sec)", **get_font()) - plt.grid(True, which="major", ls="--", lw=1) - plt.grid(True, which="minor", ls="--", lw=0.5) - plt.yscale("log") - plt.xscale("log") - # Adjust axes if instructed - if inputs["axes"]: - plt.xlim(inputs["axes"][0], inputs["axes"][1]) - plt.ylim(inputs["axes"][2], inputs["axes"][3]) - - # ------------------ - # Generate Roofline - # ------------------ - dtype = plot_roof(inputs, roof_data) # Also returns chosen dtype - plot_application(inputs, args.verbose) - - if inputs["device"] == -1: - dev_id = "ALL" - else: - dev_id = str(inputs["device"]) - - filename = IMGNAME + "_gpu-" + dev_id + "_{}".format(dtype) + ".pdf" - - full_path = os.path.abspath(inputs["path"]) - path_to_output = full_path + "/" + filename - - print('Saving plot: "{}"...'.format(filename)) - plt.savefig(path_to_output, bbox_inches="tight", format="pdf") - print('File saved to: "{}"'.format(path_to_output)) - plt.close()