diff --git a/src/omniperf b/src/omniperf
index 3697409386..24e4cc94d5 100755
--- a/src/omniperf
+++ b/src/omniperf
@@ -649,7 +649,7 @@ def main():
# Setup prerequisits for roofline
roof_setup(args, my_parser)
# Generate roofline
- roofline_only(args.path, args.verbose, args.device)
+ roofline_only(args.path, args.device, args.sort, args.mem_level, args.verbose)
# Profile only
else:
diff --git a/src/omniperf_analyze/omniperf_analyze.py b/src/omniperf_analyze/omniperf_analyze.py
index d15dec4c92..5c090bc121 100644
--- a/src/omniperf_analyze/omniperf_analyze.py
+++ b/src/omniperf_analyze/omniperf_analyze.py
@@ -210,10 +210,15 @@ def run_cli(args, runs):
)
-def roofline_only(path_to_dir, verbose, dev_id):
+def roofline_only(path_to_dir, dev_id, sort_type, mem_level, verbose):
import pandas as pd
from collections import OrderedDict
+ # Change vL1D to an interpretable str, if required
+ if "vL1D" in mem_level:
+ mem_level.remove("vL1D")
+ mem_level.append("L1")
+
app_path = path_to_dir + "/pmc_perf.csv"
roofline_exists = os.path.isfile(app_path)
if not roofline_exists:
@@ -221,7 +226,15 @@ def roofline_only(path_to_dir, verbose, dev_id):
sys.exit(0)
t_df = OrderedDict()
t_df["pmc_perf"] = pd.read_csv(app_path)
- get_roofline(path_to_dir, t_df, dev_id, verbose, True)
+ get_roofline(
+ path_to_dir,
+ t_df,
+ verbose,
+ dev_id, # [Optional] Specify device id to collect roofline info from
+ sort_type, # [Optional] Sort AI by top kernels or dispatches
+ mem_level, # [Optional] Toggle particular level(s) of memory hierarchy
+ True, # [Optional] Generate a standalone roofline analysis
+ )
def analyze(args):
diff --git a/src/omniperf_analyze/utils/gui_components/roofline.py b/src/omniperf_analyze/utils/gui_components/roofline.py
index c3917064d9..89e19af132 100644
--- a/src/omniperf_analyze/utils/gui_components/roofline.py
+++ b/src/omniperf_analyze/utils/gui_components/roofline.py
@@ -22,6 +22,7 @@
from omniperf_analyze.utils import roofline_calc
+import time
import numpy as np
from dash import html, dash_table
@@ -35,72 +36,40 @@ def to_int(a):
else:
return int(a)
-def generate_plots(roof_info, ai_data, isStandalone, verbose, fig=None):
+
+def generate_plots(roof_info, ai_data, mem_level, is_standalone, verbose, fig=None):
if fig is None:
fig = go.Figure()
- plotMode = "lines+text" if isStandalone else "lines"
- line_data = roofline_calc.empirical_roof(roof_info, verbose)
+ plotMode = "lines+text" if is_standalone else "lines"
+ line_data = roofline_calc.empirical_roof(roof_info, mem_level, verbose)
print("Line data:\n", line_data)
#######################
# Plot BW Lines
#######################
- fig.add_trace(
- go.Scatter(
- x=line_data["hbm"][0],
- y=line_data["hbm"][1],
- name="HBM-{}".format(roof_info["dtype"]),
- mode=plotMode,
- hovertemplate="%{text}",
- text=[
- "{} GB/s".format(to_int(line_data["hbm"][2])),
- None if isStandalone else "{} GB/s".format(to_int(line_data["hbm"][2]))
- ],
- textposition="top right",
+ if mem_level == "ALL":
+ cacheHierarchy = ["HBM", "L2", "L1", "LDS"]
+ else:
+ cacheHierarchy = mem_level
+
+ for cacheLevel in cacheHierarchy:
+ fig.add_trace(
+ go.Scatter(
+ x=line_data[cacheLevel.lower()][0],
+ y=line_data[cacheLevel.lower()][1],
+ name="{}-{}".format(cacheLevel, roof_info["dtype"]),
+ mode=plotMode,
+ hovertemplate="%{text}",
+ text=[
+ "{} GB/s".format(to_int(line_data[cacheLevel.lower()][2])),
+ None
+ if is_standalone
+ else "{} GB/s".format(to_int(line_data[cacheLevel.lower()][2])),
+ ],
+ textposition="top right",
+ )
)
- )
- fig.add_trace(
- go.Scatter(
- x=line_data["l2"][0],
- y=line_data["l2"][1],
- name="L2-{}".format(roof_info["dtype"]),
- mode=plotMode,
- hovertemplate="%{text}",
- text=[
- "{} GB/s".format(to_int(line_data["l2"][2])),
- None if isStandalone else "{} GB/s".format(to_int(line_data["l2"][2]))
- ],
- textposition="top right",
- )
- )
- fig.add_trace(
- go.Scatter(
- x=line_data["l1"][0],
- y=line_data["l1"][1],
- name="L1-{}".format(roof_info["dtype"]),
- mode=plotMode,
- hovertemplate="%{text}",
- text=[
- "{} GB/s".format(to_int(line_data["l1"][2])),
- None if isStandalone else "{} GB/s".format(to_int(line_data["l1"][2]))
- ],
- textposition="top right",
- )
- )
- fig.add_trace(
- go.Scatter(
- x=line_data["lds"][0],
- y=line_data["lds"][1],
- name="LDS-{}".format(roof_info["dtype"]),
- mode=plotMode,
- hovertemplate="%{text}",
- text=[
- "{} GB/s".format(to_int(line_data["lds"][2])),
- None if isStandalone else "{} GB/s".format(to_int(line_data["lds"][2]))
- ],
- textposition="top right",
- )
- )
+
if roof_info["dtype"] != "FP16" and roof_info["dtype"] != "I8":
fig.add_trace(
go.Scatter(
@@ -110,7 +79,9 @@ def generate_plots(roof_info, ai_data, isStandalone, verbose, fig=None):
mode=plotMode,
hovertemplate="%{text}",
text=[
- None if isStandalone else "{} GFLOP/s".format(to_int(line_data["valu"][2])),
+ None
+ if is_standalone
+ else "{} GFLOP/s".format(to_int(line_data["valu"][2])),
"{} GFLOP/s".format(to_int(line_data["valu"][2])),
],
textposition="top left",
@@ -129,7 +100,9 @@ def generate_plots(roof_info, ai_data, isStandalone, verbose, fig=None):
mode=plotMode,
hovertemplate="%{text}",
text=[
- None if isStandalone else "{} GFLOP/s".format(to_int(line_data["mfma"][2])),
+ None
+ if is_standalone
+ else "{} GFLOP/s".format(to_int(line_data["mfma"][2])),
"{} GFLOP/s".format(to_int(line_data["mfma"][2])),
],
textposition=pos,
@@ -176,26 +149,33 @@ def generate_plots(roof_info, ai_data, isStandalone, verbose, fig=None):
return fig
-def get_roofline(path_to_dir, ret_df, verbose, dev_id=None, isStandalone=False):
+def get_roofline(
+ path_to_dir,
+ ret_df,
+ verbose,
+ dev_id=None,
+ sort_type="kernels",
+ mem_level="ALL",
+ is_standalone=False,
+):
# Roofline settings
- # TODO: Make "sort" attribute dynamic so user can select desired sort
fp32_details = {
"path": path_to_dir,
- "sort": "kernels",
+ "sort": sort_type,
"device": 0,
"dtype": "FP32",
}
fp16_details = {
"path": path_to_dir,
- "sort": "kernels",
+ "sort": sort_type,
"device": 0,
"dtype": "FP16",
}
- int8_details = {"path": path_to_dir, "sort": "kernels", "device": 0, "dtype": "I8"}
+ int8_details = {"path": path_to_dir, "sort": sort_type, "device": 0, "dtype": "I8"}
# Generate roofline plots
print("Path: ", path_to_dir)
- ai_data = roofline_calc.plot_application("kernels", ret_df, verbose)
+ ai_data = roofline_calc.plot_application(sort_type, ret_df, verbose)
if verbose >= 1:
# print AI data for each mem level
print("AI at each mem level")
@@ -203,17 +183,26 @@ def get_roofline(path_to_dir, ret_df, verbose, dev_id=None, isStandalone=False):
print(i, "->", ai_data[i])
print("\n")
- fp32_fig = generate_plots(fp32_details, ai_data, isStandalone, verbose)
- fp16_fig = generate_plots(fp16_details, ai_data, isStandalone, verbose)
- ml_combo_fig = generate_plots(int8_details, ai_data, isStandalone, verbose, fp16_fig)
+ fp32_fig = generate_plots(fp32_details, ai_data, mem_level, is_standalone, verbose)
+ fp16_fig = generate_plots(fp16_details, ai_data, mem_level, is_standalone, verbose)
+ ml_combo_fig = generate_plots(
+ int8_details, ai_data, mem_level, is_standalone, verbose, fp16_fig
+ )
- if isStandalone:
+ if is_standalone:
dev_id = "ALL" if dev_id == -1 else str(dev_id)
fp32_fig.write_image(path_to_dir + "/empirRoof_gpu-{}_fp32.pdf".format(dev_id))
ml_combo_fig.write_image(
path_to_dir + "/empirRoof_gpu-{}_fp8_fp16.pdf".format(dev_id)
)
+ time.sleep(1)
+ # Re-save after a short pause to remove the "Loading MathJax" pop-up from the PDFs
+ fp32_fig.write_image(path_to_dir + "/empirRoof_gpu-{}_fp32.pdf".format(dev_id))
+ ml_combo_fig.write_image(
+ path_to_dir + "/empirRoof_gpu-{}_fp8_fp16.pdf".format(dev_id)
+ )
+ print("Empirical Roofline PDFs saved!")
else:
return html.Section(
id="roofline",
diff --git a/src/omniperf_analyze/utils/roofline_calc.py b/src/omniperf_analyze/utils/roofline_calc.py
index ca8b5022a5..158a2cb28b 100644
--- a/src/omniperf_analyze/utils/roofline_calc.py
+++ b/src/omniperf_analyze/utils/roofline_calc.py
@@ -56,6 +56,7 @@ class AI_Data:
mfma_flops_bf16: float
mfma_flops_f32: float
mfma_flops_f64: float
+ mfma_iops_i8: float
lds_data: float
L1cache_data: float
L2cache_data: float
@@ -88,11 +89,14 @@ def get_color(catagory):
# -------------------------------------------------------------------------------------
# Plot BW at each cache level
# -------------------------------------------------------------------------------------
-def plot_roof(roof_details, roof_data, verbose):
+def plot_roof(roof_details, roof_data, mem_level, verbose):
# TODO: This is where filtering by memory level will need to occur for standalone
graphPoints = {"hbm": [], "l2": [], "l1": [], "lds": [], "valu": [], "mfma": []}
- cacheHierarchy = ["HBM", "L2", "L1", "LDS"]
+ if mem_level == "ALL":
+ cacheHierarchy = ["HBM", "L2", "L1", "LDS"]
+ else:
+ cacheHierarchy = mem_level
x1 = y1 = x2 = y2 = -1
x1_mfma = y1_mfma = x2_mfma = y2_mfma = -1
@@ -223,6 +227,7 @@ def plot_application(sortType, ret_df, verbose):
mfma_flops_bf16 / calls,
mfma_flops_f32 / calls,
mfma_flops_f64 / calls,
+ mfma_iops_i8 / calls,
lds_data / calls,
L1cache_data / calls,
L2cache_data / calls,
@@ -466,10 +471,7 @@ def plot_application(sortType, ret_df, verbose):
return intensityPoints
-def empirical_roof(roof_info, verbose):
-
- if roof_info["sort"] != "kernels" and roof_info["sort"] != "dispatches":
- sys.exit("Invalid sort. Must be either 'kernels' or 'dispatches'")
+def empirical_roof(roof_info, mem_level, verbose):
roofPath = roof_info["path"] + "/roofline.csv"
# -----------------------------------------------------
@@ -509,7 +511,7 @@ def empirical_roof(roof_info, verbose):
# ------------------
# Generate Roofline
# ------------------
- results = plot_roof(roof_info, roof_data, verbose)
+ results = plot_roof(roof_info, roof_data, mem_level, verbose)
# for key in results:
# print(key, "->", results[key])
diff --git a/src/parser.py b/src/parser.py
index 149c3f8f86..5f847d092d 100644
--- a/src/parser.py
+++ b/src/parser.py
@@ -211,6 +211,7 @@ def parse(my_parser):
metavar="",
type=str,
default="kernels",
+ choices=["kernels", "dispatches"],
help="\t\t\tOverlay top kernels or top dispatches: (DEFAULT: kernels)\n\t\t\t kernels\n\t\t\t dispatches",
)
roofline_group.add_argument(
@@ -219,19 +220,11 @@ def parse(my_parser):
required=False,
choices=["HBM", "L2", "vL1D", "LDS"],
metavar="",
+ nargs="+",
type=str,
default="ALL",
help="\t\t\tFilter by memory level: (DEFAULT: ALL)\n\t\t\t HBM\n\t\t\t L2\n\t\t\t vL1D\n\t\t\t LDS",
)
- roofline_group.add_argument(
- "--axes",
- default=None,
- type=float,
- required=False,
- nargs="+",
- metavar="",
- help="\t\t\tDesired axis values for graph. As follows:\n\t\t\t xmin xmax ymin ymax",
- )
roofline_group.add_argument(
"--device",
metavar="",
diff --git a/src/utils/plot_roofline.py b/src/utils/plot_roofline.py
deleted file mode 100644
index f421dc972e..0000000000
--- a/src/utils/plot_roofline.py
+++ /dev/null
@@ -1,672 +0,0 @@
-################################################################################
-# Copyright (c) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-################################################################################
-
-from linecache import cache
-import os
-import sys
-from pathlib import Path
-
-import numpy
-import matplotlib
-
-try:
-
- import matplotlib.pyplot as plt
-except ImportError:
- # other non-interactive options:
- # cairo, pdf, pgf, ps, svg, template
- matplotlib.use("agg", force=True)
- import matplotlib.pyplot as plt
-
-from matplotlib.pyplot import get, text
-from math import log, pi, sqrt
-import pandas as pd
-import pylab
-
-from dataclasses import dataclass
-import csv
-
-
-################################################
-# Global vars
-################################################
-
-IMGNAME = "empirRoof"
-
-L2_BANKS = 32 # default assuming mi200
-
-XMIN = 0.01
-XMAX = 1000
-
-FONT_SIZE = 16
-FONT_COLOR = "black"
-FONT_WEIGHT = "bold"
-
-SUPPORTED_SOC = ["mi200"]
-
-################################################
-# Helper funcs
-################################################
-@dataclass
-class AI_Data:
- KernelName: str
- numCalls: float
-
- total_flops: float
- valu_flops: float
- mfma_flops_f16: float
- mfma_flops_bf16: float
- mfma_flops_f32: float
- mfma_flops_f64: float
- lds_data: float
- L1cache_data: float
- L2cache_data: float
- hbm_data: float
-
- totalDuration: float
- avgDuration: float
-
-
-def get_font():
- return {
- "size": FONT_SIZE,
- "color": FONT_COLOR,
- "weight": FONT_WEIGHT,
- "family": "serif",
- }
-
-
-def get_color(catagory):
- if catagory == "curr_ai_l1":
- return "green"
- elif catagory == "curr_ai_l2":
- return "blue"
- elif catagory == "curr_ai_hbm":
- return "red"
- else:
- raise RuntimeError("Invalid catagory passed to get_color()")
-
-
-# -------------------------------------------------------------------------------------
-# Plot BW at each cache level
-# -------------------------------------------------------------------------------------
-def plot_roof(inputs, roof_data):
- cacheHierarchy = []
- if inputs["mem"] == "ALL":
- cacheHierarchy += ["HBM", "L2", "L1", "LDS"]
- else:
- cacheHierarchy.append(inputs["mem"])
- targ_dtype = (
- "FP32"
- if float(roof_data["FP32Flops"][0]) > float(roof_data["FP64Flops"][0])
- else "FP64"
- )
- print("Dtype: ", targ_dtype)
- print(inputs["mem"])
- x1 = y1 = x2 = y2 = -1
- x1_mfma = y1_mfma = x2_mfma = y2_mfma = -1
- target_precision = targ_dtype[2:]
-
- peakOps = float(roof_data[targ_dtype + "Flops"][0])
- for i in range(0, len(cacheHierarchy)):
- # Plot BW line
- # print("Current cache level: {}".format(cacheHierarchy[i]))
- curr_bw = cacheHierarchy[i] + "Bw"
- peakBw = float(roof_data[curr_bw][0])
-
- peakMFMA = float(roof_data["MFMAF{}Flops".format(target_precision)][0])
-
- x1 = float(XMIN)
- y1 = float(XMIN) * peakBw
-
- x2 = peakOps / peakBw
- y2 = peakOps
-
- plt.plot([x1, x2], [y1, y2], color="magenta")
- # print("Mem Points: [{}, {}], [{}, {}]".format(x1, x2, y1, y2))
-
- # Plot MFMA lines (NOTE: Assuming MI200 soc)
- x1_mfma = peakOps / peakBw
- y1_mfma = peakOps
-
- x2_mfma = peakMFMA / peakBw
- y2_mfma = peakMFMA
-
- plt.plot([x1_mfma, x2_mfma], [y1_mfma, y2_mfma], color="blue")
- # print("Extend BW Points: [{}, {}], [{}, {}]".format(x1_mfma, x2_mfma, y1_mfma, y2_mfma))
-
- # These are the points to use:
- # print("x = [{}, {}]".format(x1,x2_mfma))
- # print("y = [{}, {}]".format(y1, y2_mfma))
-
- # Plot BW label
- x1log = log(x1) / log(10)
- x2log = log(x2) / log(10)
- y1log = log(y1) / log(10)
- y2log = log(y2) / log(10)
- x_text = 10 ** ((x1log + x2log) / 2)
- y_text = 10 ** ((y1log + y2log) / 2)
-
- fig = plt.gcf()
- size = fig.get_size_inches() * fig.dpi
- fig_x, fig_y = size
-
- # dx = log(x2) - log(x1)
- # dy = log(y2) - log(y1)
- # x_min, x_max = plt.xlim()
- # y_min, y_max = plt.ylim()
- # Dx = dx * fig_x / (log(x_max) - log(x_min))
- # Dy = dy * fig_y / (log(y_max) - log(y_min))
- # #fdiv = 0.7 #TODO: improve accuracy of text angle (tilt)
- # angle = (180.0 / pi) * numpy.arctan(Dy / Dx )#/fdiv)
-
- dx = abs(log(x2) - log(x1))
- dy = abs(log(y2) - log(y1))
- angle = (180.0 / pi) * numpy.arctan(dy / dx)
- # If user isn't zooming in, print bw labels normally
- if not inputs["axes"]:
- text(
- x_text,
- y_text,
- "{} vL1D GB/s".format(int(peakBw))
- if cacheHierarchy[i].upper() == "L1"
- else "{} {} GB/s".format(int(peakBw), cacheHierarchy[i].upper()),
- rotation=angle,
- rotation_mode="anchor",
- **get_font(),
- )
- else:
- # if bw line isn't being cut out then plot bw
- print("if {} < {}".format(inputs["axes"][0], 10**x2log))
- if inputs["axes"][0] < 10**x2log:
- text(
- 10**x2log,
- 10**y2log,
- "{} {} GB/s".format(int(peakBw), cacheHierarchy[i].upper()),
- rotation=angle,
- rotation_mode="anchor",
- **get_font(),
- )
-
- # -------------------------------------------------------------------------------------
- # Plot computing roof
- # -------------------------------------------------------------------------------------
- # Plot FMA roof
- x0 = XMAX
- if x2 < x0:
- x0 = x2
-
- temp_label = "{} VALU GFLOP/sec".format(int(peakOps))
- plt.plot([x0, XMAX], [peakOps, peakOps], color="magenta")
- # print("FMA Points: [{}, {}], [{},{}]".format(x0, XMAX, peakOps, peakOps))
- text(
- XMAX if not inputs["axes"] else inputs["axes"][1],
- peakOps - 4000, # should i keep this fixed at 4000?
- temp_label,
- horizontalalignment="right",
- **get_font(),
- )
-
- # Plot MFMA roof
- if x1_mfma != -1: # assert that mfma has been assigned
- x0_mfma = XMAX
- if x2_mfma < x0_mfma:
- x0_mfma = x2_mfma
-
- peakMFMA = float(roof_data["MFMAF{}Flops".format(target_precision)][0])
- temp_label = "{} MFMA GFLOP/sec".format(int(peakMFMA))
- plt.plot([x0_mfma, XMAX], [peakMFMA, peakMFMA], color="blue")
- # print("MFMA Points: [{}, {}], [{},{}]".format(x0_mfma, XMAX, peakMFMA, peakMFMA))
- text(
- XMAX if not inputs["axes"] else inputs["axes"][1],
- peakMFMA + 1000,
- temp_label,
- horizontalalignment="right",
- **get_font(),
- )
-
- return targ_dtype
-
-
-# -------------------------------------------------------------------------------------
-# Overlay application performance
-# -------------------------------------------------------------------------------------
-# Calculate relevent metrics for ai calculation
-def plot_application(inputs, verbose):
-
- df = pd.read_csv(inputs["path"] + "/pmc_perf.csv")
- # Sort by top kernels or top dispatches?
- df = df.sort_values(by=["KernelName"])
- df = df.reset_index(drop=True)
-
- total_flops = (
- valu_flops
- ) = (
- mfma_flops_bf16
- ) = (
- mfma_flops_f16
- ) = (
- mfma_iops_i8
- ) = (
- mfma_flops_f32
- ) = (
- mfma_flops_f64
- ) = (
- lds_data
- ) = L1cache_data = L2cache_data = hbm_data = calls = totalDuration = avgDuration = 0.0
- kernelName = ""
-
- myList = []
- for index, row in df.iterrows():
- # CASE: Top kernels
- if inputs["sort"] == "kernels" and (
- (row["KernelName"] != kernelName and kernelName != "")
- or index == df.shape[0] - 1
- ):
- if df.shape[0] - 1 == index:
- calls += 1
- myList.append(
- AI_Data(
- kernelName,
- calls,
- total_flops / calls,
- valu_flops / calls,
- mfma_flops_f16 / calls,
- mfma_flops_bf16 / calls,
- mfma_flops_f32 / calls,
- mfma_flops_f64 / calls,
- lds_data / calls,
- L1cache_data / calls,
- L2cache_data / calls,
- hbm_data / calls,
- totalDuration,
- avgDuration / calls,
- )
- )
- if verbose >= 2:
- print(
- "Just added {} to AI_Data at index {}. # of calls: {}".format(
- kernelName, index, calls
- )
- )
- total_flops = (
- valu_flops
- ) = (
- mfma_flops_bf16
- ) = (
- mfma_flops_f16
- ) = (
- mfma_iops_i8
- ) = (
- mfma_flops_f32
- ) = (
- mfma_flops_f64
- ) = (
- lds_data
- ) = (
- L1cache_data
- ) = L2cache_data = hbm_data = calls = totalDuration = avgDuration = 0.0
-
- kernelName = row["KernelName"]
- try:
- total_flops += (
- (
- 64
- * (
- row["SQ_INSTS_VALU_ADD_F16"]
- + row["SQ_INSTS_VALU_MUL_F16"]
- + (2 * row["SQ_INSTS_VALU_FMA_F16"])
- + row["SQ_INSTS_VALU_TRANS_F16"]
- )
- )
- + (
- 64
- * (
- row["SQ_INSTS_VALU_ADD_F32"]
- + row["SQ_INSTS_VALU_MUL_F32"]
- + (2 * row["SQ_INSTS_VALU_FMA_F32"])
- + row["SQ_INSTS_VALU_TRANS_F32"]
- )
- )
- + (
- 64
- * (
- row["SQ_INSTS_VALU_ADD_F64"]
- + row["SQ_INSTS_VALU_MUL_F64"]
- + (2 * row["SQ_INSTS_VALU_FMA_F64"])
- + row["SQ_INSTS_VALU_TRANS_F64"]
- )
- )
- + (row["SQ_INSTS_VALU_MFMA_MOPS_F16"] * 512)
- + (row["SQ_INSTS_VALU_MFMA_MOPS_BF16"] * 512)
- + (row["SQ_INSTS_VALU_MFMA_MOPS_F32"] * 512)
- + (row["SQ_INSTS_VALU_MFMA_MOPS_F64"] * 512)
- )
- except KeyError:
- if verbose >= 2:
- print("Skipped total_flops at index {}".format(index))
- pass
- try:
- valu_flops += (
- 64
- * (
- row["SQ_INSTS_VALU_ADD_F16"]
- + row["SQ_INSTS_VALU_MUL_F16"]
- + (2 * row["SQ_INSTS_VALU_FMA_F16"])
- + row["SQ_INSTS_VALU_TRANS_F16"]
- )
- + 64
- * (
- row["SQ_INSTS_VALU_ADD_F32"]
- + row["SQ_INSTS_VALU_MUL_F32"]
- + (2 * row["SQ_INSTS_VALU_FMA_F32"])
- + row["SQ_INSTS_VALU_TRANS_F32"]
- )
- + 64
- * (
- row["SQ_INSTS_VALU_ADD_F64"]
- + row["SQ_INSTS_VALU_MUL_F64"]
- + (2 * row["SQ_INSTS_VALU_FMA_F64"])
- + row["SQ_INSTS_VALU_TRANS_F64"]
- )
- )
- except KeyError:
- if verbose >= 2:
- print("Skipped valu_flops at index {}".format(index))
- pass
-
- try:
- mfma_flops_f16 += row["SQ_INSTS_VALU_MFMA_MOPS_F16"] * 512
- mfma_flops_bf16 += row["SQ_INSTS_VALU_MFMA_MOPS_BF16"] * 512
- mfma_flops_f32 += row["SQ_INSTS_VALU_MFMA_MOPS_F32"] * 512
- mfma_flops_f64 += row["SQ_INSTS_VALU_MFMA_MOPS_F64"] * 512
- mfma_iops_i8 += row["SQ_INSTS_VALU_MFMA_MOPS_I8"] * 512
- except KeyError:
- if verbose >= 2:
- print("Skipped mfma ops at index {}".format(index))
- pass
-
- try:
- lds_data += (
- (row["SQ_LDS_IDX_ACTIVE"] - row["SQ_LDS_BANK_CONFLICT"]) * 4 * L2_BANKS
- ) # L2_BANKS = 32 (since assuming mi200)
- except KeyError:
- if verbose >= 2:
- print("Skipped lds_data at index {}".format(index))
- pass
-
- try:
- L1cache_data += row["TCP_TOTAL_CACHE_ACCESSES_sum"] * 64
- except KeyError:
- if verbose >= 2:
- print("Skipped L1cache_data at index {}".format(index))
- pass
-
- try:
- L2cache_data += (
- row["TCP_TCC_WRITE_REQ_sum"] * 64
- + row["TCP_TCC_ATOMIC_WITH_RET_REQ_sum"] * 64
- + row["TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum"] * 64
- + row["TCP_TCC_READ_REQ_sum"] * 64
- )
- except KeyError:
- if verbose >= 2:
- print("Skipped L2cache_data at index {}".format(index))
- pass
- try:
- hbm_data += (
- (row["TCC_EA_RDREQ_32B_sum"] * 32)
- + ((row["TCC_EA_RDREQ_sum"] - row["TCC_EA_RDREQ_32B_sum"]) * 64)
- + (row["TCC_EA_WRREQ_64B_sum"] * 64)
- + ((row["TCC_EA_WRREQ_sum"] - row["TCC_EA_WRREQ_64B_sum"]) * 32)
- )
- except KeyError:
- if verbose >= 2:
- print("Skipped hbm_data at index {}".format(index))
- pass
-
- totalDuration += row["EndNs"] - row["BeginNs"]
-
- avgDuration += row["EndNs"] - row["BeginNs"]
-
- calls += 1
- if inputs["sort"] == "dispatches":
- myList.append(
- AI_Data(
- kernelName,
- calls,
- total_flops,
- valu_flops,
- mfma_flops_f16,
- mfma_flops_bf16,
- mfma_flops_f32,
- mfma_flops_f64,
- mfma_iops_i8,
- lds_data,
- L1cache_data,
- L2cache_data,
- hbm_data,
- totalDuration,
- avgDuration,
- )
- )
- total_flops = (
- valu_flops
- ) = (
- mfma_flops_bf16
- ) = (
- mfma_flops_f16
- ) = (
- mfma_iops_i8
- ) = (
- mfma_flops_f32
- ) = (
- mfma_flops_f64
- ) = (
- lds_data
- ) = (
- L1cache_data
- ) = L2cache_data = hbm_data = calls = totalDuration = avgDuration = 0.0
-
- myList.sort(key=lambda x: x.totalDuration, reverse=True)
-
- print("Top 10 intensities ('{}')...".format(inputs["sort"]))
- intensities = {"curr_ai_l1": [], "curr_ai_l2": [], "curr_ai_hbm": []}
- curr_perf = []
- i = 0
- # Create list of top 5 intensities
- while i <= 9 and i != len(myList):
- intensities["curr_ai_l1"].append(
- myList[i].total_flops / myList[i].L1cache_data
- ) if myList[i].L1cache_data else intensities["curr_ai_l1"].append(0)
- # print("cur_ai_L1", myList[i].total_flops/myList[i].L1cache_data) if myList[i].L1cache_data else print("null")
- # print()
- intensities["curr_ai_l2"].append(
- myList[i].total_flops / myList[i].L2cache_data
- ) if myList[i].L2cache_data else intensities["curr_ai_l2"].append(0)
- # print("cur_ai_L2", myList[i].total_flops/myList[i].L2cache_data) if myList[i].L2cache_data else print("null")
- # print()
- intensities["curr_ai_hbm"].append(
- myList[i].total_flops / myList[i].hbm_data
- ) if myList[i].hbm_data else intensities["curr_ai_hbm"].append(0)
- # print("cur_ai_hbm", myList[i].total_flops/myList[i].hbm_data) if myList[i].hbm_data else print("null")
- # print()
- curr_perf.append(myList[i].total_flops / myList[i].avgDuration) if myList[
- i
- ].avgDuration else curr_perf.append(0)
- # print("cur_perf", myList[i].total_flops/myList[i].avgDuration) if myList[i].avgDuration else print("null")
-
- i += 1
-
- print(intensities)
-
- plotted_spots = []
- labels = []
- for i in intensities:
- values = intensities[i]
- color = get_color(i)
- x = []
- y = []
- for entryIndx in range(0, len(values)):
- x.append(values[entryIndx])
- y.append(curr_perf[entryIndx])
- myScatter = plt.scatter(x, y, c=color, marker="o")
- plotted_spots.append(myScatter)
- label = i
- labels.append(label)
-
- try:
- pylab.legend(
- plotted_spots,
- labels,
- prop={"size": (FONT_SIZE - 2)},
- bbox_to_anchor=(1.04, 1),
- loc="upper left",
- title="Top {}".format(inputs["sort"]),
- title_fontsize=FONT_SIZE,
- )
- except Exception as e:
- sys.stderr.write(f"{e}\n")
- pylab.legend(
- plotted_spots,
- labels,
- prop={"size": (FONT_SIZE - 2)},
- )
-
-
-def empirical_roof(args):
- soc = args.target
- inputs = {
- "path": str,
- "cmd": str,
- "sort": str,
- "mem": str,
- "axes": list,
- "device": int,
- # "workgroups": int,
- # "wsize": int,
- # "dataset": int,
- # "experiments": int,
- # "iter": int
- }
-
- inputs["sort"] = args.sort.lower()
- inputs["mem"] = args.mem_level.upper()
-
- if inputs["sort"] != "kernels" and inputs["sort"] != "dispatches":
- sys.exit("Invalid sort. Must be either 'kernels' or 'dispatches'")
- if (
- inputs["mem"] != "HBM"
- and inputs["mem"] != "VL1D"
- and inputs["mem"] != "L2"
- and inputs["mem"] != "LDS"
- and inputs["mem"] != "ALL"
- ):
- sys.exit(
- "Invalid mem-level. Must be one of these option 'LDS', 'L2', 'vL1D', or 'HBM'"
- )
- if inputs["mem"] == "VL1D":
- inputs["mem"] = "L1"
-
- inputs["device"] = int(args.device)
- # inputs["workgroups"] = int(args.workgroups)
- # inputs["wsize"] = int(args.wsize)
- # inputs["dataset"] = int(args.dataset)
- # inputs["experiments"] = int(args.experiments)
- # inputs["iter"] = int(args.iter)
- inputs["path"] = args.path
- inputs["cmd"] = args.remaining
- inputs["axes"] = args.axes
-
- # device_list = [int(item) for item in args.device.split(',')]
-
- if soc not in SUPPORTED_SOC:
- sys.exit("SoC not yet supported for Roofline Analysis")
-
- # Basic Info
- print("Path: ", inputs["path"])
- print("Target: ", soc)
- print("Memory Level: ", inputs["mem"])
-
- roofPath = inputs["path"] + "/roofline.csv"
- # -----------------------------------------------------
- # Initialize roofline data dictionary from roofline.csv
- # -----------------------------------------------------
- roof_data = (
- {}
- ) # TODO: consider changing this to an ordered dict for consistency over py versions
- headers = []
- with open(roofPath, "r") as csvfile:
- csvReader = csv.reader(csvfile, delimiter=",")
- rowCount = 0
- for row in csvReader:
- row.pop(0) # remove devID
- if rowCount == 0:
- headers = row
- for i in headers:
- roof_data[i] = []
- else:
- for i, key in enumerate(headers):
- roof_data[key].append(row[i])
-
- rowCount += 1
- csvfile.close()
-
- # Initalize plot
- f = plt.figure(figsize=(1600 / 100, 1200 / 100), dpi=100)
- f.add_subplot(111)
-
- _title_font = get_font()
- _title_font["size"] += 8
-
- plt.title("Empirical Roofline", **_title_font)
- plt.xlabel("Arithmetic Intensity (FLOP/Byte)", **get_font())
- plt.ylabel("Performance (GFLOP/sec)", **get_font())
- plt.grid(True, which="major", ls="--", lw=1)
- plt.grid(True, which="minor", ls="--", lw=0.5)
- plt.yscale("log")
- plt.xscale("log")
- # Adjust axes if instructed
- if inputs["axes"]:
- plt.xlim(inputs["axes"][0], inputs["axes"][1])
- plt.ylim(inputs["axes"][2], inputs["axes"][3])
-
- # ------------------
- # Generate Roofline
- # ------------------
- dtype = plot_roof(inputs, roof_data) # Also returns chosen dtype
- plot_application(inputs, args.verbose)
-
- if inputs["device"] == -1:
- dev_id = "ALL"
- else:
- dev_id = str(inputs["device"])
-
- filename = IMGNAME + "_gpu-" + dev_id + "_{}".format(dtype) + ".pdf"
-
- full_path = os.path.abspath(inputs["path"])
- path_to_output = full_path + "/" + filename
-
- print('Saving plot: "{}"...'.format(filename))
- plt.savefig(path_to_output, bbox_inches="tight", format="pdf")
- print('File saved to: "{}"'.format(path_to_output))
- plt.close()