Re-enable standalone roof options. Everthing working.

Signed-off-by: coleramos425 <colramos@amd.com>
Bu işleme şunda yer alıyor:
coleramos425
2023-01-26 14:07:10 -06:00
ebeveyn 5d27fe09ca
işleme b6d6c05e7b
6 değiştirilmiş dosya ile 86 ekleme ve 761 silme
+1 -1
Dosyayı Görüntüle
@@ -649,7 +649,7 @@ def main():
# Setup prerequisits for roofline
roof_setup(args, my_parser)
# Generate roofline
roofline_only(args.path, args.verbose, args.device)
roofline_only(args.path, args.device, args.sort, args.mem_level, args.verbose)
# Profile only
else:
+15 -2
Dosyayı Görüntüle
@@ -210,10 +210,15 @@ def run_cli(args, runs):
)
def roofline_only(path_to_dir, verbose, dev_id):
def roofline_only(path_to_dir, dev_id, sort_type, mem_level, verbose):
import pandas as pd
from collections import OrderedDict
# Change vL1D to a interpretable str, if required
if "vL1D" in mem_level:
mem_level.remove("vL1D")
mem_level.append("L1")
app_path = path_to_dir + "/pmc_perf.csv"
roofline_exists = os.path.isfile(app_path)
if not roofline_exists:
@@ -221,7 +226,15 @@ def roofline_only(path_to_dir, verbose, dev_id):
sys.exit(0)
t_df = OrderedDict()
t_df["pmc_perf"] = pd.read_csv(app_path)
get_roofline(path_to_dir, t_df, dev_id, verbose, True)
get_roofline(
path_to_dir,
t_df,
verbose,
dev_id, # [Optional] Specify device id to collect roofline info from
sort_type, # [Optional] Sort AI by top kernels or dispatches
mem_level, # [Optional] Toggle particular level(s) of memory hierarchy
True, # [Optional] Generate a standalone roofline analysis
)
def analyze(args):
+59 -70
Dosyayı Görüntüle
@@ -22,6 +22,7 @@
from omniperf_analyze.utils import roofline_calc
import time
import numpy as np
from dash import html, dash_table
@@ -35,72 +36,40 @@ def to_int(a):
else:
return int(a)
def generate_plots(roof_info, ai_data, isStandalone, verbose, fig=None):
def generate_plots(roof_info, ai_data, mem_level, is_standalone, verbose, fig=None):
if fig is None:
fig = go.Figure()
plotMode = "lines+text" if isStandalone else "lines"
line_data = roofline_calc.empirical_roof(roof_info, verbose)
plotMode = "lines+text" if is_standalone else "lines"
line_data = roofline_calc.empirical_roof(roof_info, mem_level, verbose)
print("Line data:\n", line_data)
#######################
# Plot BW Lines
#######################
fig.add_trace(
go.Scatter(
x=line_data["hbm"][0],
y=line_data["hbm"][1],
name="HBM-{}".format(roof_info["dtype"]),
mode=plotMode,
hovertemplate="<b>%{text}</b>",
text=[
"{} GB/s".format(to_int(line_data["hbm"][2])),
None if isStandalone else "{} GB/s".format(to_int(line_data["hbm"][2]))
],
textposition="top right",
if mem_level == "ALL":
cacheHierarchy = ["HBM", "L2", "L1", "LDS"]
else:
cacheHierarchy = mem_level
for cacheLevel in cacheHierarchy:
fig.add_trace(
go.Scatter(
x=line_data[cacheLevel.lower()][0],
y=line_data[cacheLevel.lower()][1],
name="{}-{}".format(cacheLevel, roof_info["dtype"]),
mode=plotMode,
hovertemplate="<b>%{text}</b>",
text=[
"{} GB/s".format(to_int(line_data[cacheLevel.lower()][2])),
None
if is_standalone
else "{} GB/s".format(to_int(line_data[cacheLevel.lower()][2])),
],
textposition="top right",
)
)
)
fig.add_trace(
go.Scatter(
x=line_data["l2"][0],
y=line_data["l2"][1],
name="L2-{}".format(roof_info["dtype"]),
mode=plotMode,
hovertemplate="<b>%{text}</b>",
text=[
"{} GB/s".format(to_int(line_data["l2"][2])),
None if isStandalone else "{} GB/s".format(to_int(line_data["l2"][2]))
],
textposition="top right",
)
)
fig.add_trace(
go.Scatter(
x=line_data["l1"][0],
y=line_data["l1"][1],
name="L1-{}".format(roof_info["dtype"]),
mode=plotMode,
hovertemplate="<b>%{text}</b>",
text=[
"{} GB/s".format(to_int(line_data["l1"][2])),
None if isStandalone else "{} GB/s".format(to_int(line_data["l1"][2]))
],
textposition="top right",
)
)
fig.add_trace(
go.Scatter(
x=line_data["lds"][0],
y=line_data["lds"][1],
name="LDS-{}".format(roof_info["dtype"]),
mode=plotMode,
hovertemplate="<b>%{text}</b>",
text=[
"{} GB/s".format(to_int(line_data["lds"][2])),
None if isStandalone else "{} GB/s".format(to_int(line_data["lds"][2]))
],
textposition="top right",
)
)
if roof_info["dtype"] != "FP16" and roof_info["dtype"] != "I8":
fig.add_trace(
go.Scatter(
@@ -110,7 +79,9 @@ def generate_plots(roof_info, ai_data, isStandalone, verbose, fig=None):
mode=plotMode,
hovertemplate="<b>%{text}</b>",
text=[
None if isStandalone else "{} GFLOP/s".format(to_int(line_data["valu"][2])),
None
if is_standalone
else "{} GFLOP/s".format(to_int(line_data["valu"][2])),
"{} GFLOP/s".format(to_int(line_data["valu"][2])),
],
textposition="top left",
@@ -129,7 +100,9 @@ def generate_plots(roof_info, ai_data, isStandalone, verbose, fig=None):
mode=plotMode,
hovertemplate="<b>%{text}</b>",
text=[
None if isStandalone else "{} GFLOP/s".format(to_int(line_data["mfma"][2])),
None
if is_standalone
else "{} GFLOP/s".format(to_int(line_data["mfma"][2])),
"{} GFLOP/s".format(to_int(line_data["mfma"][2])),
],
textposition=pos,
@@ -176,26 +149,33 @@ def generate_plots(roof_info, ai_data, isStandalone, verbose, fig=None):
return fig
def get_roofline(path_to_dir, ret_df, verbose, dev_id=None, isStandalone=False):
def get_roofline(
path_to_dir,
ret_df,
verbose,
dev_id=None,
sort_type="kernels",
mem_level="ALL",
is_standalone=False,
):
# Roofline settings
# TODO: Make "sort" attribute dynamic so user can select desired sort
fp32_details = {
"path": path_to_dir,
"sort": "kernels",
"sort": sort_type,
"device": 0,
"dtype": "FP32",
}
fp16_details = {
"path": path_to_dir,
"sort": "kernels",
"sort": sort_type,
"device": 0,
"dtype": "FP16",
}
int8_details = {"path": path_to_dir, "sort": "kernels", "device": 0, "dtype": "I8"}
int8_details = {"path": path_to_dir, "sort": sort_type, "device": 0, "dtype": "I8"}
# Generate roofline plots
print("Path: ", path_to_dir)
ai_data = roofline_calc.plot_application("kernels", ret_df, verbose)
ai_data = roofline_calc.plot_application(sort_type, ret_df, verbose)
if verbose >= 1:
# print AI data for each mem level
print("AI at each mem level")
@@ -203,17 +183,26 @@ def get_roofline(path_to_dir, ret_df, verbose, dev_id=None, isStandalone=False):
print(i, "->", ai_data[i])
print("\n")
fp32_fig = generate_plots(fp32_details, ai_data, isStandalone, verbose)
fp16_fig = generate_plots(fp16_details, ai_data, isStandalone, verbose)
ml_combo_fig = generate_plots(int8_details, ai_data, isStandalone, verbose, fp16_fig)
fp32_fig = generate_plots(fp32_details, ai_data, mem_level, is_standalone, verbose)
fp16_fig = generate_plots(fp16_details, ai_data, mem_level, is_standalone, verbose)
ml_combo_fig = generate_plots(
int8_details, ai_data, mem_level, is_standalone, verbose, fp16_fig
)
if isStandalone:
if is_standalone:
dev_id = "ALL" if dev_id == -1 else str(dev_id)
fp32_fig.write_image(path_to_dir + "/empirRoof_gpu-{}_fp32.pdf".format(dev_id))
ml_combo_fig.write_image(
path_to_dir + "/empirRoof_gpu-{}_fp8_fp16.pdf".format(dev_id)
)
time.sleep(1)
# Re-save to remove loading MathJax pop up
fp32_fig.write_image(path_to_dir + "/empirRoof_gpu-{}_fp32.pdf".format(dev_id))
ml_combo_fig.write_image(
path_to_dir + "/empirRoof_gpu-{}_fp8_fp16.pdf".format(dev_id)
)
print("Empirical Roofline PDFs saved!")
else:
return html.Section(
id="roofline",
+9 -7
Dosyayı Görüntüle
@@ -56,6 +56,7 @@ class AI_Data:
mfma_flops_bf16: float
mfma_flops_f32: float
mfma_flops_f64: float
mfma_iops_i8: float
lds_data: float
L1cache_data: float
L2cache_data: float
@@ -88,11 +89,14 @@ def get_color(catagory):
# -------------------------------------------------------------------------------------
# Plot BW at each cache level
# -------------------------------------------------------------------------------------
def plot_roof(roof_details, roof_data, verbose):
def plot_roof(roof_details, roof_data, mem_level, verbose):
# TODO: This is where filtering by memory level will need to occur for standalone
graphPoints = {"hbm": [], "l2": [], "l1": [], "lds": [], "valu": [], "mfma": []}
cacheHierarchy = ["HBM", "L2", "L1", "LDS"]
if mem_level == "ALL":
cacheHierarchy = ["HBM", "L2", "L1", "LDS"]
else:
cacheHierarchy = mem_level
x1 = y1 = x2 = y2 = -1
x1_mfma = y1_mfma = x2_mfma = y2_mfma = -1
@@ -223,6 +227,7 @@ def plot_application(sortType, ret_df, verbose):
mfma_flops_bf16 / calls,
mfma_flops_f32 / calls,
mfma_flops_f64 / calls,
mfma_iops_i8 / calls,
lds_data / calls,
L1cache_data / calls,
L2cache_data / calls,
@@ -466,10 +471,7 @@ def plot_application(sortType, ret_df, verbose):
return intensityPoints
def empirical_roof(roof_info, verbose):
if roof_info["sort"] != "kernels" and roof_info["sort"] != "dispatches":
sys.exit("Invalid sort. Must be either 'kernels' or 'dispatches'")
def empirical_roof(roof_info, mem_level, verbose):
roofPath = roof_info["path"] + "/roofline.csv"
# -----------------------------------------------------
@@ -509,7 +511,7 @@ def empirical_roof(roof_info, verbose):
# ------------------
# Generate Roofline
# ------------------
results = plot_roof(roof_info, roof_data, verbose)
results = plot_roof(roof_info, roof_data, mem_level, verbose)
# for key in results:
# print(key, "->", results[key])
+2 -9
Dosyayı Görüntüle
@@ -211,6 +211,7 @@ def parse(my_parser):
metavar="",
type=str,
default="kernels",
choices=["kernels", "dispatches"],
help="\t\t\tOverlay top kernels or top dispatches: (DEFAULT: kernels)\n\t\t\t kernels\n\t\t\t dispatches",
)
roofline_group.add_argument(
@@ -219,19 +220,11 @@ def parse(my_parser):
required=False,
choices=["HBM", "L2", "vL1D", "LDS"],
metavar="",
nargs="+",
type=str,
default="ALL",
help="\t\t\tFilter by memory level: (DEFAULT: ALL)\n\t\t\t HBM\n\t\t\t L2\n\t\t\t vL1D\n\t\t\t LDS",
)
roofline_group.add_argument(
"--axes",
default=None,
type=float,
required=False,
nargs="+",
metavar="",
help="\t\t\tDesired axis values for graph. As follows:\n\t\t\t xmin xmax ymin ymax",
)
roofline_group.add_argument(
"--device",
metavar="",
-672
Dosyayı Görüntüle
@@ -1,672 +0,0 @@
################################################################################
# Copyright (c) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
################################################################################
from linecache import cache
import os
import sys
from pathlib import Path
import numpy
import matplotlib
try:
import matplotlib.pyplot as plt
except ImportError:
# other non-interactive options:
# cairo, pdf, pgf, ps, svg, template
matplotlib.use("agg", force=True)
import matplotlib.pyplot as plt
from matplotlib.pyplot import get, text
from math import log, pi, sqrt
import pandas as pd
import pylab
from dataclasses import dataclass
import csv
################################################
# Global vars
################################################
IMGNAME = "empirRoof"
L2_BANKS = 32 # default assuming mi200
XMIN = 0.01
XMAX = 1000
FONT_SIZE = 16
FONT_COLOR = "black"
FONT_WEIGHT = "bold"
SUPPORTED_SOC = ["mi200"]
################################################
# Helper funcs
################################################
@dataclass
class AI_Data:
KernelName: str
numCalls: float
total_flops: float
valu_flops: float
mfma_flops_f16: float
mfma_flops_bf16: float
mfma_flops_f32: float
mfma_flops_f64: float
lds_data: float
L1cache_data: float
L2cache_data: float
hbm_data: float
totalDuration: float
avgDuration: float
def get_font():
return {
"size": FONT_SIZE,
"color": FONT_COLOR,
"weight": FONT_WEIGHT,
"family": "serif",
}
def get_color(catagory):
if catagory == "curr_ai_l1":
return "green"
elif catagory == "curr_ai_l2":
return "blue"
elif catagory == "curr_ai_hbm":
return "red"
else:
raise RuntimeError("Invalid catagory passed to get_color()")
# -------------------------------------------------------------------------------------
# Plot BW at each cache level
# -------------------------------------------------------------------------------------
def plot_roof(inputs, roof_data):
cacheHierarchy = []
if inputs["mem"] == "ALL":
cacheHierarchy += ["HBM", "L2", "L1", "LDS"]
else:
cacheHierarchy.append(inputs["mem"])
targ_dtype = (
"FP32"
if float(roof_data["FP32Flops"][0]) > float(roof_data["FP64Flops"][0])
else "FP64"
)
print("Dtype: ", targ_dtype)
print(inputs["mem"])
x1 = y1 = x2 = y2 = -1
x1_mfma = y1_mfma = x2_mfma = y2_mfma = -1
target_precision = targ_dtype[2:]
peakOps = float(roof_data[targ_dtype + "Flops"][0])
for i in range(0, len(cacheHierarchy)):
# Plot BW line
# print("Current cache level: {}".format(cacheHierarchy[i]))
curr_bw = cacheHierarchy[i] + "Bw"
peakBw = float(roof_data[curr_bw][0])
peakMFMA = float(roof_data["MFMAF{}Flops".format(target_precision)][0])
x1 = float(XMIN)
y1 = float(XMIN) * peakBw
x2 = peakOps / peakBw
y2 = peakOps
plt.plot([x1, x2], [y1, y2], color="magenta")
# print("Mem Points: [{}, {}], [{}, {}]".format(x1, x2, y1, y2))
# Plot MFMA lines (NOTE: Assuming MI200 soc)
x1_mfma = peakOps / peakBw
y1_mfma = peakOps
x2_mfma = peakMFMA / peakBw
y2_mfma = peakMFMA
plt.plot([x1_mfma, x2_mfma], [y1_mfma, y2_mfma], color="blue")
# print("Extend BW Points: [{}, {}], [{}, {}]".format(x1_mfma, x2_mfma, y1_mfma, y2_mfma))
# These are the points to use:
# print("x = [{}, {}]".format(x1,x2_mfma))
# print("y = [{}, {}]".format(y1, y2_mfma))
# Plot BW label
x1log = log(x1) / log(10)
x2log = log(x2) / log(10)
y1log = log(y1) / log(10)
y2log = log(y2) / log(10)
x_text = 10 ** ((x1log + x2log) / 2)
y_text = 10 ** ((y1log + y2log) / 2)
fig = plt.gcf()
size = fig.get_size_inches() * fig.dpi
fig_x, fig_y = size
# dx = log(x2) - log(x1)
# dy = log(y2) - log(y1)
# x_min, x_max = plt.xlim()
# y_min, y_max = plt.ylim()
# Dx = dx * fig_x / (log(x_max) - log(x_min))
# Dy = dy * fig_y / (log(y_max) - log(y_min))
# #fdiv = 0.7 #TODO: improve accuracy of text angle (tilt)
# angle = (180.0 / pi) * numpy.arctan(Dy / Dx )#/fdiv)
dx = abs(log(x2) - log(x1))
dy = abs(log(y2) - log(y1))
angle = (180.0 / pi) * numpy.arctan(dy / dx)
# If user isn't zooming in, print bw labels normally
if not inputs["axes"]:
text(
x_text,
y_text,
"{} vL1D GB/s".format(int(peakBw))
if cacheHierarchy[i].upper() == "L1"
else "{} {} GB/s".format(int(peakBw), cacheHierarchy[i].upper()),
rotation=angle,
rotation_mode="anchor",
**get_font(),
)
else:
# if bw line isn't being cut out then plot bw
print("if {} < {}".format(inputs["axes"][0], 10**x2log))
if inputs["axes"][0] < 10**x2log:
text(
10**x2log,
10**y2log,
"{} {} GB/s".format(int(peakBw), cacheHierarchy[i].upper()),
rotation=angle,
rotation_mode="anchor",
**get_font(),
)
# -------------------------------------------------------------------------------------
# Plot computing roof
# -------------------------------------------------------------------------------------
# Plot FMA roof
x0 = XMAX
if x2 < x0:
x0 = x2
temp_label = "{} VALU GFLOP/sec".format(int(peakOps))
plt.plot([x0, XMAX], [peakOps, peakOps], color="magenta")
# print("FMA Points: [{}, {}], [{},{}]".format(x0, XMAX, peakOps, peakOps))
text(
XMAX if not inputs["axes"] else inputs["axes"][1],
peakOps - 4000, # should i keep this fixed at 4000?
temp_label,
horizontalalignment="right",
**get_font(),
)
# Plot MFMA roof
if x1_mfma != -1: # assert that mfma has been assigned
x0_mfma = XMAX
if x2_mfma < x0_mfma:
x0_mfma = x2_mfma
peakMFMA = float(roof_data["MFMAF{}Flops".format(target_precision)][0])
temp_label = "{} MFMA GFLOP/sec".format(int(peakMFMA))
plt.plot([x0_mfma, XMAX], [peakMFMA, peakMFMA], color="blue")
# print("MFMA Points: [{}, {}], [{},{}]".format(x0_mfma, XMAX, peakMFMA, peakMFMA))
text(
XMAX if not inputs["axes"] else inputs["axes"][1],
peakMFMA + 1000,
temp_label,
horizontalalignment="right",
**get_font(),
)
return targ_dtype
# -------------------------------------------------------------------------------------
# Overlay application performance
# -------------------------------------------------------------------------------------
# Calculate relevent metrics for ai calculation
def plot_application(inputs, verbose):
df = pd.read_csv(inputs["path"] + "/pmc_perf.csv")
# Sort by top kernels or top dispatches?
df = df.sort_values(by=["KernelName"])
df = df.reset_index(drop=True)
total_flops = (
valu_flops
) = (
mfma_flops_bf16
) = (
mfma_flops_f16
) = (
mfma_iops_i8
) = (
mfma_flops_f32
) = (
mfma_flops_f64
) = (
lds_data
) = L1cache_data = L2cache_data = hbm_data = calls = totalDuration = avgDuration = 0.0
kernelName = ""
myList = []
for index, row in df.iterrows():
# CASE: Top kernels
if inputs["sort"] == "kernels" and (
(row["KernelName"] != kernelName and kernelName != "")
or index == df.shape[0] - 1
):
if df.shape[0] - 1 == index:
calls += 1
myList.append(
AI_Data(
kernelName,
calls,
total_flops / calls,
valu_flops / calls,
mfma_flops_f16 / calls,
mfma_flops_bf16 / calls,
mfma_flops_f32 / calls,
mfma_flops_f64 / calls,
lds_data / calls,
L1cache_data / calls,
L2cache_data / calls,
hbm_data / calls,
totalDuration,
avgDuration / calls,
)
)
if verbose >= 2:
print(
"Just added {} to AI_Data at index {}. # of calls: {}".format(
kernelName, index, calls
)
)
total_flops = (
valu_flops
) = (
mfma_flops_bf16
) = (
mfma_flops_f16
) = (
mfma_iops_i8
) = (
mfma_flops_f32
) = (
mfma_flops_f64
) = (
lds_data
) = (
L1cache_data
) = L2cache_data = hbm_data = calls = totalDuration = avgDuration = 0.0
kernelName = row["KernelName"]
try:
total_flops += (
(
64
* (
row["SQ_INSTS_VALU_ADD_F16"]
+ row["SQ_INSTS_VALU_MUL_F16"]
+ (2 * row["SQ_INSTS_VALU_FMA_F16"])
+ row["SQ_INSTS_VALU_TRANS_F16"]
)
)
+ (
64
* (
row["SQ_INSTS_VALU_ADD_F32"]
+ row["SQ_INSTS_VALU_MUL_F32"]
+ (2 * row["SQ_INSTS_VALU_FMA_F32"])
+ row["SQ_INSTS_VALU_TRANS_F32"]
)
)
+ (
64
* (
row["SQ_INSTS_VALU_ADD_F64"]
+ row["SQ_INSTS_VALU_MUL_F64"]
+ (2 * row["SQ_INSTS_VALU_FMA_F64"])
+ row["SQ_INSTS_VALU_TRANS_F64"]
)
)
+ (row["SQ_INSTS_VALU_MFMA_MOPS_F16"] * 512)
+ (row["SQ_INSTS_VALU_MFMA_MOPS_BF16"] * 512)
+ (row["SQ_INSTS_VALU_MFMA_MOPS_F32"] * 512)
+ (row["SQ_INSTS_VALU_MFMA_MOPS_F64"] * 512)
)
except KeyError:
if verbose >= 2:
print("Skipped total_flops at index {}".format(index))
pass
try:
valu_flops += (
64
* (
row["SQ_INSTS_VALU_ADD_F16"]
+ row["SQ_INSTS_VALU_MUL_F16"]
+ (2 * row["SQ_INSTS_VALU_FMA_F16"])
+ row["SQ_INSTS_VALU_TRANS_F16"]
)
+ 64
* (
row["SQ_INSTS_VALU_ADD_F32"]
+ row["SQ_INSTS_VALU_MUL_F32"]
+ (2 * row["SQ_INSTS_VALU_FMA_F32"])
+ row["SQ_INSTS_VALU_TRANS_F32"]
)
+ 64
* (
row["SQ_INSTS_VALU_ADD_F64"]
+ row["SQ_INSTS_VALU_MUL_F64"]
+ (2 * row["SQ_INSTS_VALU_FMA_F64"])
+ row["SQ_INSTS_VALU_TRANS_F64"]
)
)
except KeyError:
if verbose >= 2:
print("Skipped valu_flops at index {}".format(index))
pass
try:
mfma_flops_f16 += row["SQ_INSTS_VALU_MFMA_MOPS_F16"] * 512
mfma_flops_bf16 += row["SQ_INSTS_VALU_MFMA_MOPS_BF16"] * 512
mfma_flops_f32 += row["SQ_INSTS_VALU_MFMA_MOPS_F32"] * 512
mfma_flops_f64 += row["SQ_INSTS_VALU_MFMA_MOPS_F64"] * 512
mfma_iops_i8 += row["SQ_INSTS_VALU_MFMA_MOPS_I8"] * 512
except KeyError:
if verbose >= 2:
print("Skipped mfma ops at index {}".format(index))
pass
try:
lds_data += (
(row["SQ_LDS_IDX_ACTIVE"] - row["SQ_LDS_BANK_CONFLICT"]) * 4 * L2_BANKS
) # L2_BANKS = 32 (since assuming mi200)
except KeyError:
if verbose >= 2:
print("Skipped lds_data at index {}".format(index))
pass
try:
L1cache_data += row["TCP_TOTAL_CACHE_ACCESSES_sum"] * 64
except KeyError:
if verbose >= 2:
print("Skipped L1cache_data at index {}".format(index))
pass
try:
L2cache_data += (
row["TCP_TCC_WRITE_REQ_sum"] * 64
+ row["TCP_TCC_ATOMIC_WITH_RET_REQ_sum"] * 64
+ row["TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum"] * 64
+ row["TCP_TCC_READ_REQ_sum"] * 64
)
except KeyError:
if verbose >= 2:
print("Skipped L2cache_data at index {}".format(index))
pass
try:
hbm_data += (
(row["TCC_EA_RDREQ_32B_sum"] * 32)
+ ((row["TCC_EA_RDREQ_sum"] - row["TCC_EA_RDREQ_32B_sum"]) * 64)
+ (row["TCC_EA_WRREQ_64B_sum"] * 64)
+ ((row["TCC_EA_WRREQ_sum"] - row["TCC_EA_WRREQ_64B_sum"]) * 32)
)
except KeyError:
if verbose >= 2:
print("Skipped hbm_data at index {}".format(index))
pass
totalDuration += row["EndNs"] - row["BeginNs"]
avgDuration += row["EndNs"] - row["BeginNs"]
calls += 1
if inputs["sort"] == "dispatches":
myList.append(
AI_Data(
kernelName,
calls,
total_flops,
valu_flops,
mfma_flops_f16,
mfma_flops_bf16,
mfma_flops_f32,
mfma_flops_f64,
mfma_iops_i8,
lds_data,
L1cache_data,
L2cache_data,
hbm_data,
totalDuration,
avgDuration,
)
)
total_flops = (
valu_flops
) = (
mfma_flops_bf16
) = (
mfma_flops_f16
) = (
mfma_iops_i8
) = (
mfma_flops_f32
) = (
mfma_flops_f64
) = (
lds_data
) = (
L1cache_data
) = L2cache_data = hbm_data = calls = totalDuration = avgDuration = 0.0
myList.sort(key=lambda x: x.totalDuration, reverse=True)
print("Top 10 intensities ('{}')...".format(inputs["sort"]))
intensities = {"curr_ai_l1": [], "curr_ai_l2": [], "curr_ai_hbm": []}
curr_perf = []
i = 0
# Create list of top 5 intensities
while i <= 9 and i != len(myList):
intensities["curr_ai_l1"].append(
myList[i].total_flops / myList[i].L1cache_data
) if myList[i].L1cache_data else intensities["curr_ai_l1"].append(0)
# print("cur_ai_L1", myList[i].total_flops/myList[i].L1cache_data) if myList[i].L1cache_data else print("null")
# print()
intensities["curr_ai_l2"].append(
myList[i].total_flops / myList[i].L2cache_data
) if myList[i].L2cache_data else intensities["curr_ai_l2"].append(0)
# print("cur_ai_L2", myList[i].total_flops/myList[i].L2cache_data) if myList[i].L2cache_data else print("null")
# print()
intensities["curr_ai_hbm"].append(
myList[i].total_flops / myList[i].hbm_data
) if myList[i].hbm_data else intensities["curr_ai_hbm"].append(0)
# print("cur_ai_hbm", myList[i].total_flops/myList[i].hbm_data) if myList[i].hbm_data else print("null")
# print()
curr_perf.append(myList[i].total_flops / myList[i].avgDuration) if myList[
i
].avgDuration else curr_perf.append(0)
# print("cur_perf", myList[i].total_flops/myList[i].avgDuration) if myList[i].avgDuration else print("null")
i += 1
print(intensities)
plotted_spots = []
labels = []
for i in intensities:
values = intensities[i]
color = get_color(i)
x = []
y = []
for entryIndx in range(0, len(values)):
x.append(values[entryIndx])
y.append(curr_perf[entryIndx])
myScatter = plt.scatter(x, y, c=color, marker="o")
plotted_spots.append(myScatter)
label = i
labels.append(label)
try:
pylab.legend(
plotted_spots,
labels,
prop={"size": (FONT_SIZE - 2)},
bbox_to_anchor=(1.04, 1),
loc="upper left",
title="Top {}".format(inputs["sort"]),
title_fontsize=FONT_SIZE,
)
except Exception as e:
sys.stderr.write(f"{e}\n")
pylab.legend(
plotted_spots,
labels,
prop={"size": (FONT_SIZE - 2)},
)
def empirical_roof(args):
soc = args.target
inputs = {
"path": str,
"cmd": str,
"sort": str,
"mem": str,
"axes": list,
"device": int,
# "workgroups": int,
# "wsize": int,
# "dataset": int,
# "experiments": int,
# "iter": int
}
inputs["sort"] = args.sort.lower()
inputs["mem"] = args.mem_level.upper()
if inputs["sort"] != "kernels" and inputs["sort"] != "dispatches":
sys.exit("Invalid sort. Must be either 'kernels' or 'dispatches'")
if (
inputs["mem"] != "HBM"
and inputs["mem"] != "VL1D"
and inputs["mem"] != "L2"
and inputs["mem"] != "LDS"
and inputs["mem"] != "ALL"
):
sys.exit(
"Invalid mem-level. Must be one of these option 'LDS', 'L2', 'vL1D', or 'HBM'"
)
if inputs["mem"] == "VL1D":
inputs["mem"] = "L1"
inputs["device"] = int(args.device)
# inputs["workgroups"] = int(args.workgroups)
# inputs["wsize"] = int(args.wsize)
# inputs["dataset"] = int(args.dataset)
# inputs["experiments"] = int(args.experiments)
# inputs["iter"] = int(args.iter)
inputs["path"] = args.path
inputs["cmd"] = args.remaining
inputs["axes"] = args.axes
# device_list = [int(item) for item in args.device.split(',')]
if soc not in SUPPORTED_SOC:
sys.exit("SoC not yet supported for Roofline Analysis")
# Basic Info
print("Path: ", inputs["path"])
print("Target: ", soc)
print("Memory Level: ", inputs["mem"])
roofPath = inputs["path"] + "/roofline.csv"
# -----------------------------------------------------
# Initialize roofline data dictionary from roofline.csv
# -----------------------------------------------------
roof_data = (
{}
) # TODO: consider changing this to an ordered dict for consistency over py versions
headers = []
with open(roofPath, "r") as csvfile:
csvReader = csv.reader(csvfile, delimiter=",")
rowCount = 0
for row in csvReader:
row.pop(0) # remove devID
if rowCount == 0:
headers = row
for i in headers:
roof_data[i] = []
else:
for i, key in enumerate(headers):
roof_data[key].append(row[i])
rowCount += 1
csvfile.close()
# Initalize plot
f = plt.figure(figsize=(1600 / 100, 1200 / 100), dpi=100)
f.add_subplot(111)
_title_font = get_font()
_title_font["size"] += 8
plt.title("Empirical Roofline", **_title_font)
plt.xlabel("Arithmetic Intensity (FLOP/Byte)", **get_font())
plt.ylabel("Performance (GFLOP/sec)", **get_font())
plt.grid(True, which="major", ls="--", lw=1)
plt.grid(True, which="minor", ls="--", lw=0.5)
plt.yscale("log")
plt.xscale("log")
# Adjust axes if instructed
if inputs["axes"]:
plt.xlim(inputs["axes"][0], inputs["axes"][1])
plt.ylim(inputs["axes"][2], inputs["axes"][3])
# ------------------
# Generate Roofline
# ------------------
dtype = plot_roof(inputs, roof_data) # Also returns chosen dtype
plot_application(inputs, args.verbose)
if inputs["device"] == -1:
dev_id = "ALL"
else:
dev_id = str(inputs["device"])
filename = IMGNAME + "_gpu-" + dev_id + "_{}".format(dtype) + ".pdf"
full_path = os.path.abspath(inputs["path"])
path_to_output = full_path + "/" + filename
print('Saving plot: "{}"...'.format(filename))
plt.savefig(path_to_output, bbox_inches="tight", format="pdf")
print('File saved to: "{}"'.format(path_to_output))
plt.close()