Merge pull request #99 from AMDResearch/dev

Updates for v1.0.8-PR1
Этот коммит содержится в:
Cole Ramos
2023-03-13 15:35:45 -05:00
коммит произвёл GitHub
родитель 0705db024c 56ecd59203
Коммит 281b577fc7
8 изменённых файлов: 208 добавлений и 144 удалений
+3 -3
Просмотреть файл
@@ -1,12 +1,12 @@
## How to fork from us
To keep our development fast and conflict free, we recommend you to [fork](https://github.com/AMDResearch/omniperf/fork) our repository and start your work from our `develop` branch in your private repository.
To keep our development fast and conflict free, we recommend that you [fork](https://github.com/AMDResearch/omniperf/fork) our repository and start your work from our `dev` branch in your private repository.
Afterwards, git clone your repository to your local machine. But that is not it! To keep track of the original develop repository, add it as another remote.
```
git remote add mainline https://github.com/AMDResearch/omniperf.git
git checkout develop
git checkout dev
```
As always in git, start a new branch with
@@ -31,7 +31,7 @@ and apply your changes there.
- Ensure the PR description clearly describes the problem and solution. If there is an existing GitHub issue open describing this bug, please include it in the description so we can close it.
- Ensure the PR is based on the `develop` branch of the Omniperf GitHub repository.
- Ensure the PR is based on the `dev` branch of the Omniperf GitHub repository.
- Omniperf requires new commits to include a "Signed-off-by" token in the commit message (typically enabled via the `git commit -s` option), indicating your agreement to the project's [Developer's Certificate of Origin](https://developercertificate.org/) and compatibility with the project [LICENSE](https://github.com/AMDResearch/omniperf/blob/main/LICENSE):
+1 -1
Просмотреть файл
@@ -715,7 +715,7 @@ def main():
# Set up prerequisites for roofline
roof_setup(args, my_parser, VER)
# Generate roofline
roofline_only(args.path, args.device, args.sort, args.mem_level, args.verbose)
roofline_only(args.path, args.device, args.sort, args.mem_level, args.kernel_names, args.verbose)
# Profile only
else:
+2 -1
Просмотреть файл
@@ -212,7 +212,7 @@ def run_cli(args, runs):
)
def roofline_only(path_to_dir, dev_id, sort_type, mem_level, verbose):
def roofline_only(path_to_dir, dev_id, sort_type, mem_level, kernel_names, verbose):
import pandas as pd
from collections import OrderedDict
@@ -235,6 +235,7 @@ def roofline_only(path_to_dir, dev_id, sort_type, mem_level, verbose):
dev_id, # [Optional] Specify device id to collect roofline info from
sort_type, # [Optional] Sort AI by top kernels or dispatches
mem_level, # [Optional] Toggle particular level(s) of memory hierarchy
kernel_names, # [Optional] Toggle overlay of kernel names in plot
True, # [Optional] Generate a standalone roofline analysis
)
+1 -1
Просмотреть файл
@@ -104,7 +104,7 @@ def get_header(raw_pmc, input_filters, kernel_names):
dbc.DropdownMenuItem("Cache", header=True),
dbc.DropdownMenuItem(
"Local Data Share (LDS)",
href="#local_data_sharelds",
href="#local_data_share_lds",
external_link=True,
),
dbc.DropdownMenuItem(
+49 -5
Просмотреть файл
@@ -25,6 +25,7 @@
from omniperf_analyze.utils import roofline_calc
import time
import sys
import numpy as np
from dash import html, dash_table
@@ -32,6 +33,9 @@ from dash import dcc
import plotly.graph_objects as go
SYMBOLS = [0, 1, 2, 3, 4, 5, 13, 17, 18, 20]
def to_int(a):
if str(type(a)) == "<class 'NoneType'>":
return np.nan
@@ -39,7 +43,9 @@ def to_int(a):
return int(a)
def generate_plots(roof_info, ai_data, mem_level, is_standalone, verbose, fig=None):
def generate_plots(
roof_info, ai_data, mem_level, is_standalone, kernel_names, verbose, fig=None
):
if fig is None:
fig = go.Figure()
plotMode = "lines+text" if is_standalone else "lines"
@@ -120,6 +126,8 @@ def generate_plots(roof_info, ai_data, mem_level, is_standalone, verbose, fig=No
y=ai_data["ai_l1"][1],
name="ai_l1",
mode="markers",
marker={"color": "#00CC96"},
marker_symbol=SYMBOLS if kernel_names else None,
)
)
fig.add_trace(
@@ -128,6 +136,8 @@ def generate_plots(roof_info, ai_data, mem_level, is_standalone, verbose, fig=No
y=ai_data["ai_l2"][1],
name="ai_l2",
mode="markers",
marker={"color": "#EF553B"},
marker_symbol=SYMBOLS if kernel_names else None,
)
)
fig.add_trace(
@@ -136,6 +146,8 @@ def generate_plots(roof_info, ai_data, mem_level, is_standalone, verbose, fig=No
y=ai_data["ai_hbm"][1],
name="ai_hbm",
mode="markers",
marker={"color": "#636EFA"},
marker_symbol=SYMBOLS if kernel_names else None,
)
)
@@ -158,8 +170,13 @@ def get_roofline(
dev_id=None,
sort_type="kernels",
mem_level="ALL",
kernel_names=False,
is_standalone=False,
):
if kernel_names and (not is_standalone):
print("ERROR: --roof-only is required for --kernel-names")
sys.exit(1)
# Roofline settings
fp32_details = {
"path": path_to_dir,
@@ -185,11 +202,33 @@ def get_roofline(
print(i, "->", ai_data[i])
print("\n")
fp32_fig = generate_plots(fp32_details, ai_data, mem_level, is_standalone, verbose)
fp16_fig = generate_plots(fp16_details, ai_data, mem_level, is_standalone, verbose)
ml_combo_fig = generate_plots(
int8_details, ai_data, mem_level, is_standalone, verbose, fp16_fig
fp32_fig = generate_plots(
fp32_details, ai_data, mem_level, is_standalone, kernel_names, verbose
)
fp16_fig = generate_plots(
fp16_details, ai_data, mem_level, is_standalone, kernel_names, verbose
)
ml_combo_fig = generate_plots(
int8_details, ai_data, mem_level, is_standalone, kernel_names, verbose, fp16_fig
)
legend = go.Figure(
go.Scatter(
mode="markers",
x=[0] * 10,
y=ai_data["kernelNames"],
marker_symbol=SYMBOLS,
marker_size=15,
)
)
legend.update_layout(
title="Kernel Names and Markers",
margin=dict(b=0, r=0),
xaxis_range=[-1, 1],
xaxis_side="top",
height=400,
width=1000,
)
legend.update_xaxes(dtick=1)
if is_standalone:
dev_id = "ALL" if dev_id == -1 else str(dev_id)
@@ -198,12 +237,17 @@ def get_roofline(
ml_combo_fig.write_image(
path_to_dir + "/empirRoof_gpu-{}_fp8_fp16.pdf".format(dev_id)
)
if kernel_names:
# only save a legend if kernel_names option is toggled
legend.write_image(path_to_dir + "/kernelName_legend.pdf")
time.sleep(1)
# Re-save to remove loading MathJax pop up
fp32_fig.write_image(path_to_dir + "/empirRoof_gpu-{}_fp32.pdf".format(dev_id))
ml_combo_fig.write_image(
path_to_dir + "/empirRoof_gpu-{}_fp8_fp16.pdf".format(dev_id)
)
if kernel_names:
legend.write_image(path_to_dir + "/kernelName_legend.pdf")
print("Empirical Roofline PDFs saved!")
else:
return html.Section(
-1
Просмотреть файл
@@ -27,7 +27,6 @@ import sys
import astunparse
import re
import os
from matplotlib.pyplot import axis
import pandas as pd
import numpy as np
from tabulate import tabulate
+145 -132
Просмотреть файл
@@ -44,6 +44,8 @@ FONT_WEIGHT = "bold"
SUPPORTED_SOC = ["mi200"]
TOP_N = 10
################################################
# Helper funcs
@@ -208,17 +210,146 @@ def plot_application(sortType, ret_df, verbose):
kernelName = ""
myList = []
for index, row in df.iterrows():
at_end = False
next_kernelName = ""
for idx in df.index:
# CASE: Top kernels
# Calculate + append AI data if
# a) current KernelName is different than previous OR
# b) We've reached the end of list
if sortType == "kernels" and (
(row["KernelName"] != kernelName and kernelName != "")
or index == df.shape[0] - 1
):
if df.shape[0] - 1 == index:
calls += 1
if idx + 1 == df.shape[0]:
at_end = True
else:
next_kernelName = df["KernelName"][idx + 1]
kernelName = df["KernelName"][idx]
try:
total_flops += (
(
64
* (
df["SQ_INSTS_VALU_ADD_F16"][idx]
+ df["SQ_INSTS_VALU_MUL_F16"][idx]
+ (2 * df["SQ_INSTS_VALU_FMA_F16"][idx])
+ df["SQ_INSTS_VALU_TRANS_F16"][idx]
)
)
+ (
64
* (
df["SQ_INSTS_VALU_ADD_F32"][idx]
+ df["SQ_INSTS_VALU_MUL_F32"][idx]
+ (2 * df["SQ_INSTS_VALU_FMA_F32"][idx])
+ df["SQ_INSTS_VALU_TRANS_F32"][idx]
)
)
+ (
64
* (
df["SQ_INSTS_VALU_ADD_F64"][idx]
+ df["SQ_INSTS_VALU_MUL_F64"][idx]
+ (2 * df["SQ_INSTS_VALU_FMA_F64"][idx])
+ df["SQ_INSTS_VALU_TRANS_F64"][idx]
)
)
+ (df["SQ_INSTS_VALU_MFMA_MOPS_F16"][idx] * 512)
+ (df["SQ_INSTS_VALU_MFMA_MOPS_BF16"][idx] * 512)
+ (df["SQ_INSTS_VALU_MFMA_MOPS_F32"][idx] * 512)
+ (df["SQ_INSTS_VALU_MFMA_MOPS_F64"][idx] * 512)
)
except KeyError:
if verbose >= 3:
print("{}: Skipped total_flops at index {}".format(kernelName[:35], idx))
pass
try:
valu_flops += (
64
* (
df["SQ_INSTS_VALU_ADD_F16"][idx]
+ df["SQ_INSTS_VALU_MUL_F16"][idx]
+ (2 * df["SQ_INSTS_VALU_FMA_F16"][idx])
+ df["SQ_INSTS_VALU_TRANS_F16"][idx]
)
+ 64
* (
df["SQ_INSTS_VALU_ADD_F32"][idx]
+ df["SQ_INSTS_VALU_MUL_F32"][idx]
+ (2 * df["SQ_INSTS_VALU_FMA_F32"][idx])
+ df["SQ_INSTS_VALU_TRANS_F32"][idx]
)
+ 64
* (
df["SQ_INSTS_VALU_ADD_F64"][idx]
+ df["SQ_INSTS_VALU_MUL_F64"][idx]
+ (2 * df["SQ_INSTS_VALU_FMA_F64"][idx])
+ df["SQ_INSTS_VALU_TRANS_F64"][idx]
)
)
except KeyError:
if verbose >= 3:
print("{}: Skipped valu_flops at index {}".format(kernelName[:35], idx))
pass
try:
mfma_flops_f16 += df["SQ_INSTS_VALU_MFMA_MOPS_F16"][idx] * 512
mfma_flops_bf16 += df["SQ_INSTS_VALU_MFMA_MOPS_BF16"][idx] * 512
mfma_flops_f32 += df["SQ_INSTS_VALU_MFMA_MOPS_F32"][idx] * 512
mfma_flops_f64 += df["SQ_INSTS_VALU_MFMA_MOPS_F64"][idx] * 512
mfma_iops_i8 += df["SQ_INSTS_VALU_MFMA_MOPS_I8"][idx] * 512
except KeyError:
if verbose >= 3:
print("{}: Skipped mfma ops at index {}".format(kernelName[:35], idx))
pass
try:
lds_data += (
(df["SQ_LDS_IDX_ACTIVE"][idx] - df["SQ_LDS_BANK_CONFLICT"][idx])
* 4
* L2_BANKS
) # L2_BANKS = 32 (since assuming mi200)
except KeyError:
if verbose >= 3:
print("{}: Skipped lds_data at index {}".format(kernelName[:35], idx))
pass
try:
L1cache_data += df["TCP_TOTAL_CACHE_ACCESSES_sum"][idx] * 64
except KeyError:
if verbose >= 3:
print("{}: Skipped L1cache_data at index {}".format(kernelName[:35], idx))
pass
try:
L2cache_data += (
df["TCP_TCC_WRITE_REQ_sum"][idx] * 64
+ df["TCP_TCC_ATOMIC_WITH_RET_REQ_sum"][idx] * 64
+ df["TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum"][idx] * 64
+ df["TCP_TCC_READ_REQ_sum"][idx] * 64
)
except KeyError:
if verbose >= 3:
print("{}: Skipped L2cache_data at index {}".format(kernelName[:35], idx))
pass
try:
hbm_data += (
(df["TCC_EA_RDREQ_32B_sum"][idx] * 32)
+ ((df["TCC_EA_RDREQ_sum"][idx] - df["TCC_EA_RDREQ_32B_sum"][idx]) * 64)
+ (df["TCC_EA_WRREQ_64B_sum"][idx] * 64)
+ ((df["TCC_EA_WRREQ_sum"][idx] - df["TCC_EA_WRREQ_64B_sum"][idx]) * 32)
)
except KeyError:
if verbose >= 3:
print("{}: Skipped hbm_data at index {}".format(kernelName[:35], idx))
pass
totalDuration += df["EndNs"][idx] - df["BeginNs"][idx]
avgDuration += df["EndNs"][idx] - df["BeginNs"][idx]
calls += 1
if sortType == "kernels" and (at_end == True or (kernelName != next_kernelName)):
myList.append(
AI_Data(
kernelName,
@@ -241,7 +372,7 @@ def plot_application(sortType, ret_df, verbose):
if verbose >= 2:
print(
"Just added {} to AI_Data at index {}. # of calls: {}".format(
kernelName, index, calls
kernelName, idx, calls
)
)
total_flops = (
@@ -262,129 +393,6 @@ def plot_application(sortType, ret_df, verbose):
L1cache_data
) = L2cache_data = hbm_data = calls = totalDuration = avgDuration = 0.0
kernelName = row["KernelName"]
try:
total_flops += (
(
64
* (
row["SQ_INSTS_VALU_ADD_F16"]
+ row["SQ_INSTS_VALU_MUL_F16"]
+ (2 * row["SQ_INSTS_VALU_FMA_F16"])
+ row["SQ_INSTS_VALU_TRANS_F16"]
)
)
+ (
64
* (
row["SQ_INSTS_VALU_ADD_F32"]
+ row["SQ_INSTS_VALU_MUL_F32"]
+ (2 * row["SQ_INSTS_VALU_FMA_F32"])
+ row["SQ_INSTS_VALU_TRANS_F32"]
)
)
+ (
64
* (
row["SQ_INSTS_VALU_ADD_F64"]
+ row["SQ_INSTS_VALU_MUL_F64"]
+ (2 * row["SQ_INSTS_VALU_FMA_F64"])
+ row["SQ_INSTS_VALU_TRANS_F64"]
)
)
+ (row["SQ_INSTS_VALU_MFMA_MOPS_F16"] * 512)
+ (row["SQ_INSTS_VALU_MFMA_MOPS_BF16"] * 512)
+ (row["SQ_INSTS_VALU_MFMA_MOPS_F32"] * 512)
+ (row["SQ_INSTS_VALU_MFMA_MOPS_F64"] * 512)
)
except KeyError:
if verbose >= 2:
print("Skipped total_flops at index {}".format(index))
pass
try:
valu_flops += (
64
* (
row["SQ_INSTS_VALU_ADD_F16"]
+ row["SQ_INSTS_VALU_MUL_F16"]
+ (2 * row["SQ_INSTS_VALU_FMA_F16"])
+ row["SQ_INSTS_VALU_TRANS_F16"]
)
+ 64
* (
row["SQ_INSTS_VALU_ADD_F32"]
+ row["SQ_INSTS_VALU_MUL_F32"]
+ (2 * row["SQ_INSTS_VALU_FMA_F32"])
+ row["SQ_INSTS_VALU_TRANS_F32"]
)
+ 64
* (
row["SQ_INSTS_VALU_ADD_F64"]
+ row["SQ_INSTS_VALU_MUL_F64"]
+ (2 * row["SQ_INSTS_VALU_FMA_F64"])
+ row["SQ_INSTS_VALU_TRANS_F64"]
)
)
except KeyError:
if verbose >= 2:
print("Skipped valu_flops at index {}".format(index))
pass
try:
mfma_flops_f16 += row["SQ_INSTS_VALU_MFMA_MOPS_F16"] * 512
mfma_flops_bf16 += row["SQ_INSTS_VALU_MFMA_MOPS_BF16"] * 512
mfma_flops_f32 += row["SQ_INSTS_VALU_MFMA_MOPS_F32"] * 512
mfma_flops_f64 += row["SQ_INSTS_VALU_MFMA_MOPS_F64"] * 512
mfma_iops_i8 += row["SQ_INSTS_VALU_MFMA_MOPS_I8"] * 512
except KeyError:
if verbose >= 2:
print("Skipped mfma ops at index {}".format(index))
pass
try:
lds_data += (
(row["SQ_LDS_IDX_ACTIVE"] - row["SQ_LDS_BANK_CONFLICT"]) * 4 * L2_BANKS
) # L2_BANKS = 32 (since assuming mi200)
except KeyError:
if verbose >= 2:
print("Skipped lds_data at index {}".format(index))
pass
try:
L1cache_data += row["TCP_TOTAL_CACHE_ACCESSES_sum"] * 64
except KeyError:
if verbose >= 2:
print("Skipped L1cache_data at index {}".format(index))
pass
try:
L2cache_data += (
row["TCP_TCC_WRITE_REQ_sum"] * 64
+ row["TCP_TCC_ATOMIC_WITH_RET_REQ_sum"] * 64
+ row["TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum"] * 64
+ row["TCP_TCC_READ_REQ_sum"] * 64
)
except KeyError:
if verbose >= 2:
print("Skipped L2cache_data at index {}".format(index))
pass
try:
hbm_data += (
(row["TCC_EA_RDREQ_32B_sum"] * 32)
+ ((row["TCC_EA_RDREQ_sum"] - row["TCC_EA_RDREQ_32B_sum"]) * 64)
+ (row["TCC_EA_WRREQ_64B_sum"] * 64)
+ ((row["TCC_EA_WRREQ_sum"] - row["TCC_EA_WRREQ_64B_sum"]) * 32)
)
except KeyError:
if verbose >= 2:
print("Skipped hbm_data at index {}".format(index))
pass
totalDuration += row["EndNs"] - row["BeginNs"]
avgDuration += row["EndNs"] - row["BeginNs"]
calls += 1
if sortType == "dispatches":
myList.append(
AI_Data(
@@ -428,9 +436,11 @@ def plot_application(sortType, ret_df, verbose):
# print("Top 5 intensities ('{}')...".format(roof_details["sort"]))
intensities = {"ai_l1": [], "ai_l2": [], "ai_hbm": []}
curr_perf = []
kernelNames = []
i = 0
# Create list of the top TOP_N intensities
while i <= 9 and i != len(myList):
while i < TOP_N and i != len(myList):
kernelNames.append(myList[i].KernelName)
intensities["ai_l1"].append(
myList[i].total_flops / myList[i].L1cache_data
) if myList[i].L1cache_data else intensities["ai_l1"].append(0)
@@ -470,6 +480,9 @@ def plot_application(sortType, ret_df, verbose):
intensityPoints[i].append(x)
intensityPoints[i].append(y)
# Add an entry for kernel names
intensityPoints["kernelNames"] = kernelNames
return intensityPoints
+7
Просмотреть файл
@@ -234,6 +234,13 @@ def parse(my_parser):
type=int,
help="\t\t\tGPU device ID. (DEFAULT: ALL)",
)
roofline_group.add_argument(
"--kernel-names",
required=False,
default=False,
action="store_true",
help="\t\t\tInclude kernel names in roofline plot.",
)
# roofline_group.add_argument('-w', '--workgroups', required=False, default=-1, type=int, help="\t\t\tNumber of kernel workgroups (DEFAULT: 1024)")
# roofline_group.add_argument('--wsize', required=False, default=-1, type=int, help="\t\t\tWorkgroup size (DEFAULT: 256)")
# roofline_group.add_argument('--dataset', required=False, default = -1, type=int, help="\t\t\tDataset size (DEFAULT: 536M)")