Add FP4 and FP6 to roofline for MI350 series (#709)
Add roofline bins with the FP4 and FP6 datatypes enabled for the gfx950 arch
---------
Signed-off-by: Carrie Fallows <Carrie.Fallows@amd.com>
[ROCm/rocprofiler-compute commit: cb2d928ecf]
This commit is contained in:
zatwierdzone przez
GitHub
rodzic
ba61cc36f5
commit
689746e2cd
@@ -20,7 +20,7 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.
|
||||
* Default is FP32, but users can specify as many types as desired to overlay on the same plot output
|
||||
|
||||
* Additional datatypes for roofline profiling
|
||||
* Now supports FP8, FP16, BF16, FP32, FP64, I8, I32, I64 (dependent on gpu architecture)
|
||||
* Now supports FP4, FP6, FP8, FP16, BF16, FP32, FP64, I8, I32, I64 (dependent on GPU architecture)
|
||||
|
||||
* Support host-trap PC Sampling on CLI (beta version)
|
||||
|
||||
|
||||
@@ -390,12 +390,12 @@ Examples:
|
||||
"-R",
|
||||
"--roofline-data-type",
|
||||
required=False,
|
||||
choices=["FP8", "FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"],
|
||||
choices=["FP4", "FP6", "FP8", "FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"],
|
||||
metavar="",
|
||||
nargs="+",
|
||||
type=str,
|
||||
default=["FP32"],
|
||||
help="\t\t\tChoose datatypes to view roofline PDFs for: (DEFAULT: FP32)\n\t\t\t FP8\n\t\t\t FP16\n\t\t\t BF16\n\t\t\t FP32\n\t\t\t FP64\n\t\t\t I8\n\t\t\t I32\n\t\t\t I64\n\t\t\t ",
|
||||
help="Choose datatypes to view roofline PDFs for: (DEFAULT: FP32)\n\t\t\t FP4\n\t\t\t FP6\n\t\t\t FP8\n\t\t\t FP16\n\t\t\t BF16\n\t\t\t FP32\n\t\t\t FP64\n\t\t\t I8\n\t\t\t I32\n\t\t\t I64\n\t\t\t ",
|
||||
)
|
||||
|
||||
# roofline_group.add_argument('-w', '--workgroups', required=False, default=-1, type=int, help="\t\t\tNumber of kernel workgroups (DEFAULT: 1024)")
|
||||
@@ -605,12 +605,12 @@ Examples:
|
||||
"-R",
|
||||
"--roofline-data-type",
|
||||
required=False,
|
||||
choices=["FP8", "FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"],
|
||||
choices=["FP4", "FP6", "FP8", "FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"],
|
||||
metavar="",
|
||||
nargs="+",
|
||||
type=str,
|
||||
default=["FP32"],
|
||||
help="\t\t\tChoose datatypes to view roofline PDFs for: (DEFAULT: FP32)\n\t\t\t FP8\n\t\t\t FP16\n\t\t\t BF16\n\t\t\t FP32\n\t\t\t FP64\n\t\t\t I8\n\t\t\t I32\n\t\t\t I64\n\t\t\t ",
|
||||
help="\t\t\tChoose datatypes to view roofline PDFs for: (DEFAULT: FP32)\n\t\t\t FP4\n\t\t\t FP6\n\t\t\t FP8\n\t\t\t FP16\n\t\t\t BF16\n\t\t\t FP32\n\t\t\t FP64\n\t\t\t I8\n\t\t\t I32\n\t\t\t I64\n\t\t\t ",
|
||||
)
|
||||
|
||||
analyze_advanced_group.add_argument(
|
||||
|
||||
@@ -337,48 +337,69 @@ class Roofline:
|
||||
# Plot Application AI
|
||||
#######################
|
||||
# Plot the arithmetic intensity points for each cache level
|
||||
if ops_flops == "FLOP":
|
||||
fig.add_trace(
|
||||
go.Scatter(
|
||||
x=self.__ai_data["ai_l1"][0],
|
||||
y=self.__ai_data["ai_l1"][1],
|
||||
name=dtype + "_ai_l1",
|
||||
mode="markers",
|
||||
marker_symbol=(
|
||||
SYMBOLS if self.__run_parameters["include_kernel_names"] else None
|
||||
),
|
||||
)
|
||||
)
|
||||
fig.add_trace(
|
||||
go.Scatter(
|
||||
x=self.__ai_data["ai_l2"][0],
|
||||
y=self.__ai_data["ai_l2"][1],
|
||||
name=dtype + "_ai_l2",
|
||||
mode="markers",
|
||||
marker_symbol=(
|
||||
SYMBOLS if self.__run_parameters["include_kernel_names"] else None
|
||||
),
|
||||
)
|
||||
)
|
||||
fig.add_trace(
|
||||
go.Scatter(
|
||||
x=self.__ai_data["ai_hbm"][0],
|
||||
y=self.__ai_data["ai_hbm"][1],
|
||||
name=dtype + "_ai_hbm",
|
||||
mode="markers",
|
||||
marker_symbol=(
|
||||
SYMBOLS if self.__run_parameters["include_kernel_names"] else None
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
# Set layout
|
||||
fig.update_layout(
|
||||
xaxis_title="Arithmetic Intensity (FLOPs/Byte)",
|
||||
yaxis_title="Performance (GFLOP/sec)",
|
||||
hovermode="x unified",
|
||||
margin=dict(l=50, r=50, b=50, t=50, pad=4),
|
||||
# FP4 and FP6 MFMA are both captured by the single F6F4 perfmon event; when both are requested, plot its AI points only once
|
||||
skipAI = False
|
||||
if dtype == "FP4" or dtype == "FP6":
|
||||
if (dtype == "FP6") and (
|
||||
"FP4" in self.__run_parameters["roofline_data_type"]
|
||||
):
|
||||
skipAI = True
|
||||
console_debug(
|
||||
"roofline",
|
||||
"Datatype {} is captured through the F6F4 perfmon event".format(dtype),
|
||||
)
|
||||
dtype = "F6F4"
|
||||
|
||||
if ops_flops == "FLOP":
|
||||
if not skipAI:
|
||||
fig.add_trace(
|
||||
go.Scatter(
|
||||
x=self.__ai_data["ai_l1"][0],
|
||||
y=self.__ai_data["ai_l1"][1],
|
||||
name=dtype + "_ai_l1",
|
||||
mode="markers",
|
||||
marker_symbol=(
|
||||
SYMBOLS
|
||||
if self.__run_parameters["include_kernel_names"]
|
||||
else None
|
||||
),
|
||||
)
|
||||
)
|
||||
fig.add_trace(
|
||||
go.Scatter(
|
||||
x=self.__ai_data["ai_l2"][0],
|
||||
y=self.__ai_data["ai_l2"][1],
|
||||
name=dtype + "_ai_l2",
|
||||
mode="markers",
|
||||
marker_symbol=(
|
||||
SYMBOLS
|
||||
if self.__run_parameters["include_kernel_names"]
|
||||
else None
|
||||
),
|
||||
)
|
||||
)
|
||||
fig.add_trace(
|
||||
go.Scatter(
|
||||
x=self.__ai_data["ai_hbm"][0],
|
||||
y=self.__ai_data["ai_hbm"][1],
|
||||
name=dtype + "_ai_hbm",
|
||||
mode="markers",
|
||||
marker_symbol=(
|
||||
SYMBOLS
|
||||
if self.__run_parameters["include_kernel_names"]
|
||||
else None
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
# Set layout
|
||||
fig.update_layout(
|
||||
xaxis_title="Arithmetic Intensity (FLOPs/Byte)",
|
||||
yaxis_title="Performance (GFLOP/sec)",
|
||||
hovermode="x unified",
|
||||
margin=dict(l=50, r=50, b=50, t=50, pad=4),
|
||||
)
|
||||
else:
|
||||
# Set layout
|
||||
fig.update_layout(
|
||||
|
||||
@@ -44,15 +44,61 @@ FONT_WEIGHT = "bold"
|
||||
# SUPPORTED_DATATYPES table is based on datatype support in rocm-amdgpu-bench repository
|
||||
# Indicates which datatypes per gpu arch can be generated by the roofline binary
|
||||
SUPPORTED_DATATYPES = {
|
||||
"gfx90a": ["FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"], # Unsupported: F8
|
||||
"gfx940": ["FP8", "FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"], # Unsupported:
|
||||
"gfx941": ["FP8", "FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"], # Unsupported:
|
||||
"gfx942": ["FP8", "FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"], # Unsupported:
|
||||
"gfx950": ["FP8", "FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"], # Unsupported:
|
||||
"gfx90a": [
|
||||
"FP16",
|
||||
"BF16",
|
||||
"FP32",
|
||||
"FP64",
|
||||
"I8",
|
||||
"I32",
|
||||
"I64",
|
||||
], # Unsupported: F4, F6, F8
|
||||
"gfx940": [
|
||||
"FP8",
|
||||
"FP16",
|
||||
"BF16",
|
||||
"FP32",
|
||||
"FP64",
|
||||
"I8",
|
||||
"I32",
|
||||
"I64",
|
||||
], # Unsupported: F4, F6
|
||||
"gfx941": [
|
||||
"FP8",
|
||||
"FP16",
|
||||
"BF16",
|
||||
"FP32",
|
||||
"FP64",
|
||||
"I8",
|
||||
"I32",
|
||||
"I64",
|
||||
], # Unsupported: F4, F6
|
||||
"gfx942": [
|
||||
"FP8",
|
||||
"FP16",
|
||||
"BF16",
|
||||
"FP32",
|
||||
"FP64",
|
||||
"I8",
|
||||
"I32",
|
||||
"I64",
|
||||
], # Unsupported: F4, F6
|
||||
"gfx950": [
|
||||
"FP4",
|
||||
"FP6",
|
||||
"FP8",
|
||||
"FP16",
|
||||
"BF16",
|
||||
"FP32",
|
||||
"FP64",
|
||||
"I8",
|
||||
"I32",
|
||||
"I64",
|
||||
], # Unsupported:
|
||||
}
|
||||
|
||||
PEAK_OPS_DATATYPES = ["FP8", "FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"]
|
||||
MFMA_DATATYPES = ["FP8", "FP16", "BF16", "FP32", "FP64", "I8"]
|
||||
MFMA_DATATYPES = ["FP4", "FP6", "FP8", "FP16", "BF16", "FP32", "FP64", "I8"]
|
||||
|
||||
TOP_N = 10
|
||||
|
||||
@@ -67,6 +113,7 @@ class AI_Data:
|
||||
|
||||
total_flops: float
|
||||
valu_flops: float
|
||||
mfma_flops_f6f4: float
|
||||
mfma_flops_f8: float
|
||||
mfma_flops_f16: float
|
||||
mfma_flops_bf16: float
|
||||
@@ -212,11 +259,11 @@ def calc_ai(mspec, sort_type, ret_df):
|
||||
df = df.sort_values(by=["Kernel_Name"])
|
||||
df = df.reset_index(drop=True)
|
||||
|
||||
total_flops = valu_flops = mfma_flops_f8 = mfma_flops_bf16 = mfma_flops_f16 = (
|
||||
mfma_iops_i8
|
||||
) = mfma_flops_f32 = mfma_flops_f64 = lds_data = L1cache_data = L2cache_data = (
|
||||
hbm_data
|
||||
) = calls = totalDuration = avgDuration = 0.0
|
||||
total_flops = valu_flops = mfma_flops_f6f4 = mfma_flops_f8 = mfma_flops_bf16 = (
|
||||
mfma_flops_f16
|
||||
) = mfma_iops_i8 = mfma_flops_f32 = mfma_flops_f64 = lds_data = L1cache_data = (
|
||||
L2cache_data
|
||||
) = hbm_data = calls = totalDuration = avgDuration = 0.0
|
||||
|
||||
kernelName = ""
|
||||
|
||||
@@ -273,6 +320,8 @@ def calc_ai(mspec, sort_type, ret_df):
|
||||
)
|
||||
if "FP8" in supported_dt:
|
||||
total_flops += df["SQ_INSTS_VALU_MFMA_MOPS_F8"][idx] * 512
|
||||
if ("FP4" in supported_dt) or ("FP6" in supported_dt):
|
||||
total_flops += df["SQ_INSTS_VALU_MFMA_MOPS_F6F4"][idx] * 512
|
||||
except KeyError:
|
||||
console_debug(
|
||||
"roofline",
|
||||
@@ -313,6 +362,8 @@ def calc_ai(mspec, sort_type, ret_df):
|
||||
try:
|
||||
if "FP8" in supported_dt:
|
||||
mfma_flops_f8 += df["SQ_INSTS_VALU_MFMA_MOPS_F8"][idx] * 512
|
||||
if ("FP4" in supported_dt) or ("FP6" in supported_dt):
|
||||
mfma_flops_f6f4 += df["SQ_INSTS_VALU_MFMA_MOPS_F6F4"][idx] * 512
|
||||
mfma_flops_f16 += df["SQ_INSTS_VALU_MFMA_MOPS_F16"][idx] * 512
|
||||
mfma_flops_bf16 += df["SQ_INSTS_VALU_MFMA_MOPS_BF16"][idx] * 512
|
||||
mfma_flops_f32 += df["SQ_INSTS_VALU_MFMA_MOPS_F32"][idx] * 512
|
||||
@@ -413,6 +464,7 @@ def calc_ai(mspec, sort_type, ret_df):
|
||||
calls,
|
||||
total_flops / calls,
|
||||
valu_flops / calls,
|
||||
mfma_flops_f6f4 / calls,
|
||||
mfma_flops_f8 / calls,
|
||||
mfma_flops_f16 / calls,
|
||||
mfma_flops_bf16 / calls,
|
||||
@@ -432,11 +484,13 @@ def calc_ai(mspec, sort_type, ret_df):
|
||||
kernelName, idx, calls
|
||||
)
|
||||
)
|
||||
total_flops = valu_flops = mfma_flops_f8 = mfma_flops_bf16 = (
|
||||
mfma_flops_f16
|
||||
) = mfma_iops_i8 = mfma_flops_f32 = mfma_flops_f64 = lds_data = (
|
||||
L1cache_data
|
||||
) = L2cache_data = hbm_data = calls = totalDuration = avgDuration = 0.0
|
||||
total_flops = valu_flops = mfma_flops_f6f4 = mfma_flops_f8 = (
|
||||
mfma_flops_bf16
|
||||
) = mfma_flops_f16 = mfma_iops_i8 = mfma_flops_f32 = mfma_flops_f64 = (
|
||||
lds_data
|
||||
) = L1cache_data = L2cache_data = hbm_data = calls = totalDuration = (
|
||||
avgDuration
|
||||
) = 0.0
|
||||
|
||||
if sort_type == "dispatches":
|
||||
myList.append(
|
||||
@@ -445,6 +499,7 @@ def calc_ai(mspec, sort_type, ret_df):
|
||||
calls,
|
||||
total_flops,
|
||||
valu_flops,
|
||||
mfma_flops_f6f4,
|
||||
mfma_flops_f8,
|
||||
mfma_flops_f16,
|
||||
mfma_flops_bf16,
|
||||
@@ -459,11 +514,13 @@ def calc_ai(mspec, sort_type, ret_df):
|
||||
avgDuration,
|
||||
)
|
||||
)
|
||||
total_flops = valu_flops = mfma_flops_f8 = mfma_flops_bf16 = (
|
||||
mfma_flops_f16
|
||||
) = mfma_iops_i8 = mfma_flops_f32 = mfma_flops_f64 = lds_data = (
|
||||
L1cache_data
|
||||
) = L2cache_data = hbm_data = calls = totalDuration = avgDuration = 0.0
|
||||
total_flops = valu_flops = mfma_flops_f6f4 = mfma_flops_f8 = (
|
||||
mfma_flops_bf16
|
||||
) = mfma_flops_f16 = mfma_iops_i8 = mfma_flops_f32 = mfma_flops_f64 = (
|
||||
lds_data
|
||||
) = L1cache_data = L2cache_data = hbm_data = calls = totalDuration = (
|
||||
avgDuration
|
||||
) = 0.0
|
||||
|
||||
myList.sort(key=lambda x: x.totalDuration, reverse=True)
|
||||
|
||||
|
||||
Plik binarny nie jest wyświetlany.
Plik binarny nie jest wyświetlany.
Plik binarny nie jest wyświetlany.
Reference in New Issue
Block a user