Add F4 F6 to roofline for MI350 series (#709)

Add roofline bins with the FP4 and FP6 datatypes enabled for the gfx950 architecture.

---------

Signed-off-by: Carrie Fallows <Carrie.Fallows@amd.com>

[ROCm/rocprofiler-compute commit: cb2d928ecf]
This commit is contained in:
cfallows-amd
2025-05-26 18:36:31 -04:00
committed by GitHub
parent ba61cc36f5
commit 689746e2cd
7 changed files with 144 additions and 66 deletions
@@ -20,7 +20,7 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.
* Default is FP32, but the user can specify as many types as desired to overlay on the same plot output
* Additional datatypes for roofline profiling
* Now supports FP8, FP16, BF16, FP32, FP64, I8, I32, I64 (dependent on gpu architecture)
* Now supports FP4, FP6, FP8, FP16, BF16, FP32, FP64, I8, I32, I64 (dependent on GPU architecture)
* Support host-trap PC Sampling on CLI (beta version)
@@ -390,12 +390,12 @@ Examples:
"-R",
"--roofline-data-type",
required=False,
choices=["FP8", "FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"],
choices=["FP4", "FP6", "FP8", "FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"],
metavar="",
nargs="+",
type=str,
default=["FP32"],
help="\t\t\tChoose datatypes to view roofline PDFs for: (DEFAULT: FP32)\n\t\t\t FP8\n\t\t\t FP16\n\t\t\t BF16\n\t\t\t FP32\n\t\t\t FP64\n\t\t\t I8\n\t\t\t I32\n\t\t\t I64\n\t\t\t ",
help="Choose datatypes to view roofline PDFs for: (DEFAULT: FP32)\n\t\t\t FP4\n\t\t\t FP6\n\t\t\t FP8\n\t\t\t FP16\n\t\t\t BF16\n\t\t\t FP32\n\t\t\t FP64\n\t\t\t I8\n\t\t\t I32\n\t\t\t I64\n\t\t\t ",
)
# roofline_group.add_argument('-w', '--workgroups', required=False, default=-1, type=int, help="\t\t\tNumber of kernel workgroups (DEFAULT: 1024)")
@@ -605,12 +605,12 @@ Examples:
"-R",
"--roofline-data-type",
required=False,
choices=["FP8", "FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"],
choices=["FP4", "FP6", "FP8", "FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"],
metavar="",
nargs="+",
type=str,
default=["FP32"],
help="\t\t\tChoose datatypes to view roofline PDFs for: (DEFAULT: FP32)\n\t\t\t FP8\n\t\t\t FP16\n\t\t\t BF16\n\t\t\t FP32\n\t\t\t FP64\n\t\t\t I8\n\t\t\t I32\n\t\t\t I64\n\t\t\t ",
help="\t\t\tChoose datatypes to view roofline PDFs for: (DEFAULT: FP32)\n\t\t\t FP4\n\t\t\t FP6\n\t\t\t FP8\n\t\t\t FP16\n\t\t\t BF16\n\t\t\t FP32\n\t\t\t FP64\n\t\t\t I8\n\t\t\t I32\n\t\t\t I64\n\t\t\t ",
)
analyze_advanced_group.add_argument(
@@ -337,48 +337,69 @@ class Roofline:
# Plot Application AI
#######################
# Plot the arithmetic intensity points for each cache level
if ops_flops == "FLOP":
fig.add_trace(
go.Scatter(
x=self.__ai_data["ai_l1"][0],
y=self.__ai_data["ai_l1"][1],
name=dtype + "_ai_l1",
mode="markers",
marker_symbol=(
SYMBOLS if self.__run_parameters["include_kernel_names"] else None
),
)
)
fig.add_trace(
go.Scatter(
x=self.__ai_data["ai_l2"][0],
y=self.__ai_data["ai_l2"][1],
name=dtype + "_ai_l2",
mode="markers",
marker_symbol=(
SYMBOLS if self.__run_parameters["include_kernel_names"] else None
),
)
)
fig.add_trace(
go.Scatter(
x=self.__ai_data["ai_hbm"][0],
y=self.__ai_data["ai_hbm"][1],
name=dtype + "_ai_hbm",
mode="markers",
marker_symbol=(
SYMBOLS if self.__run_parameters["include_kernel_names"] else None
),
)
)
# Set layout
fig.update_layout(
xaxis_title="Arithmetic Intensity (FLOPs/Byte)",
yaxis_title="Performance (GFLOP/sec)",
hovermode="x unified",
margin=dict(l=50, r=50, b=50, t=50, pad=4),
# Check for F6F4 PC which applies to both FP4 and FP6 MFMA; avoid duplicate plotting
skipAI = False
if dtype == "FP4" or dtype == "FP6":
if (dtype == "FP6") and (
"FP4" in self.__run_parameters["roofline_data_type"]
):
skipAI = True
console_debug(
"roofline",
"Datatype {} is captured through the F6F4 perfmon event".format(dtype),
)
dtype = "F6F4"
if ops_flops == "FLOP":
if not skipAI:
fig.add_trace(
go.Scatter(
x=self.__ai_data["ai_l1"][0],
y=self.__ai_data["ai_l1"][1],
name=dtype + "_ai_l1",
mode="markers",
marker_symbol=(
SYMBOLS
if self.__run_parameters["include_kernel_names"]
else None
),
)
)
fig.add_trace(
go.Scatter(
x=self.__ai_data["ai_l2"][0],
y=self.__ai_data["ai_l2"][1],
name=dtype + "_ai_l2",
mode="markers",
marker_symbol=(
SYMBOLS
if self.__run_parameters["include_kernel_names"]
else None
),
)
)
fig.add_trace(
go.Scatter(
x=self.__ai_data["ai_hbm"][0],
y=self.__ai_data["ai_hbm"][1],
name=dtype + "_ai_hbm",
mode="markers",
marker_symbol=(
SYMBOLS
if self.__run_parameters["include_kernel_names"]
else None
),
)
)
# Set layout
fig.update_layout(
xaxis_title="Arithmetic Intensity (FLOPs/Byte)",
yaxis_title="Performance (GFLOP/sec)",
hovermode="x unified",
margin=dict(l=50, r=50, b=50, t=50, pad=4),
)
else:
# Set layout
fig.update_layout(
@@ -44,15 +44,61 @@ FONT_WEIGHT = "bold"
# SUPPORTED_DATATYPES table is based on datatype support in rocm-amdgpu-bench repository
# Indicates which datatypes per gpu arch can be generated by the roofline binary
SUPPORTED_DATATYPES = {
"gfx90a": ["FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"], # Unsupported: F8
"gfx940": ["FP8", "FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"], # Unsupported:
"gfx941": ["FP8", "FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"], # Unsupported:
"gfx942": ["FP8", "FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"], # Unsupported:
"gfx950": ["FP8", "FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"], # Unsupported:
"gfx90a": [
"FP16",
"BF16",
"FP32",
"FP64",
"I8",
"I32",
"I64",
], # Unsupported: F4, F6, F8
"gfx940": [
"FP8",
"FP16",
"BF16",
"FP32",
"FP64",
"I8",
"I32",
"I64",
], # Unsupported: F4, F6
"gfx941": [
"FP8",
"FP16",
"BF16",
"FP32",
"FP64",
"I8",
"I32",
"I64",
], # Unsupported: F4, F6
"gfx942": [
"FP8",
"FP16",
"BF16",
"FP32",
"FP64",
"I8",
"I32",
"I64",
], # Unsupported: F4, F6
"gfx950": [
"FP4",
"FP6",
"FP8",
"FP16",
"BF16",
"FP32",
"FP64",
"I8",
"I32",
"I64",
], # Unsupported:
}
PEAK_OPS_DATATYPES = ["FP8", "FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"]
MFMA_DATATYPES = ["FP8", "FP16", "BF16", "FP32", "FP64", "I8"]
MFMA_DATATYPES = ["FP4", "FP6", "FP8", "FP16", "BF16", "FP32", "FP64", "I8"]
TOP_N = 10
@@ -67,6 +113,7 @@ class AI_Data:
total_flops: float
valu_flops: float
mfma_flops_f6f4: float
mfma_flops_f8: float
mfma_flops_f16: float
mfma_flops_bf16: float
@@ -212,11 +259,11 @@ def calc_ai(mspec, sort_type, ret_df):
df = df.sort_values(by=["Kernel_Name"])
df = df.reset_index(drop=True)
total_flops = valu_flops = mfma_flops_f8 = mfma_flops_bf16 = mfma_flops_f16 = (
mfma_iops_i8
) = mfma_flops_f32 = mfma_flops_f64 = lds_data = L1cache_data = L2cache_data = (
hbm_data
) = calls = totalDuration = avgDuration = 0.0
total_flops = valu_flops = mfma_flops_f6f4 = mfma_flops_f8 = mfma_flops_bf16 = (
mfma_flops_f16
) = mfma_iops_i8 = mfma_flops_f32 = mfma_flops_f64 = lds_data = L1cache_data = (
L2cache_data
) = hbm_data = calls = totalDuration = avgDuration = 0.0
kernelName = ""
@@ -273,6 +320,8 @@ def calc_ai(mspec, sort_type, ret_df):
)
if "FP8" in supported_dt:
total_flops += df["SQ_INSTS_VALU_MFMA_MOPS_F8"][idx] * 512
if ("FP4" in supported_dt) or ("FP6" in supported_dt):
total_flops += df["SQ_INSTS_VALU_MFMA_MOPS_F6F4"][idx] * 512
except KeyError:
console_debug(
"roofline",
@@ -313,6 +362,8 @@ def calc_ai(mspec, sort_type, ret_df):
try:
if "FP8" in supported_dt:
mfma_flops_f8 += df["SQ_INSTS_VALU_MFMA_MOPS_F8"][idx] * 512
if ("FP4" in supported_dt) or ("FP6" in supported_dt):
mfma_flops_f6f4 += df["SQ_INSTS_VALU_MFMA_MOPS_F6F4"][idx] * 512
mfma_flops_f16 += df["SQ_INSTS_VALU_MFMA_MOPS_F16"][idx] * 512
mfma_flops_bf16 += df["SQ_INSTS_VALU_MFMA_MOPS_BF16"][idx] * 512
mfma_flops_f32 += df["SQ_INSTS_VALU_MFMA_MOPS_F32"][idx] * 512
@@ -413,6 +464,7 @@ def calc_ai(mspec, sort_type, ret_df):
calls,
total_flops / calls,
valu_flops / calls,
mfma_flops_f6f4 / calls,
mfma_flops_f8 / calls,
mfma_flops_f16 / calls,
mfma_flops_bf16 / calls,
@@ -432,11 +484,13 @@ def calc_ai(mspec, sort_type, ret_df):
kernelName, idx, calls
)
)
total_flops = valu_flops = mfma_flops_f8 = mfma_flops_bf16 = (
mfma_flops_f16
) = mfma_iops_i8 = mfma_flops_f32 = mfma_flops_f64 = lds_data = (
L1cache_data
) = L2cache_data = hbm_data = calls = totalDuration = avgDuration = 0.0
total_flops = valu_flops = mfma_flops_f6f4 = mfma_flops_f8 = (
mfma_flops_bf16
) = mfma_flops_f16 = mfma_iops_i8 = mfma_flops_f32 = mfma_flops_f64 = (
lds_data
) = L1cache_data = L2cache_data = hbm_data = calls = totalDuration = (
avgDuration
) = 0.0
if sort_type == "dispatches":
myList.append(
@@ -445,6 +499,7 @@ def calc_ai(mspec, sort_type, ret_df):
calls,
total_flops,
valu_flops,
mfma_flops_f6f4,
mfma_flops_f8,
mfma_flops_f16,
mfma_flops_bf16,
@@ -459,11 +514,13 @@ def calc_ai(mspec, sort_type, ret_df):
avgDuration,
)
)
total_flops = valu_flops = mfma_flops_f8 = mfma_flops_bf16 = (
mfma_flops_f16
) = mfma_iops_i8 = mfma_flops_f32 = mfma_flops_f64 = lds_data = (
L1cache_data
) = L2cache_data = hbm_data = calls = totalDuration = avgDuration = 0.0
total_flops = valu_flops = mfma_flops_f6f4 = mfma_flops_f8 = (
mfma_flops_bf16
) = mfma_flops_f16 = mfma_iops_i8 = mfma_flops_f32 = mfma_flops_f64 = (
lds_data
) = L1cache_data = L2cache_data = hbm_data = calls = totalDuration = (
avgDuration
) = 0.0
myList.sort(key=lambda x: x.totalDuration, reverse=True)