diff --git a/projects/rocprofiler-compute/CHANGELOG.md b/projects/rocprofiler-compute/CHANGELOG.md index 98c1824514..cd38b5a8da 100644 --- a/projects/rocprofiler-compute/CHANGELOG.md +++ b/projects/rocprofiler-compute/CHANGELOG.md @@ -13,6 +13,10 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs. * -b option in profile mode also accept hardware IP block for filtering, however, this support will be deprecated soon * --list-metrics option added in profile mode to list possible metric id(s), similar to analyze mode +* Datatype selection option for roofline profiling + * --roofline-data-type / -R option added to specify which datatypes the user wants to capture in the roofline PDF plot outputs + * Default is FP32, but user can specify as many types as desired to overlay on the same plot output + ### Changed * Change normal_unit default to per_kernel diff --git a/projects/rocprofiler-compute/docs/data/profile/sample-roof-plot.jpg b/projects/rocprofiler-compute/docs/data/profile/sample-roof-plot.jpg index aa11a5b5cd..2deaba7ad2 100644 Binary files a/projects/rocprofiler-compute/docs/data/profile/sample-roof-plot.jpg and b/projects/rocprofiler-compute/docs/data/profile/sample-roof-plot.jpg differ diff --git a/projects/rocprofiler-compute/docs/how-to/profile/mode.rst b/projects/rocprofiler-compute/docs/how-to/profile/mode.rst index 0d6c72434e..8852704c83 100644 --- a/projects/rocprofiler-compute/docs/how-to/profile/mode.rst +++ b/projects/rocprofiler-compute/docs/how-to/profile/mode.rst @@ -474,6 +474,9 @@ Roofline options Allows you to specify a device ID to collect performance data from when running a roofline benchmark on your system. +``--roofline-data-type `` + Allows you to specify datatypes that you want plotted in the roofline PDF output(s). Selecting more than one datatype will overlay the results onto the same plot. Default: FP32 + To distinguish different kernels in your ``.pdf`` roofline plot use ``--kernel-names``. 
This will give each kernel a unique marker identifiable from the plot's key. @@ -507,8 +510,7 @@ successfully. $ ls workloads/vcopy/MI200/ total 48 - -rw-r--r-- 1 auser agroup 13331 Mar 1 16:05 empirRoof_gpu-0_fp32_fp64.pdf - -rw-r--r-- 1 auser agroup 13136 Mar 1 16:05 empirRoof_gpu-0_int8_fp16.pdf + -rw-r--r-- 1 auser agroup 13331 Mar 1 16:05 empirRoof_gpu-0_FP32.pdf drwxr-xr-x 1 auser agroup 0 Mar 1 16:03 perfmon -rw-r--r-- 1 auser agroup 1101 Mar 1 16:03 pmc_perf.csv -rw-r--r-- 1 auser agroup 1715 Mar 1 16:05 roofline.csv @@ -517,11 +519,9 @@ successfully. .. note:: - ROCm Compute Profiler generates three roofline outputs to organize results and reduce - clutter. One chart plots FP32/FP64 performance, one plots I8/FP16 - performance, and the other plots FP8 performance. + ROCm Compute Profiler collects roofline data for all data types, but you can reduce clutter in the PDF output by selecting specific datatype(s) with ``--roofline-data-type``. Selecting multiple floating-point datatypes overlays the results in the same PDF; integer datatypes such as I8 are plotted in a separate PDF. To get a separate PDF for each datatype from the same workload run, run the profiling command again with a single datatype; this works as long as ``roofline.csv`` still exists in the workload folder. -The following image is a sample ``empirRoof_gpu-0_int8_fp16.pdf`` roofline +The following image is a sample ``empirRoof_gpu-0_FP32.pdf`` roofline plot. ..
image:: ../../data/profile/sample-roof-plot.jpg diff --git a/projects/rocprofiler-compute/docs/how-to/use.rst b/projects/rocprofiler-compute/docs/how-to/use.rst index 4ac2b6bd1d..3101357c07 100644 --- a/projects/rocprofiler-compute/docs/how-to/use.rst +++ b/projects/rocprofiler-compute/docs/how-to/use.rst @@ -231,7 +231,7 @@ The following table lists ROCm Compute Profiler's basic operations, their * - :ref:`Standalone roofline analysis ` - ``profile`` - - ``--name``, ``--roof-only``, ``-- `` + - ``--name``, ``--roof-only``, ``--roofline-data-type ``, ``-- `` * - :ref:`Import a workload to database ` - ``database`` diff --git a/projects/rocprofiler-compute/src/argparser.py b/projects/rocprofiler-compute/src/argparser.py index 508981ed29..06110f4615 100644 --- a/projects/rocprofiler-compute/src/argparser.py +++ b/projects/rocprofiler-compute/src/argparser.py @@ -367,6 +367,19 @@ Examples: action="store_true", help="\t\t\tInclude kernel names in roofline plot.", ) + + roofline_group.add_argument( + "-R", + "--roofline-data-type", + required=False, + choices=["FP8", "FP16", "BF16", "FP32", "FP64", "I8"], + metavar="", + nargs="+", + type=str, + default=["FP32"], + help="\t\t\tChoose datatypes to generate plotted roofline PDFs for: (DEFAULT: FP32)\n\t\t\t FP8\n\t\t\t FP16\n\t\t\t BF16\n\t\t\t FP32\n\t\t\t FP64\n\t\t\t I8", + ) + # roofline_group.add_argument('-w', '--workgroups', required=False, default=-1, type=int, help="\t\t\tNumber of kernel workgroups (DEFAULT: 1024)") # roofline_group.add_argument('--wsize', required=False, default=-1, type=int, help="\t\t\tWorkgroup size (DEFAULT: 256)") # roofline_group.add_argument('--dataset', required=False, default = -1, type=int, help="\t\t\tDataset size (DEFAULT: 536M)") diff --git a/projects/rocprofiler-compute/src/roofline.py b/projects/rocprofiler-compute/src/roofline.py index cce906e4b7..57487013b4 100644 --- a/projects/rocprofiler-compute/src/roofline.py +++ b/projects/rocprofiler-compute/src/roofline.py @@ -33,7 
+33,13 @@ import pandas as pd import plotly.graph_objects as go from dash import dcc, html -from utils.roofline_calc import calc_ai, constuct_roof +from utils.roofline_calc import ( + MFMA_DATATYPES, + PEAK_OPS_DATATYPES, + SUPPORTED_DATATYPES, + calc_ai, + constuct_roof, +) from utils.utils import ( console_debug, console_error, @@ -60,6 +66,7 @@ class Roofline: "mem_level": "ALL", "include_kernel_names": False, "is_standalone": False, + "roofline_data_type": ["FP32"], } ) self.__ai_data = None @@ -76,7 +83,10 @@ class Roofline: self.__run_parameters["mem_level"] = self.__args.mem_level if hasattr(self.__args, "sort") and self.__args.sort != "ALL": self.__run_parameters["sort_type"] = self.__args.sort - + if hasattr( + self.__args, "roofline_data_type" + ) and self.__args.roofline_data_type != ["FP32"]: + self.__run_parameters["roofline_data_type"] = self.__args.roofline_data_type self.validate_parameters() def validate_parameters(self): @@ -121,19 +131,42 @@ class Roofline: msg += "\n\t%s -> %s" % (i, self.__ai_data[i]) console_debug(msg) - # Generate a roofline figure for each data type - fp32_fig = self.generate_plot(dtype="FP32") - ml_combo_fig_fp32_fp64 = self.generate_plot( - dtype="FP64", - fig=fp32_fig, - ) - fp16_fig = self.generate_plot(dtype="FP16") - ml_combo_fig_int8_fp16 = self.generate_plot( - dtype="I8", - fig=fp16_fig, - ) - if self.__mspec.gpu_series != "MI200": - fig_fp8 = self.generate_plot(dtype="FP8") + # Generate a roofline figure for the datatypes + ops_figure = flops_figure = None + ops_dt_list = flops_dt_list = "" + for dt in self.__run_parameters["roofline_data_type"]: + # Do not generate a roofline figure if the datatype is not supported on this gpu_arch + if not str(dt) in SUPPORTED_DATATYPES[self.__mspec.gpu_arch]: + console_error( + "{} is not a supported datatype for roofline profiling on {}".format( + str(dt), self.__mspec.gpu_model + ), + exit=False, + ) + continue + + ops_flops = "Ops" if (str(dt[:1]) == "I") else "Flops" + + if 
ops_flops == "Ops": + if ops_figure: + ops_combo_figure = self.generate_plot( + dtype=str(dt), + fig=ops_figure, + ) + ops_figure = ops_combo_figure + else: + ops_figure = self.generate_plot(dtype=str(dt)) + ops_dt_list += "_" + str(dt) + if ops_flops == "Flops": + if flops_figure: + flops_combo_figure = self.generate_plot( + dtype=str(dt), + fig=flops_figure, + ) + flops_figure = flops_combo_figure + else: + flops_figure = self.generate_plot(dtype=str(dt)) + flops_dt_list += "_" + str(dt) # Create a legend and distinct kernel markers. This can be saved, optionally self.__figure = go.Figure( @@ -160,80 +193,55 @@ class Roofline: if self.__run_parameters["is_standalone"]: dev_id = str(self.__run_parameters["device_id"]) - ml_combo_fig_fp32_fp64.write_image( - self.__run_parameters["workload_dir"] - + "/empirRoof_gpu-{}_fp32_fp64.pdf".format(dev_id) - ) - ml_combo_fig_int8_fp16.write_image( - self.__run_parameters["workload_dir"] - + "/empirRoof_gpu-{}_int8_fp16.pdf".format(dev_id) - ) - if self.__mspec.gpu_series != "MI200": - fig_fp8.write_image( - self.__run_parameters["workload_dir"] - + "/empirRoof_gpu-{}_fp8.pdf".format(dev_id) - ) - # only save a legend if kernel_names option is toggled - if self.__run_parameters["include_kernel_names"]: - self.__figure.write_image( - self.__run_parameters["workload_dir"] + "/kernelName_legend.pdf" - ) - time.sleep(1) # Re-save to remove loading MathJax pop up - ml_combo_fig_fp32_fp64.write_image( - self.__run_parameters["workload_dir"] - + "/empirRoof_gpu-{}_fp32_fp64.pdf".format(dev_id) - ) - ml_combo_fig_int8_fp16.write_image( - self.__run_parameters["workload_dir"] - + "/empirRoof_gpu-{}_int8_fp16.pdf".format(dev_id) - ) - if self.__mspec.gpu_series != "MI200": - fig_fp8.write_image( - self.__run_parameters["workload_dir"] - + "/empirRoof_gpu-{}_fp8.pdf".format(dev_id) - ) - if self.__run_parameters["include_kernel_names"]: - self.__figure.write_image( - self.__run_parameters["workload_dir"] + "/kernelName_legend.pdf" - ) 
+ for i in range(2): + if ops_figure: + ops_figure.write_image( + self.__run_parameters["workload_dir"] + + "/empirRoof_gpu-{}{}.pdf".format(dev_id, ops_dt_list) + ) + if flops_figure: + flops_figure.write_image( + self.__run_parameters["workload_dir"] + + "/empirRoof_gpu-{}{}.pdf".format(dev_id, flops_dt_list) + ) + + # only save a legend if kernel_names option is toggled + if self.__run_parameters["include_kernel_names"]: + self.__figure.write_image( + self.__run_parameters["workload_dir"] + "/kernelName_legend.pdf" + ) + time.sleep(1) console_log("roofline", "Empirical Roofline PDFs saved!") else: - if self.__mspec.gpu_series != "MI200": - fp8_graph = html.Div( + if ops_figure: + ops_graph = html.Div( className="float-child", children=[ - html.H3(children="Empirical Roofline Analysis (FP8)"), - dcc.Graph(figure=fig_fp8), + html.H3(children="Empirical Roofline Analysis (Ops)"), + dcc.Graph(figure=ops_figure), ], ) else: - fp8_graph = None + ops_graph = None + if flops_figure: + flops_graph = html.Div( + className="float-child", + children=[ + html.H3(children="Empirical Roofline Analysis (Flops)"), + dcc.Graph(figure=flops_figure), + ], + ) + else: + flops_graph = None return html.Section( id="roofline", children=[ html.Div( className="float-container", children=[ - html.Div( - className="float-child", - children=[ - html.H3( - children="Empirical Roofline Analysis (FP32/FP64)" - ), - dcc.Graph(figure=ml_combo_fig_fp32_fp64), - ], - ), - html.Div( - className="float-child", - children=[ - html.H3( - children="Empirical Roofline Analysis (FP16/INT8)" - ), - dcc.Graph(figure=ml_combo_fig_int8_fp16), - ], - ), - fp8_graph, + ops_graph, + flops_graph, ], ) ], @@ -284,9 +292,10 @@ class Roofline: ) ) + ops_flops = "OP" if (dtype[:1] == "I") else "FLOP" + # Plot peak VALU ceiling - # VALU info I8/FP16 not collected via microbench - if dtype != "FP16" and dtype != "I8": + if dtype in PEAK_OPS_DATATYPES: fig.add_trace( go.Scatter( x=self.__ceiling_data["valu"][0], @@ 
-298,20 +307,18 @@ class Roofline: ( None if self.__run_parameters["is_standalone"] - else "{} GFLOP/s".format( - to_int(self.__ceiling_data["valu"][2]) + else "{} G{}/s".format( + to_int(self.__ceiling_data["valu"][2]), ops_flops ) ), - "{} GFLOP/s".format(to_int(self.__ceiling_data["valu"][2])), + "{} G{}/s".format( + to_int(self.__ceiling_data["valu"][2]), ops_flops + ), ], textposition="top left", ) ) - if dtype == "FP16": - pos = "bottom left" - else: - pos = "top left" # Plot peak MFMA ceiling fig.add_trace( go.Scatter( @@ -324,26 +331,26 @@ class Roofline: ( None if self.__run_parameters["is_standalone"] - else "{} GFLOP/s".format(to_int(self.__ceiling_data["mfma"][2])) + else "{} G{}/s".format( + to_int(self.__ceiling_data["mfma"][2]), ops_flops + ) ), - "{} GFLOP/s".format(to_int(self.__ceiling_data["mfma"][2])), + "{} G{}/s".format(to_int(self.__ceiling_data["mfma"][2]), ops_flops), ], - textposition=pos, + textposition="top left", ) ) ####################### # Plot Application AI ####################### - if dtype != "I8" and dtype != "FP64": - # Plot the arithmetic intensity points for each cache level - # Omitting I8 AIs to clean up graph. FP16 tends to be higher.
+ # Plot the arithmetic intensity points for each cache level + if ops_flops == "FLOP": fig.add_trace( go.Scatter( x=self.__ai_data["ai_l1"][0], y=self.__ai_data["ai_l1"][1], - name="ai_l1", + name=dtype + "_ai_l1", mode="markers", - marker={"color": "#00CC96"}, marker_symbol=( SYMBOLS if self.__run_parameters["include_kernel_names"] else None ), @@ -353,9 +360,8 @@ class Roofline: go.Scatter( x=self.__ai_data["ai_l2"][0], y=self.__ai_data["ai_l2"][1], - name="ai_l2", + name=dtype + "_ai_l2", mode="markers", - marker={"color": "#EF553B"}, marker_symbol=( SYMBOLS if self.__run_parameters["include_kernel_names"] else None ), @@ -365,22 +371,30 @@ class Roofline: go.Scatter( x=self.__ai_data["ai_hbm"][0], y=self.__ai_data["ai_hbm"][1], - name="ai_hbm", + name=dtype + "_ai_hbm", mode="markers", - marker={"color": "#636EFA"}, marker_symbol=( SYMBOLS if self.__run_parameters["include_kernel_names"] else None ), ) ) - # Set layout - fig.update_layout( - xaxis_title="Arithmetic Intensity (FLOPs/Byte)", - yaxis_title="Performance (GFLOP/sec)", - hovermode="x unified", - margin=dict(l=50, r=50, b=50, t=50, pad=4), - ) + # Set layout + fig.update_layout( + xaxis_title="Arithmetic Intensity (FLOPs/Byte)", + yaxis_title="Performance (GFLOP/sec)", + hovermode="x unified", + margin=dict(l=50, r=50, b=50, t=50, pad=4), + ) + else: + # Set layout + fig.update_layout( + xaxis_title="Bandwidth (GB/sec)", + yaxis_title="Performance (GOP/sec)", + hovermode="x unified", + margin=dict(l=50, r=50, b=50, t=50, pad=4), + ) + fig.update_xaxes(type="log", autorange=True) fig.update_yaxes(type="log", autorange=True) diff --git a/projects/rocprofiler-compute/src/utils/roofline_calc.py b/projects/rocprofiler-compute/src/utils/roofline_calc.py index 1c99ac6247..03ffb54054 100644 --- a/projects/rocprofiler-compute/src/utils/roofline_calc.py +++ b/projects/rocprofiler-compute/src/utils/roofline_calc.py @@ -41,7 +41,17 @@ FONT_SIZE = 16 FONT_COLOR = "black" FONT_WEIGHT = "bold" -SUPPORTED_SOC = 
["mi200", "mi300"] +# SUPPORTED_DATATYPES table is based on datatype support in rocm-amdgpu-bench repository +# Indicates which datatypes per gpu arch can be generated by the roofline binary +SUPPORTED_DATATYPES = { + "gfx90a": ["FP16", "BF16", "FP32", "FP64", "I8"], # Unsupported: FP8 + "gfx940": ["FP8", "FP16", "FP32", "FP64"], # Unsupported: BF16, I8 + "gfx941": ["FP8", "FP16", "FP32", "FP64"], # Unsupported: BF16, I8 + "gfx942": ["FP8", "FP16", "FP32", "FP64"], # Unsupported: BF16, I8 +} + +PEAK_OPS_DATATYPES = ["FP8", "FP32", "FP64"] +MFMA_DATATYPES = ["FP8", "FP16", "BF16", "FP32", "FP64", "I8"] TOP_N = 10 @@ -106,31 +116,25 @@ def calc_ceilings(roofline_parameters, dtype, benchmark_data): x1 = y1 = x2 = y2 = -1 x1_mfma = y1_mfma = x2_mfma = y2_mfma = -1 - target_precision = dtype[2:] - if dtype != "FP16" and dtype != "I8": - peakOps = float(benchmark_data[dtype + "Flops"][roofline_parameters["device_id"]]) + ops_flops = "Ops" if (dtype[:1] == "I") else "Flops" + + if dtype in PEAK_OPS_DATATYPES: + peakOps = float( + benchmark_data[dtype + "{}".format(ops_flops)][ + roofline_parameters["device_id"] + ] + ) for i in range(0, len(cacheHierarchy)): # Plot BW line console_debug("roofline", "Current cache level is %s" % cacheHierarchy[i]) curr_bw = cacheHierarchy[i] + "Bw" peakBw = float(benchmark_data[curr_bw][roofline_parameters["device_id"]]) - if dtype == "I8": - peakMFMA = float( - benchmark_data["MFMAI8Ops"][roofline_parameters["device_id"]] - ) - else: - peakMFMA = float( - benchmark_data["MFMAF{}Flops".format(target_precision)][ - roofline_parameters["device_id"] - ] - ) - x1 = float(XMIN) y1 = float(XMIN) * peakBw - # Note: No reg peakOps for FP16 or INT8 - if dtype != "FP16" and dtype != "I8": + + if dtype in PEAK_OPS_DATATYPES: x2 = peakOps / peakBw y2 = peakOps @@ -138,8 +142,16 @@ def calc_ceilings(roofline_parameters, dtype, benchmark_data): x1_mfma = peakOps / peakBw y1_mfma = peakOps - x2_mfma = peakMFMA / peakBw - y2_mfma = peakMFMA + if dtype in
MFMA_DATATYPES: + target_precision = dtype if dtype[:1] in ("I", "B") else ("F" + dtype[2:]) + + peakMFMA = float( + benchmark_data["MFMA{}{}".format(target_precision, ops_flops)][ + roofline_parameters["device_id"] + ] + ) + x2_mfma = peakMFMA / peakBw + y2_mfma = peakMFMA # These are the points to use: console_debug("roofline", "coordinate points:") @@ -153,8 +165,7 @@ def calc_ceilings(roofline_parameters, dtype, benchmark_data): # ------------------------------------------------------------------------------------- # Plot computing roof # ------------------------------------------------------------------------------------- - # Note: No FMA roof for FP16 or INT8 - if dtype != "FP16" and dtype != "I8": + if dtype in PEAK_OPS_DATATYPES: # Plot FMA roof x0 = XMAX if x2 < x0: @@ -166,9 +177,7 @@ def calc_ceilings(roofline_parameters, dtype, benchmark_data): graphPoints["valu"].append(peakOps) # Plot MFMA roof - if ( - x1_mfma != -1 or dtype == "FP16" or dtype == "I8" - ): # assert that mfma has been assigned + if x1_mfma != -1 or (dtype in MFMA_DATATYPES): # assert that mfma has been assigned x0_mfma = XMAX if x2_mfma < x0_mfma: x0_mfma = x2_mfma @@ -206,6 +215,8 @@ def calc_ai(mspec, sort_type, ret_df): at_end = False next_kernelName = "" + supported_dt = SUPPORTED_DATATYPES.get(mspec.gpu_arch, []) + for idx in df.index: # CASE: Top kernels # Calculate + append AI data if @@ -251,7 +262,7 @@ def calc_ai(mspec, sort_type, ret_df): + (df["SQ_INSTS_VALU_MFMA_MOPS_F32"][idx] * 512) + (df["SQ_INSTS_VALU_MFMA_MOPS_F64"][idx] * 512) ) - if mspec.gpu_series != "MI200": + if "FP8" in supported_dt: total_flops += df["SQ_INSTS_VALU_MFMA_MOPS_F8"][idx] * 512 except KeyError: console_debug( @@ -291,7 +302,7 @@ def calc_ai(mspec, sort_type, ret_df): pass try: - if mspec.gpu_series != "MI200": + if "FP8" in supported_dt: mfma_flops_f8 += df["SQ_INSTS_VALU_MFMA_MOPS_F8"][idx] * 512 mfma_flops_f16 += df["SQ_INSTS_VALU_MFMA_MOPS_F16"][idx] * 512 mfma_flops_bf16 +=
df["SQ_INSTS_VALU_MFMA_MOPS_BF16"][idx] * 512 diff --git a/projects/rocprofiler-compute/tests/test_profile_general.py b/projects/rocprofiler-compute/tests/test_profile_general.py index 221ad193b8..1f0af76791 100644 --- a/projects/rocprofiler-compute/tests/test_profile_general.py +++ b/projects/rocprofiler-compute/tests/test_profile_general.py @@ -107,9 +107,7 @@ ALL_CSVS_MI300 = sorted( ROOF_ONLY_FILES = sorted( [ - "empirRoof_gpu-0_fp32_fp64.pdf", - "empirRoof_gpu-0_int8_fp16.pdf", - "empirRoof_gpu-0_fp8.pdf", + "empirRoof_gpu-0_FP32.pdf", "pmc_perf.csv", "pmc_perf_0.csv", "pmc_perf_1.csv",