Add single kernel filtering to roofline plots (#757)

* Add single kernel filtering for roofline * Add --kernel to documentation * Add kernel labels to roofline pdfs Signed-off-by: Carrie Fallows <Carrie.Fallows@amd.com> * Add test cases Signed-off-by: Carrie Fallows <Carrie.Fallows@amd.com> * Add autodetect for mode (profile or analyze) during roof validate and filter Prevent --kernel from affecting roofline in gui mode- although this may be broken in develop branch anyways Signed-off-by: Carrie Fallows <Carrie.Fallows@amd.com> * Add note about roof-only usage checking for existing profiling files in the dir. If roof-only is not provided, rocprof-compute currently assumes it has to profile in full regardless. Will look into this another day. Signed-off-by: Carrie Fallows <Carrie.Fallows@amd.com> * Update CHANGELOG.md Add line in resolved issues section to highlight that kernel filtering is now working for roofline plots * Apply changes suggested by docs team Signed-off-by: Carrie Fallows <Carrie.Fallows@amd.com> * Update projects/rocprofiler-compute/CHANGELOG.md Co-authored-by: Pratik Basyal <pratik.basyal@amd.com> --------- Signed-off-by: Carrie Fallows <Carrie.Fallows@amd.com> Co-authored-by: Pratik Basyal <pratik.basyal@amd.com>
2025-08-27 13:41:07 -04:00
commit c68ba44e72
@@ -115,6 +115,8 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.
 * Fixed L2 read/write/atomic bandwidths on MI350
 * Update metric names for better alignment between analysis configuration and documentation
 * Fixed an issue where accumulation counters could not be collected on AMD Instinct MI100
+* Updated Roofline plots to handle and apply kernel filtering.
+

 ### Known issues

@@ -477,7 +477,9 @@ Standalone roofline
 Roofline analysis occurs on any profile mode run, provided ``--no-roof`` option is not included.
 You don't need to include any additional roofline-specific options for roofline analysis.
 If you want to focus only on roofline-specific performance data and reduce the time it takes to profile, you can use the ``--roof-only`` option.
-This option limits the profiling to just the roofline performance counters.
+This option checks the workload directory for existing profiling data (``pmc_perf.csv`` and ``roofline.csv``):
+
+a) If the data is found, it is reused with the provided arguments to create another roofline PDF output.
+b) Otherwise, profile mode runs but is limited to collecting only roofline performance counters.

 Roofline options
 ----------------
@@ -494,6 +496,10 @@ Roofline options
   Allows you to specify a device ID to collect performance data from when
   running a roofline benchmark on your system.

+``-k``, ``--kernel <kernel-substr>``
+   Allows for kernel filtering. Usage is equivalent to that of the current
+   ``rocprof`` utility. See :ref:`profiling-kernel-filtering`.
+
 ``--roofline-data-type <datatype>``
   Allows you to specify data types that you want plotted in the roofline PDF output(s). Selecting more than one data type will overlay the results onto the same plot. Default: FP32

@@ -193,6 +193,7 @@ class webui_analysis(OmniAnalyze_Base):
                        "include_kernel_names": False,
                        "is_standalone": False,
                        "roofline_data_type": self.__roofline_data_type,
+                        "kernel_filter": False,
                    }
                )
                roof_obj = self.get_socs()[self.arch].roofline_obj
@@ -85,6 +85,7 @@ class Roofline:
                "include_kernel_names": False,
                "is_standalone": False,
                "roofline_data_type": ["FP32"],  # default to FP32
+                "kernel_filter": False,
            }
        )
        self.__ai_data = None
@@ -102,13 +103,19 @@ class Roofline:
        if hasattr(self.__args, "sort") and self.__args.sort != "ALL":
            self.__run_parameters["sort_type"] = self.__args.sort
        self.__run_parameters["roofline_data_type"] = self.__args.roofline_data_type
+        if (hasattr(self.__args, "kernel") and self.__args.kernel) or (
+            hasattr(self.__args, "gpu_kernel") and self.__args.gpu_kernel
+        ):
+            self.__run_parameters["kernel_filter"] = True
        self.validate_parameters()

    def validate_parameters(self):
        if self.__run_parameters["include_kernel_names"] and (
            not self.__run_parameters["is_standalone"]
        ):
-            console_error("--kernel-names cannot be used with --no-roof option")
+            console_warning(
+                "--kernel-names is nonactionable when used with --no-roof option"
+            )

    def roof_setup(self):
        # Setup the workload directory for roofline profiling.
@@ -165,6 +172,48 @@ class Roofline:
        # Create the directory
        Path(final_dir).mkdir(parents=True, exist_ok=True)

+    def validate_apply_kernel_filter(self, df, path=None):
+        if self.__run_parameters["kernel_filter"] is True:
+            if self.__args.mode == "profile":
+                df_pmc = df["pmc_perf"]
+                df_filtered = df_pmc.copy()
+                df_list = (df_pmc.loc[:, "Kernel_Name"]).to_list()
+                for idx in range(0, len(df_list)):
+                    if df_list[idx].split("(")[0] not in self.__args.kernel:
+                        # Drop row from dataframe if kernel has not been requested
+                        df_filtered.drop(index=idx, inplace=True)
+                # Verify that final filtered kernel df matches the kernel list requested
+                if len(df_filtered.drop_duplicates(subset=["Kernel_Name"])) != len(
+                    self.__args.kernel
+                ):
+                    console_debug(
+                        "Profiled kernels: {}\n`--kernel`: {}".format(
+                            df_list, self.__args.kernel
+                        )
+                    )
+                    console_error(
+                        "Roofline cannot profile - kernels requested with `--kernel` missing from profiling data!"  # noqa: E501
+                        "\n\tRe-profile workload in full or specify subset of available kernels using `--kernel` option."  # noqa: E501
+                        "\n\tComplete profiled kernels list can be found in pmc_perf file.",  # noqa: E501
+                        exit=True,
+                    )
+                # Fix df structure to resemble same df arg passed in
+                df["pmc_perf"] = df_filtered
+            elif self.__args.mode == "analyze":
+                top_kernels_csv = Path(path).joinpath("pmc_kernel_top.csv")
+                if not top_kernels_csv.is_file():
+                    console_error(
+                        "roofline", "{} does not exist".format(top_kernels_csv)
+                    )
+                k_df = pd.read_csv(top_kernels_csv)
+                k_df = k_df.loc[self.__args.gpu_kernel[0], "Kernel_Name"]
+
+                df["pmc_perf"] = df["pmc_perf"][
+                    df["pmc_perf"]["Kernel_Name"].isin(k_df)
+                ]
+
+        return df
+
    @demarcate
    def empirical_roofline(
        self,
@@ -183,6 +232,10 @@ class Roofline:
        console_debug(
            "roofline", "Path: %s" % self.__run_parameters.get("workload_dir")
        )
+        # Verify kernels have been profiled and filter the df
+        ret_df = self.validate_apply_kernel_filter(
+            df=ret_df, path=self.__run_parameters.get("workload_dir")
+        )
        self.__ai_data = calc_ai_profile(
            self.__mspec, self.__run_parameters.get("sort_type"), ret_df
        )
@@ -192,7 +245,7 @@ class Roofline:
        console_debug(msg)

        ops_figure = flops_figure = None
-        ops_dt_list = flops_dt_list = ""
+        ops_dt_list = flops_dt_list = kernel_list = ""

        for dt in self.__run_parameters.get("roofline_data_type", []):
            gpu_arch = getattr(self.__mspec, "gpu_arch", "unknown_arch")
@@ -245,6 +298,9 @@ class Roofline:
                original_kernel_names = []
            else:
                original_kernel_names = self.__ai_data.get("kernelNames", [])
+                if self.__run_parameters.get("kernel_filter", False):
+                    for name in sorted(self.__args.kernel):
+                        kernel_list += "_" + name

            num_kernels = len(original_kernel_names)

@@ -376,18 +432,23 @@ class Roofline:
                if ops_figure:
                    ops_figure.write_image(
                        self.__run_parameters["workload_dir"]
-                        + "/empirRoof_gpu-{}{}.pdf".format(dev_id, ops_dt_list)
+                        + "/empirRoof_gpu-{}{}{}.pdf".format(
+                            dev_id, ops_dt_list, kernel_list
+                        )
                    )
                if flops_figure:
                    flops_figure.write_image(
                        self.__run_parameters["workload_dir"]
-                        + "/empirRoof_gpu-{}{}.pdf".format(dev_id, flops_dt_list)
+                        + "/empirRoof_gpu-{}{}{}.pdf".format(
+                            dev_id, flops_dt_list, kernel_list
+                        )
                    )

                # only save a legend if kernel_names option is toggled
                if self.__run_parameters["include_kernel_names"]:
                    self.__figure.write_image(
-                        self.__run_parameters["workload_dir"] + "/kernelName_legend.pdf"
+                        self.__run_parameters["workload_dir"]
+                        + "/kernelName_legend{}.pdf".format(kernel_list)
                    )
                time.sleep(1)
            console_log("roofline", "Empirical Roofline PDFs saved!")
@@ -697,6 +758,7 @@ class Roofline:
            if profiling_config.get("format_rocprof_output") == "rocpd":
                t_df["pmc_perf"] = rocpd_data.process_rocpd_csv(t_df["pmc_perf"])

+            t_df = self.validate_apply_kernel_filter(df=t_df, path=base_path)
            self.__ai_data = calc_ai_profile(
                self.__mspec, self.__run_parameters["sort_type"], t_df
            )
@@ -895,9 +895,48 @@ def test_roofline_empty_kernel_names_handling(binary_handler_profile_rocprof_com
    ]
    workload_dir = test_utils.get_output_dir()

+    returncode = binary_handler_profile_rocprof_compute(  # noqa: F841
+        config, workload_dir, options, check_success=True, roof=True
+    )
+
+    test_utils.clean_output_dir(config["cleanup"], workload_dir)
+
+
+@pytest.mark.misc
+def test_roofline_kernel_filter(binary_handler_profile_rocprof_compute):
+    """
+    Test roofline multi-attempt profiling with `--kernel`
+    Expect to be able to re-profile from same workload if kernels are valid.
+    (Validity of --kernels tested in test_roofline_kernel_filter_error_handling already)
+    """
+    if soc in ("MI100"):
+        pytest.skip("Skipping roofline test for MI100")
+        return
+
+    options = [
+        "--device",
+        "0",
+        "--roof-only",
+        "--kernel-names",
+    ]
+    workload_dir = test_utils.get_output_dir()
+
+    returncode = binary_handler_profile_rocprof_compute(  # noqa: F841
+        config, workload_dir, options, check_success=True, roof=True
+    )
+    # Don't clean output dir, use same workload
+    options.extend(["--kernel", config["kernel_name_1"]])
+    returncode = binary_handler_profile_rocprof_compute(  # noqa: F841
+        config, workload_dir, options, check_success=True, roof=True
+    )
+
+    # Test nonexistent kernel on roof profile using existing profiling data
+    # Since already profiled, throw error if non-existent kernel requested for roofline
+    options.append("nonexistent_kernel_name_that_should_not_match_anything")
    returncode = binary_handler_profile_rocprof_compute(  # noqa: F841
        config, workload_dir, options, check_success=False, roof=True
    )
+    assert returncode == 1

    test_utils.clean_output_dir(config["cleanup"], workload_dir)

@@ -934,6 +973,10 @@ def test_roof_plot_modes(binary_handler_profile_rocprof_compute):
        assert True
        return

+    # Test `--kernel` filtering outputs are present and labelled correctly
+    filter_kernelName = "kernelName_legend_" + config["kernel_name_1"]
+    filter_empirRoof = "empirRoof_gpu-0_" + config["kernel_name_1"]
+
    plot_configurations = [
        {
            "options": ["--device", "0", "--roof-only", "--roofline-data-type", "FP32"],
@@ -944,8 +987,15 @@ def test_roof_plot_modes(binary_handler_profile_rocprof_compute):
            "expected_files": ["empirRoof_gpu-0_FP16.pdf"],
        },
        {
-            "options": ["--device", "0", "--roof-only", "--kernel-names"],
-            "expected_files": ["kernelName_legend.pdf"],
+            "options": [
+                "--device",
+                "0",
+                "--roof-only",
+                "--kernel-names",
+                "--kernel",
+                config["kernel_name_1"],
+            ],
+            "expected_files": [filter_kernelName, filter_empirRoof],
        },
    ]