[rocprofiler-compute] Fixes for roofline when used with iteration multiplexing (#2635)

* Added iteration_multiplex_impute_counters on PMC data; the GUI dataframe did not previously apply this in the build_layout method. * Created a Workload() in profile-mode post-processing so that the standalone roofline HTML plot can be generated; this will be removed once the roofline plot is moved to the analyze phase in a future release. * Added an iteration_multiplexing run parameter to the roofline object init so that the dataframe is parsed accurately if the option was used during profiling; this avoids reading NaN values in calc_ai_profile for dispatches that did not get imputed. * Cleaned up unused legacy code and adjusted method parameters to assist in moving roofline plotting to analyze mode in a future release. * Updated the iteration multiplexing data imputation algorithm to impute counters for ungrouped dispatches at the end based on the previous group. However, this won't work if there are no dispatches that can be grouped (i.e. number of dispatches < number of counter buckets). --------- Signed-off-by: Carrie Fallows <Carrie.Fallows@amd.com> Co-authored-by: Vignesh Edithal <Vignesh.Edithal@amd.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2026-01-23 11:10:46 -05:00
@@ -146,6 +146,12 @@ class webui_analysis(OmniAnalyze_Base):
                    base_data[base_run].raw_pmc
                )

+            if self._profiling_config["iteration_multiplexing"] is not None:
+                base_data[base_run].raw_pmc = self.iteration_multiplex_impute_counters(
+                    base_data[base_run].raw_pmc,
+                    policy=self._profiling_config["iteration_multiplexing"],
+                )
+
            # Apply filters to workload data
            console_debug("analysis", f"gui dispatch filter is {disp_filt}")
            console_debug("analysis", f"gui kernel filter is {kernel_filter}")
@@ -224,6 +230,9 @@ class webui_analysis(OmniAnalyze_Base):
                            "is_standalone": False,
                            "roofline_data_type": self.__roofline_data_type,
                            "kernel_filter": False,
+                            "iteration_multiplexing": self._profiling_config[
+                                "iteration_multiplexing"
+                            ],
                        }
                    )
                    roof_obj = soc[self.arch].roofline_obj
@@ -37,6 +37,7 @@ import yaml
 import config
 from roofline import Roofline
 from utils.amdsmi_interface import amdsmi_ctx, get_gpu_model, get_mem_max_clock
+from utils.file_io import create_df_pmc, load_profiling_config
 from utils.logger import (
    console_debug,
    console_error,
@@ -45,15 +46,18 @@ from utils.logger import (
    demarcate,
 )
 from utils.mi_gpu_spec import mi_gpu_specs
-from utils.parser import BUILD_IN_VARS, SUPPORTED_DENOM
+from utils.parser import BUILD_IN_VARS, SUPPORTED_DENOM, apply_filters
 from utils.roofline_calc import validate_roofline_csv
+from utils.schema import Workload
 from utils.specs import MachineSpecs
 from utils.utils import (
    METRIC_ID_RE,
    add_counter_extra_config_input_yaml,
    convert_metric_id_to_panel_info,
    get_panel_alias,
+    impute_counters_iteration_multiplex,
    is_tcc_channel_counter,
+    merge_counters_spatial_multiplex,
    parse_sets_yaml,
 )

@@ -701,7 +705,32 @@ class OmniSoC_Base:
                )
                return

-            self.roofline_obj.post_processing()
+            args = self.get_args()
+            workload = Workload()
+            workload.path = self.__args.path
+            profiling_config = load_profiling_config(workload.path)
+            workload.raw_pmc = create_df_pmc(
+                raw_data_root_dir=workload.path,
+                nodes=None,
+                spatial_multiplexing=args.spatial_multiplexing,
+                kernel_verbose=-1,
+                verbose=args.verbose,
+                config_dict=profiling_config,
+            )
+
+            if args.spatial_multiplexing:
+                workload.raw_pmc = merge_counters_spatial_multiplex(workload.raw_pmc)
+
+            if profiling_config["iteration_multiplexing"] is not None:
+                workload.raw_pmc = impute_counters_iteration_multiplex(
+                    workload.raw_pmc,
+                    policy=profiling_config["iteration_multiplexing"],
+                )
+            filtered_pmc = apply_filters(
+                workload, workload.path, is_gui=False, debug=False
+            )
+
+            self.roofline_obj.post_processing(filtered_pmc)

    @abstractmethod
    def analysis_setup(self, roofline_parameters: Optional[dict[str, Any]]) -> None:
@@ -25,7 +25,6 @@
 import argparse
 import textwrap
 from abc import abstractmethod
-from collections import OrderedDict
 from pathlib import Path
 from typing import Any, Optional, Union

@@ -36,7 +35,7 @@ import plotly.graph_objects as go
 from dash import dcc, html
 from plotly.subplots import make_subplots

-from utils import file_io, rocpd_data, schema
+from utils import schema
 from utils.logger import (
    console_debug,
    console_error,
@@ -96,6 +95,7 @@ class Roofline:
                "is_standalone": False,
                "roofline_data_type": ["FP32"],  # default to FP32
                "kernel_filter": False,
+                "iteration_multiplexing": None,
            }
        )
        self.__ai_data: Optional[dict[str, Any]] = None
@@ -116,6 +116,13 @@ class Roofline:
            hasattr(self.__args, "gpu_kernel") and self.__args.gpu_kernel
        ):
            self.__run_parameters["kernel_filter"] = True
+        if (
+            hasattr(self.__args, "iteration_multiplexing")
+            and self.__args.iteration_multiplexing is not None
+        ):
+            self.__run_parameters["iteration_multiplexing"] = (
+                self.__args.iteration_multiplexing
+            )

    def get_args(self) -> argparse.Namespace:
        return self.__args
@@ -286,11 +293,7 @@ class Roofline:
        Generate a set of empirical roofline plots given a directory containing
        required profiling and benchmarking data.
        """
-        if (
-            not isinstance(self.__run_parameters["workload_dir"], list)
-            and self.__run_parameters["workload_dir"] != None
-        ):
-            self.roof_setup()
+        self.roof_setup()

        console_debug("roofline", f"Path: {self.__run_parameters.get('workload_dir')}")

@@ -300,7 +303,10 @@ class Roofline:
        )

        self.__ai_data = calc_ai_profile(
-            self.__mspec, self.__run_parameters.get("sort_type"), ret_df
+            self.__mspec,
+            self.__run_parameters.get("sort_type"),
+            ret_df,
+            self.__run_parameters["iteration_multiplexing"],
        )

        msg = "AI at each mem level:"
@@ -1133,14 +1139,17 @@ class Roofline:
    def cli_generate_plot(
        self,
        dtype: str,
-        workload: Optional[schema.Workload] = None,
-        config: Optional[dict[str, Any]] = None,
-        arch_config: Optional[schema.ArchConfig] = None,
+        workload: schema.Workload,
+        config: dict[str, Any],
+        arch_config: schema.ArchConfig,
    ) -> Optional[str]:
        """
        Plot CLI mode roofline analysis in terminal using plotext

        :param dtype: The datatype to be profiled
+        :param workload: Complete dataframe
+        :param config: Profiling configuration from profiling_config.yaml
+        :param arch_config: Architecture-specific configurations
        :type method: str
        :return: Build the current figure using plot.build(),
        or None if datatype is not valid for the architecture
@@ -1200,33 +1209,13 @@ class Roofline:
            console_warning("roofline", "Skipping plot generation")
            return None

-        # if workload is detected, utilize Roofline yamls.
-        # If not, fallback to legacy calc_ai
-        if workload and config and arch_config:
-            self.__ai_data = calc_ai_analyze(
-                workload=workload,
-                mspec=self.__mspec,
-                sort_type=str(self.__run_parameters.get("sort_type")),
-                config=config,
-                arch_config=arch_config,
-            )
-
-        else:
-            pmc_perf_csv = base_path / "pmc_perf.csv"
-            if not pmc_perf_csv.is_file():
-                console_error("roofline", f"{pmc_perf_csv} does not exist")
-
-            t_df = OrderedDict()
-            t_df["pmc_perf"] = pd.read_csv(pmc_perf_csv)
-
-            profiling_config = file_io.load_profiling_config(self.__args.path[0][0])
-            if profiling_config.get("format_rocprof_output") == "rocpd":
-                t_df["pmc_perf"] = rocpd_data.process_rocpd_csv(t_df["pmc_perf"])
-
-            t_df = self.validate_apply_kernel_filter(df=t_df, path_str=str(base_path))
-            self.__ai_data = calc_ai_profile(
-                self.__mspec, self.__run_parameters["sort_type"], t_df
-            )
+        self.__ai_data = calc_ai_analyze(
+            workload=workload,
+            mspec=self.__mspec,
+            sort_type=str(self.__run_parameters.get("sort_type")),
+            config=config,
+            arch_config=arch_config,
+        )

        self.__ceiling_data = construct_roof(
            roofline_parameters=self.__run_parameters, dtype=dtype
@@ -1402,38 +1391,29 @@ class Roofline:
        return plt.build()

    @demarcate
-    def standalone_roofline(self) -> None:
-        if (
-            not isinstance(self.__run_parameters["workload_dir"], list)
-            and self.__run_parameters["workload_dir"] != None
-        ):
-            self.roof_setup()
+    def standalone_roofline(
+        self,
+        df: dict[str, pd.DataFrame],
+    ) -> None:
+        self.roof_setup()

        # Change vL1D to a interpretable str, if required
        if "vL1D" in self.__run_parameters["mem_level"]:
            self.__run_parameters["mem_level"].remove("vL1D")
            self.__run_parameters["mem_level"].append("L1")

-        app_path = Path(str(self.__run_parameters["workload_dir"])) / "pmc_perf.csv"
-        if not app_path.is_file():
-            console_error("roofline", f"{app_path} does not exist")
-
-        t_df = OrderedDict()
-        t_df["pmc_perf"] = pd.read_csv(app_path)
-
-        profiling_config = file_io.load_profiling_config(self.__args.path)
-        if profiling_config.get("format_rocprof_output") == "rocpd":
-            t_df["pmc_perf"] = rocpd_data.process_rocpd_csv(t_df["pmc_perf"])
-
-        self.empirical_roofline(ret_df=t_df)
+        self.empirical_roofline(ret_df=df)

    # NB: Currently the post_prossesing() method is the only one being used by
    # rocprofiler-compute, we include pre_processing() and profile() methods for
    # those who wish to borrow the roofline module
    @abstractmethod
-    def post_processing(self) -> None:
+    def post_processing(
+        self,
+        filtered_pmc: pd.DataFrame,
+    ) -> None:
        if self.__run_parameters["is_standalone"]:
-            self.standalone_roofline()
+            self.standalone_roofline(filtered_pmc)

    def get_dtype(self) -> list[str]:
        return self.__run_parameters["roofline_data_type"]
@@ -251,7 +251,9 @@ def create_df_pmc(
                    tmp_df = rocpd_data.process_rocpd_csv(tmp_df)

                # Demangle original KernelNames
-                kernel_name_shortener(tmp_df, kernel_verbose)
+                # Skip for Standalone Roofline with -1 to keep full kernel names
+                if kernel_verbose >= 0:
+                    kernel_name_shortener(tmp_df, kernel_verbose)

                # NB:
                #   Idealy, the Node column should be added out of
@@ -466,7 +466,10 @@ def calc_ai_analyze(


 def calc_ai_profile(
-    mspec: MachineSpecs, sort_type: str, ret_df: dict[str, pd.DataFrame]
+    mspec: MachineSpecs,
+    sort_type: str,
+    ret_df: dict[str, pd.DataFrame],
+    iteration_multiplexing: str,
 ) -> dict[str, Union[list[list[float]], list[str]]]:
    """Given counter data, calculate arithmetic intensity for each kernel
    in the application. Leverage hard-coded equations to calculate AI values.
@@ -505,6 +508,10 @@ def calc_ai_profile(
        next_kernel_name = df["Kernel_Name"][idx + 1] if not at_end else ""
        kernel_name = df["Kernel_Name"][idx]

+        # Skip this kernel dispatch row if any counter value is n/a
+        if df.iloc[idx].isna().any():
+            continue
+
        try:
            total_flops += (
                (
@@ -546,7 +553,8 @@ def calc_ai_profile(
        except KeyError as e:
            console_debug(
                "roofline",
-                f"{kernel_name[:35]}: Skipped total_flops at index {idx} due to {e}",
+                f"{kernel_name[:35]}: Skipped total_flops at index \
+                    {idx} due to {e}",
            )
            pass
        try:
@@ -615,7 +623,8 @@ def calc_ai_profile(
        except KeyError as e:
            console_debug(
                "roofline",
-                f"{kernel_name[:35]}: Skipped L1cache_data at index {idx} due to {e}",
+                f"{kernel_name[:35]}: Skipped L1cache_data at index \
+                    {idx} due to {e}",
            )
            pass

@@ -629,7 +638,8 @@ def calc_ai_profile(
        except KeyError as e:
            console_debug(
                "roofline",
-                f"{kernel_name[:35]}: Skipped L2cache_data at index {idx} due to {e}",
+                f"{kernel_name[:35]}: Skipped L2cache_data at index \
+                    {idx} due to {e}",
            )
            pass
        try:
@@ -1502,6 +1502,7 @@ def impute_counters_iteration_multiplex(
            }
            # Collect imputed sub-groups as dataframes
            subgroup_dfs = []
+            previous_fill_values = {}
            for i in range(0, len(group), subgroup_size):
                subgroup = group.iloc[i : i + subgroup_size]

@@ -1517,7 +1518,22 @@ def impute_counters_iteration_multiplex(
                if fill_values:
                    subgroup = subgroup.fillna(fill_values)

+                # If this is the last subgroup and it still has missing values,
+                # use previous subgroup's fill values
+                # NOTE: This won't work if the first subgroup is itself incomplete
+                is_last_subgroup = (i + subgroup_size) >= len(group)
+                # First any() returns bool pd.Series for every column,
+                # second any() returns single bool
+                if (
+                    is_last_subgroup
+                    and previous_fill_values
+                    and subgroup.isna().any().any()
+                ):
+                    # Use previous subgroup's fill values for remaining missing values
+                    subgroup = subgroup.fillna(previous_fill_values)
+
                subgroup_dfs.append(subgroup)
+                previous_fill_values = fill_values

            # Concatenate all subgroups for this group
            if subgroup_dfs:
@@ -75,6 +75,8 @@ config["COUNTER_LOGGING"] = False
 config["METRIC_COMPARE"] = False
 config["METRIC_LOGGING"] = False

+arch_config = {}
+
 num_kernels = 3
 num_devices = 1

@@ -1326,6 +1328,7 @@ def test_roofline_missing_file_handling(binary_handler_profile_rocprof_compute):

    try:
        from roofline import Roofline
+        from utils.schema import Workload
        from utils.specs import generate_machine_specs

        class MockArgs:
@@ -1337,6 +1340,7 @@ def test_roofline_missing_file_handling(binary_handler_profile_rocprof_compute):

        args = MockArgs()
        mspec = generate_machine_specs(None, None)
+        workload = Workload()

        workload_dir = test_utils.get_output_dir()

@@ -1351,7 +1355,9 @@ def test_roofline_missing_file_handling(binary_handler_profile_rocprof_compute):

        roofline_instance = Roofline(args, mspec, run_parameters)

-        result = roofline_instance.cli_generate_plot("FP32")
+        result = roofline_instance.cli_generate_plot(
+            "FP32", workload, config, arch_config
+        )

        assert result is None

@@ -1378,6 +1384,7 @@ def test_roofline_invalid_datatype_cli(binary_handler_profile_rocprof_compute):

    try:
        from roofline import Roofline
+        from utils.schema import Workload
        from utils.specs import generate_machine_specs

        class MockArgs:
@@ -1389,6 +1396,7 @@ def test_roofline_invalid_datatype_cli(binary_handler_profile_rocprof_compute):

        args = MockArgs()
        mspec = generate_machine_specs(None, None)
+        workload = Workload()

        run_parameters = {
            "workload_dir": test_utils.get_output_dir(),
@@ -1401,7 +1409,9 @@ def test_roofline_invalid_datatype_cli(binary_handler_profile_rocprof_compute):

        roofline_instance = Roofline(args, mspec, run_parameters)

-        result = roofline_instance.cli_generate_plot("INVALID_DATATYPE")
+        result = roofline_instance.cli_generate_plot(
+            "INVALID_DATATYPE", workload, config, arch_config
+        )

        assert result is None