[rocprofiler-compute] Fixes for roofline when used with iteration multiplexing (#2635)

* Added iteration_multiplex_impute_counters on pmc data - the GUI dataframe did not previously implement this in the build_layout method
* Created a Workload() in profile mode post-processing so that the standalone roofline HTML plot can be generated - this will be removed once the roofline plot is moved to the analyze phase in a future release
* Added an iteration_multiplexing run parameter to the roofline object init so that we can accurately parse the dataframe if the option was used during profiling - this helps calc_ai_profile avoid reading NaN values from dispatches that did not get imputed
* Cleaned up unused legacy code and adjusted method parameters to assist in moving roofline plotting to analyze mode in a future release
* Updated the iteration multiplexing data imputation algorithm to impute counters for ungrouped dispatches at the end based on the previous group. However, this won't work if there are no dispatches that can be grouped (i.e. number of dispatches < number of counter buckets)

---------

Signed-off-by: Carrie Fallows <Carrie.Fallows@amd.com>
Co-authored-by: Vignesh Edithal <Vignesh.Edithal@amd.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
cfallows-amd
2026-01-23 11:10:46 -05:00
committed by GitHub
parent fc4422d73b
commit 62dd4d114d
7 changed files with 123 additions and 67 deletions
@@ -146,6 +146,12 @@ class webui_analysis(OmniAnalyze_Base):
base_data[base_run].raw_pmc base_data[base_run].raw_pmc
) )
if self._profiling_config["iteration_multiplexing"] is not None:
base_data[base_run].raw_pmc = self.iteration_multiplex_impute_counters(
base_data[base_run].raw_pmc,
policy=self._profiling_config["iteration_multiplexing"],
)
# Apply filters to workload data # Apply filters to workload data
console_debug("analysis", f"gui dispatch filter is {disp_filt}") console_debug("analysis", f"gui dispatch filter is {disp_filt}")
console_debug("analysis", f"gui kernel filter is {kernel_filter}") console_debug("analysis", f"gui kernel filter is {kernel_filter}")
@@ -224,6 +230,9 @@ class webui_analysis(OmniAnalyze_Base):
"is_standalone": False, "is_standalone": False,
"roofline_data_type": self.__roofline_data_type, "roofline_data_type": self.__roofline_data_type,
"kernel_filter": False, "kernel_filter": False,
"iteration_multiplexing": self._profiling_config[
"iteration_multiplexing"
],
} }
) )
roof_obj = soc[self.arch].roofline_obj roof_obj = soc[self.arch].roofline_obj
@@ -37,6 +37,7 @@ import yaml
import config import config
from roofline import Roofline from roofline import Roofline
from utils.amdsmi_interface import amdsmi_ctx, get_gpu_model, get_mem_max_clock from utils.amdsmi_interface import amdsmi_ctx, get_gpu_model, get_mem_max_clock
from utils.file_io import create_df_pmc, load_profiling_config
from utils.logger import ( from utils.logger import (
console_debug, console_debug,
console_error, console_error,
@@ -45,15 +46,18 @@ from utils.logger import (
demarcate, demarcate,
) )
from utils.mi_gpu_spec import mi_gpu_specs from utils.mi_gpu_spec import mi_gpu_specs
from utils.parser import BUILD_IN_VARS, SUPPORTED_DENOM from utils.parser import BUILD_IN_VARS, SUPPORTED_DENOM, apply_filters
from utils.roofline_calc import validate_roofline_csv from utils.roofline_calc import validate_roofline_csv
from utils.schema import Workload
from utils.specs import MachineSpecs from utils.specs import MachineSpecs
from utils.utils import ( from utils.utils import (
METRIC_ID_RE, METRIC_ID_RE,
add_counter_extra_config_input_yaml, add_counter_extra_config_input_yaml,
convert_metric_id_to_panel_info, convert_metric_id_to_panel_info,
get_panel_alias, get_panel_alias,
impute_counters_iteration_multiplex,
is_tcc_channel_counter, is_tcc_channel_counter,
merge_counters_spatial_multiplex,
parse_sets_yaml, parse_sets_yaml,
) )
@@ -701,7 +705,32 @@ class OmniSoC_Base:
) )
return return
self.roofline_obj.post_processing() args = self.get_args()
workload = Workload()
workload.path = self.__args.path
profiling_config = load_profiling_config(workload.path)
workload.raw_pmc = create_df_pmc(
raw_data_root_dir=workload.path,
nodes=None,
spatial_multiplexing=args.spatial_multiplexing,
kernel_verbose=-1,
verbose=args.verbose,
config_dict=profiling_config,
)
if args.spatial_multiplexing:
workload.raw_pmc = merge_counters_spatial_multiplex(workload.raw_pmc)
if profiling_config["iteration_multiplexing"] is not None:
workload.raw_pmc = impute_counters_iteration_multiplex(
workload.raw_pmc,
policy=profiling_config["iteration_multiplexing"],
)
filtered_pmc = apply_filters(
workload, workload.path, is_gui=False, debug=False
)
self.roofline_obj.post_processing(filtered_pmc)
@abstractmethod @abstractmethod
def analysis_setup(self, roofline_parameters: Optional[dict[str, Any]]) -> None: def analysis_setup(self, roofline_parameters: Optional[dict[str, Any]]) -> None:
+38 -58
View File
@@ -25,7 +25,6 @@
import argparse import argparse
import textwrap import textwrap
from abc import abstractmethod from abc import abstractmethod
from collections import OrderedDict
from pathlib import Path from pathlib import Path
from typing import Any, Optional, Union from typing import Any, Optional, Union
@@ -36,7 +35,7 @@ import plotly.graph_objects as go
from dash import dcc, html from dash import dcc, html
from plotly.subplots import make_subplots from plotly.subplots import make_subplots
from utils import file_io, rocpd_data, schema from utils import schema
from utils.logger import ( from utils.logger import (
console_debug, console_debug,
console_error, console_error,
@@ -96,6 +95,7 @@ class Roofline:
"is_standalone": False, "is_standalone": False,
"roofline_data_type": ["FP32"], # default to FP32 "roofline_data_type": ["FP32"], # default to FP32
"kernel_filter": False, "kernel_filter": False,
"iteration_multiplexing": None,
} }
) )
self.__ai_data: Optional[dict[str, Any]] = None self.__ai_data: Optional[dict[str, Any]] = None
@@ -116,6 +116,13 @@ class Roofline:
hasattr(self.__args, "gpu_kernel") and self.__args.gpu_kernel hasattr(self.__args, "gpu_kernel") and self.__args.gpu_kernel
): ):
self.__run_parameters["kernel_filter"] = True self.__run_parameters["kernel_filter"] = True
if (
hasattr(self.__args, "iteration_multiplexing")
and self.__args.iteration_multiplexing is not None
):
self.__run_parameters["iteration_multiplexing"] = (
self.__args.iteration_multiplexing
)
def get_args(self) -> argparse.Namespace: def get_args(self) -> argparse.Namespace:
return self.__args return self.__args
@@ -286,11 +293,7 @@ class Roofline:
Generate a set of empirical roofline plots given a directory containing Generate a set of empirical roofline plots given a directory containing
required profiling and benchmarking data. required profiling and benchmarking data.
""" """
if ( self.roof_setup()
not isinstance(self.__run_parameters["workload_dir"], list)
and self.__run_parameters["workload_dir"] != None
):
self.roof_setup()
console_debug("roofline", f"Path: {self.__run_parameters.get('workload_dir')}") console_debug("roofline", f"Path: {self.__run_parameters.get('workload_dir')}")
@@ -300,7 +303,10 @@ class Roofline:
) )
self.__ai_data = calc_ai_profile( self.__ai_data = calc_ai_profile(
self.__mspec, self.__run_parameters.get("sort_type"), ret_df self.__mspec,
self.__run_parameters.get("sort_type"),
ret_df,
self.__run_parameters["iteration_multiplexing"],
) )
msg = "AI at each mem level:" msg = "AI at each mem level:"
@@ -1133,14 +1139,17 @@ class Roofline:
def cli_generate_plot( def cli_generate_plot(
self, self,
dtype: str, dtype: str,
workload: Optional[schema.Workload] = None, workload: schema.Workload,
config: Optional[dict[str, Any]] = None, config: dict[str, Any],
arch_config: Optional[schema.ArchConfig] = None, arch_config: schema.ArchConfig,
) -> Optional[str]: ) -> Optional[str]:
""" """
Plot CLI mode roofline analysis in terminal using plotext Plot CLI mode roofline analysis in terminal using plotext
:param dtype: The datatype to be profiled :param dtype: The datatype to be profiled
:param workload: Complete dataframe
:param config: Profiling configuration from profiling_config.yaml
:param arch_config: Archetype-specific configurations
:type method: str :type method: str
:return: Build the current figure using plot.build(), :return: Build the current figure using plot.build(),
or None if datatype is not valid for the architecture or None if datatype is not valid for the architecture
@@ -1200,33 +1209,13 @@ class Roofline:
console_warning("roofline", "Skipping plot generation") console_warning("roofline", "Skipping plot generation")
return None return None
# if workload is detected, utilize Roofline yamls. self.__ai_data = calc_ai_analyze(
# If not, fallback to legacy calc_ai workload=workload,
if workload and config and arch_config: mspec=self.__mspec,
self.__ai_data = calc_ai_analyze( sort_type=str(self.__run_parameters.get("sort_type")),
workload=workload, config=config,
mspec=self.__mspec, arch_config=arch_config,
sort_type=str(self.__run_parameters.get("sort_type")), )
config=config,
arch_config=arch_config,
)
else:
pmc_perf_csv = base_path / "pmc_perf.csv"
if not pmc_perf_csv.is_file():
console_error("roofline", f"{pmc_perf_csv} does not exist")
t_df = OrderedDict()
t_df["pmc_perf"] = pd.read_csv(pmc_perf_csv)
profiling_config = file_io.load_profiling_config(self.__args.path[0][0])
if profiling_config.get("format_rocprof_output") == "rocpd":
t_df["pmc_perf"] = rocpd_data.process_rocpd_csv(t_df["pmc_perf"])
t_df = self.validate_apply_kernel_filter(df=t_df, path_str=str(base_path))
self.__ai_data = calc_ai_profile(
self.__mspec, self.__run_parameters["sort_type"], t_df
)
self.__ceiling_data = construct_roof( self.__ceiling_data = construct_roof(
roofline_parameters=self.__run_parameters, dtype=dtype roofline_parameters=self.__run_parameters, dtype=dtype
@@ -1402,38 +1391,29 @@ class Roofline:
return plt.build() return plt.build()
@demarcate @demarcate
def standalone_roofline(self) -> None: def standalone_roofline(
if ( self,
not isinstance(self.__run_parameters["workload_dir"], list) df: dict[str, pd.DataFrame],
and self.__run_parameters["workload_dir"] != None ) -> None:
): self.roof_setup()
self.roof_setup()
# Change vL1D to a interpretable str, if required # Change vL1D to a interpretable str, if required
if "vL1D" in self.__run_parameters["mem_level"]: if "vL1D" in self.__run_parameters["mem_level"]:
self.__run_parameters["mem_level"].remove("vL1D") self.__run_parameters["mem_level"].remove("vL1D")
self.__run_parameters["mem_level"].append("L1") self.__run_parameters["mem_level"].append("L1")
app_path = Path(str(self.__run_parameters["workload_dir"])) / "pmc_perf.csv" self.empirical_roofline(ret_df=df)
if not app_path.is_file():
console_error("roofline", f"{app_path} does not exist")
t_df = OrderedDict()
t_df["pmc_perf"] = pd.read_csv(app_path)
profiling_config = file_io.load_profiling_config(self.__args.path)
if profiling_config.get("format_rocprof_output") == "rocpd":
t_df["pmc_perf"] = rocpd_data.process_rocpd_csv(t_df["pmc_perf"])
self.empirical_roofline(ret_df=t_df)
# NB: Currently the post_prossesing() method is the only one being used by # NB: Currently the post_prossesing() method is the only one being used by
# rocprofiler-compute, we include pre_processing() and profile() methods for # rocprofiler-compute, we include pre_processing() and profile() methods for
# those who wish to borrow the roofline module # those who wish to borrow the roofline module
@abstractmethod @abstractmethod
def post_processing(self) -> None: def post_processing(
self,
filtered_pmc: pd.DataFrame,
) -> None:
if self.__run_parameters["is_standalone"]: if self.__run_parameters["is_standalone"]:
self.standalone_roofline() self.standalone_roofline(filtered_pmc)
def get_dtype(self) -> list[str]: def get_dtype(self) -> list[str]:
return self.__run_parameters["roofline_data_type"] return self.__run_parameters["roofline_data_type"]
@@ -251,7 +251,9 @@ def create_df_pmc(
tmp_df = rocpd_data.process_rocpd_csv(tmp_df) tmp_df = rocpd_data.process_rocpd_csv(tmp_df)
# Demangle original KernelNames # Demangle original KernelNames
kernel_name_shortener(tmp_df, kernel_verbose) # Skip for Standalone Roofline with -1 to keep full kernel names
if kernel_verbose >= 0:
kernel_name_shortener(tmp_df, kernel_verbose)
# NB: # NB:
# Idealy, the Node column should be added out of # Idealy, the Node column should be added out of
@@ -466,7 +466,10 @@ def calc_ai_analyze(
def calc_ai_profile( def calc_ai_profile(
mspec: MachineSpecs, sort_type: str, ret_df: dict[str, pd.DataFrame] mspec: MachineSpecs,
sort_type: str,
ret_df: dict[str, pd.DataFrame],
iteration_multiplexing: str,
) -> dict[str, Union[list[list[float]], list[str]]]: ) -> dict[str, Union[list[list[float]], list[str]]]:
"""Given counter data, calculate arithmetic intensity for each kernel """Given counter data, calculate arithmetic intensity for each kernel
in the application. Leverage hard-coded equations to calculate AI values. in the application. Leverage hard-coded equations to calculate AI values.
@@ -505,6 +508,10 @@ def calc_ai_profile(
next_kernel_name = df["Kernel_Name"][idx + 1] if not at_end else "" next_kernel_name = df["Kernel_Name"][idx + 1] if not at_end else ""
kernel_name = df["Kernel_Name"][idx] kernel_name = df["Kernel_Name"][idx]
# Skip this kernel dispatch row if any counter value is n/a
if df.iloc[idx].isna().any():
continue
try: try:
total_flops += ( total_flops += (
( (
@@ -546,7 +553,8 @@ def calc_ai_profile(
except KeyError as e: except KeyError as e:
console_debug( console_debug(
"roofline", "roofline",
f"{kernel_name[:35]}: Skipped total_flops at index {idx} due to {e}", f"{kernel_name[:35]}: Skipped total_flops at index \
{idx} due to {e}",
) )
pass pass
try: try:
@@ -615,7 +623,8 @@ def calc_ai_profile(
except KeyError as e: except KeyError as e:
console_debug( console_debug(
"roofline", "roofline",
f"{kernel_name[:35]}: Skipped L1cache_data at index {idx} due to {e}", f"{kernel_name[:35]}: Skipped L1cache_data at index \
{idx} due to {e}",
) )
pass pass
@@ -629,7 +638,8 @@ def calc_ai_profile(
except KeyError as e: except KeyError as e:
console_debug( console_debug(
"roofline", "roofline",
f"{kernel_name[:35]}: Skipped L2cache_data at index {idx} due to {e}", f"{kernel_name[:35]}: Skipped L2cache_data at index \
{idx} due to {e}",
) )
pass pass
try: try:
@@ -1502,6 +1502,7 @@ def impute_counters_iteration_multiplex(
} }
# Collect imputed sub-groups as dataframes # Collect imputed sub-groups as dataframes
subgroup_dfs = [] subgroup_dfs = []
previous_fill_values = {}
for i in range(0, len(group), subgroup_size): for i in range(0, len(group), subgroup_size):
subgroup = group.iloc[i : i + subgroup_size] subgroup = group.iloc[i : i + subgroup_size]
@@ -1517,7 +1518,22 @@ def impute_counters_iteration_multiplex(
if fill_values: if fill_values:
subgroup = subgroup.fillna(fill_values) subgroup = subgroup.fillna(fill_values)
# If this is the last subgroup and it still has missing values,
# use previous subgroup's fill values
# NOTE: This wont work if the first subgroup is itself incomplete
is_last_subgroup = (i + subgroup_size) >= len(group)
# First any() returns bool pd.Series for every column,
# second any() returns single bool
if (
is_last_subgroup
and previous_fill_values
and subgroup.isna().any().any()
):
# Use previous subgroup's fill values for remaining missing values
subgroup = subgroup.fillna(previous_fill_values)
subgroup_dfs.append(subgroup) subgroup_dfs.append(subgroup)
previous_fill_values = fill_values
# Concatenate all subgroups for this group # Concatenate all subgroups for this group
if subgroup_dfs: if subgroup_dfs:
@@ -75,6 +75,8 @@ config["COUNTER_LOGGING"] = False
config["METRIC_COMPARE"] = False config["METRIC_COMPARE"] = False
config["METRIC_LOGGING"] = False config["METRIC_LOGGING"] = False
arch_config = {}
num_kernels = 3 num_kernels = 3
num_devices = 1 num_devices = 1
@@ -1326,6 +1328,7 @@ def test_roofline_missing_file_handling(binary_handler_profile_rocprof_compute):
try: try:
from roofline import Roofline from roofline import Roofline
from utils.schema import Workload
from utils.specs import generate_machine_specs from utils.specs import generate_machine_specs
class MockArgs: class MockArgs:
@@ -1337,6 +1340,7 @@ def test_roofline_missing_file_handling(binary_handler_profile_rocprof_compute):
args = MockArgs() args = MockArgs()
mspec = generate_machine_specs(None, None) mspec = generate_machine_specs(None, None)
workload = Workload()
workload_dir = test_utils.get_output_dir() workload_dir = test_utils.get_output_dir()
@@ -1351,7 +1355,9 @@ def test_roofline_missing_file_handling(binary_handler_profile_rocprof_compute):
roofline_instance = Roofline(args, mspec, run_parameters) roofline_instance = Roofline(args, mspec, run_parameters)
result = roofline_instance.cli_generate_plot("FP32") result = roofline_instance.cli_generate_plot(
"FP32", workload, config, arch_config
)
assert result is None assert result is None
@@ -1378,6 +1384,7 @@ def test_roofline_invalid_datatype_cli(binary_handler_profile_rocprof_compute):
try: try:
from roofline import Roofline from roofline import Roofline
from utils.schema import Workload
from utils.specs import generate_machine_specs from utils.specs import generate_machine_specs
class MockArgs: class MockArgs:
@@ -1389,6 +1396,7 @@ def test_roofline_invalid_datatype_cli(binary_handler_profile_rocprof_compute):
args = MockArgs() args = MockArgs()
mspec = generate_machine_specs(None, None) mspec = generate_machine_specs(None, None)
workload = Workload()
run_parameters = { run_parameters = {
"workload_dir": test_utils.get_output_dir(), "workload_dir": test_utils.get_output_dir(),
@@ -1401,7 +1409,9 @@ def test_roofline_invalid_datatype_cli(binary_handler_profile_rocprof_compute):
roofline_instance = Roofline(args, mspec, run_parameters) roofline_instance = Roofline(args, mspec, run_parameters)
result = roofline_instance.cli_generate_plot("INVALID_DATATYPE") result = roofline_instance.cli_generate_plot(
"INVALID_DATATYPE", workload, config, arch_config
)
assert result is None assert result is None