[rocprofiler-compute] Fixes for roofline when used with iteration multiplexing (#2635)

* Added iteration_multiplex_impute_counters on the pmc data; the GUI dataframe previously did not implement this in the build_layout method.
* Created a Workload() in profile-mode post-processing so that the standalone roofline HTML plot can be generated; this will be removed once the roofline plot is moved to the analyze phase in a future release.
* Added an iteration_multiplexing run parameter to the roofline object init so that we can accurately parse the dataframe if the option was used during profiling; this helps us avoid reading NaN values in calc_ai_profile for dispatches that did not get imputed.
* Cleaned up unused legacy code and adjusted method parameters to assist in moving roofline plotting to analyze mode in a future release.
* Updated the iteration multiplexing data imputation algorithm to impute counters for ungrouped dispatches at the end based on the previous group. However, this won't work if there are no dispatches that can be grouped (i.e., the number of dispatches < the number of counter buckets).

---------

Signed-off-by: Carrie Fallows <Carrie.Fallows@amd.com>
Co-authored-by: Vignesh Edithal <Vignesh.Edithal@amd.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Tento commit je obsažen v:
cfallows-amd
2026-01-23 11:10:46 -05:00
odevzdal GitHub
rodič fc4422d73b
revize 62dd4d114d
7 změnil soubory, kde provedl 123 přidání a 67 odebrání
@@ -146,6 +146,12 @@ class webui_analysis(OmniAnalyze_Base):
base_data[base_run].raw_pmc
)
if self._profiling_config["iteration_multiplexing"] is not None:
base_data[base_run].raw_pmc = self.iteration_multiplex_impute_counters(
base_data[base_run].raw_pmc,
policy=self._profiling_config["iteration_multiplexing"],
)
# Apply filters to workload data
console_debug("analysis", f"gui dispatch filter is {disp_filt}")
console_debug("analysis", f"gui kernel filter is {kernel_filter}")
@@ -224,6 +230,9 @@ class webui_analysis(OmniAnalyze_Base):
"is_standalone": False,
"roofline_data_type": self.__roofline_data_type,
"kernel_filter": False,
"iteration_multiplexing": self._profiling_config[
"iteration_multiplexing"
],
}
)
roof_obj = soc[self.arch].roofline_obj
+31 -2
Zobrazit soubor
@@ -37,6 +37,7 @@ import yaml
import config
from roofline import Roofline
from utils.amdsmi_interface import amdsmi_ctx, get_gpu_model, get_mem_max_clock
from utils.file_io import create_df_pmc, load_profiling_config
from utils.logger import (
console_debug,
console_error,
@@ -45,15 +46,18 @@ from utils.logger import (
demarcate,
)
from utils.mi_gpu_spec import mi_gpu_specs
from utils.parser import BUILD_IN_VARS, SUPPORTED_DENOM
from utils.parser import BUILD_IN_VARS, SUPPORTED_DENOM, apply_filters
from utils.roofline_calc import validate_roofline_csv
from utils.schema import Workload
from utils.specs import MachineSpecs
from utils.utils import (
METRIC_ID_RE,
add_counter_extra_config_input_yaml,
convert_metric_id_to_panel_info,
get_panel_alias,
impute_counters_iteration_multiplex,
is_tcc_channel_counter,
merge_counters_spatial_multiplex,
parse_sets_yaml,
)
@@ -701,7 +705,32 @@ class OmniSoC_Base:
)
return
self.roofline_obj.post_processing()
args = self.get_args()
workload = Workload()
workload.path = self.__args.path
profiling_config = load_profiling_config(workload.path)
workload.raw_pmc = create_df_pmc(
raw_data_root_dir=workload.path,
nodes=None,
spatial_multiplexing=args.spatial_multiplexing,
kernel_verbose=-1,
verbose=args.verbose,
config_dict=profiling_config,
)
if args.spatial_multiplexing:
workload.raw_pmc = merge_counters_spatial_multiplex(workload.raw_pmc)
if profiling_config["iteration_multiplexing"] is not None:
workload.raw_pmc = impute_counters_iteration_multiplex(
workload.raw_pmc,
policy=profiling_config["iteration_multiplexing"],
)
filtered_pmc = apply_filters(
workload, workload.path, is_gui=False, debug=False
)
self.roofline_obj.post_processing(filtered_pmc)
@abstractmethod
def analysis_setup(self, roofline_parameters: Optional[dict[str, Any]]) -> None:
+38 -58
Zobrazit soubor
@@ -25,7 +25,6 @@
import argparse
import textwrap
from abc import abstractmethod
from collections import OrderedDict
from pathlib import Path
from typing import Any, Optional, Union
@@ -36,7 +35,7 @@ import plotly.graph_objects as go
from dash import dcc, html
from plotly.subplots import make_subplots
from utils import file_io, rocpd_data, schema
from utils import schema
from utils.logger import (
console_debug,
console_error,
@@ -96,6 +95,7 @@ class Roofline:
"is_standalone": False,
"roofline_data_type": ["FP32"], # default to FP32
"kernel_filter": False,
"iteration_multiplexing": None,
}
)
self.__ai_data: Optional[dict[str, Any]] = None
@@ -116,6 +116,13 @@ class Roofline:
hasattr(self.__args, "gpu_kernel") and self.__args.gpu_kernel
):
self.__run_parameters["kernel_filter"] = True
if (
hasattr(self.__args, "iteration_multiplexing")
and self.__args.iteration_multiplexing is not None
):
self.__run_parameters["iteration_multiplexing"] = (
self.__args.iteration_multiplexing
)
def get_args(self) -> argparse.Namespace:
return self.__args
@@ -286,11 +293,7 @@ class Roofline:
Generate a set of empirical roofline plots given a directory containing
required profiling and benchmarking data.
"""
if (
not isinstance(self.__run_parameters["workload_dir"], list)
and self.__run_parameters["workload_dir"] != None
):
self.roof_setup()
self.roof_setup()
console_debug("roofline", f"Path: {self.__run_parameters.get('workload_dir')}")
@@ -300,7 +303,10 @@ class Roofline:
)
self.__ai_data = calc_ai_profile(
self.__mspec, self.__run_parameters.get("sort_type"), ret_df
self.__mspec,
self.__run_parameters.get("sort_type"),
ret_df,
self.__run_parameters["iteration_multiplexing"],
)
msg = "AI at each mem level:"
@@ -1133,14 +1139,17 @@ class Roofline:
def cli_generate_plot(
self,
dtype: str,
workload: Optional[schema.Workload] = None,
config: Optional[dict[str, Any]] = None,
arch_config: Optional[schema.ArchConfig] = None,
workload: schema.Workload,
config: dict[str, Any],
arch_config: schema.ArchConfig,
) -> Optional[str]:
"""
Plot CLI mode roofline analysis in terminal using plotext
:param dtype: The datatype to be profiled
:param workload: Complete dataframe
:param config: Profiling configuration from profiling_config.yaml
:param arch_config: Archetype-specific configurations
:type method: str
:return: Build the current figure using plot.build(),
or None if datatype is not valid for the architecture
@@ -1200,33 +1209,13 @@ class Roofline:
console_warning("roofline", "Skipping plot generation")
return None
# if workload is detected, utilize Roofline yamls.
# If not, fallback to legacy calc_ai
if workload and config and arch_config:
self.__ai_data = calc_ai_analyze(
workload=workload,
mspec=self.__mspec,
sort_type=str(self.__run_parameters.get("sort_type")),
config=config,
arch_config=arch_config,
)
else:
pmc_perf_csv = base_path / "pmc_perf.csv"
if not pmc_perf_csv.is_file():
console_error("roofline", f"{pmc_perf_csv} does not exist")
t_df = OrderedDict()
t_df["pmc_perf"] = pd.read_csv(pmc_perf_csv)
profiling_config = file_io.load_profiling_config(self.__args.path[0][0])
if profiling_config.get("format_rocprof_output") == "rocpd":
t_df["pmc_perf"] = rocpd_data.process_rocpd_csv(t_df["pmc_perf"])
t_df = self.validate_apply_kernel_filter(df=t_df, path_str=str(base_path))
self.__ai_data = calc_ai_profile(
self.__mspec, self.__run_parameters["sort_type"], t_df
)
self.__ai_data = calc_ai_analyze(
workload=workload,
mspec=self.__mspec,
sort_type=str(self.__run_parameters.get("sort_type")),
config=config,
arch_config=arch_config,
)
self.__ceiling_data = construct_roof(
roofline_parameters=self.__run_parameters, dtype=dtype
@@ -1402,38 +1391,29 @@ class Roofline:
return plt.build()
@demarcate
def standalone_roofline(self) -> None:
if (
not isinstance(self.__run_parameters["workload_dir"], list)
and self.__run_parameters["workload_dir"] != None
):
self.roof_setup()
def standalone_roofline(
self,
df: dict[str, pd.DataFrame],
) -> None:
self.roof_setup()
# Change vL1D to a interpretable str, if required
if "vL1D" in self.__run_parameters["mem_level"]:
self.__run_parameters["mem_level"].remove("vL1D")
self.__run_parameters["mem_level"].append("L1")
app_path = Path(str(self.__run_parameters["workload_dir"])) / "pmc_perf.csv"
if not app_path.is_file():
console_error("roofline", f"{app_path} does not exist")
t_df = OrderedDict()
t_df["pmc_perf"] = pd.read_csv(app_path)
profiling_config = file_io.load_profiling_config(self.__args.path)
if profiling_config.get("format_rocprof_output") == "rocpd":
t_df["pmc_perf"] = rocpd_data.process_rocpd_csv(t_df["pmc_perf"])
self.empirical_roofline(ret_df=t_df)
self.empirical_roofline(ret_df=df)
# NB: Currently the post_processing() method is the only one being used by
# rocprofiler-compute, we include pre_processing() and profile() methods for
# those who wish to borrow the roofline module
@abstractmethod
def post_processing(self) -> None:
def post_processing(
self,
filtered_pmc: pd.DataFrame,
) -> None:
if self.__run_parameters["is_standalone"]:
self.standalone_roofline()
self.standalone_roofline(filtered_pmc)
def get_dtype(self) -> list[str]:
return self.__run_parameters["roofline_data_type"]
+3 -1
Zobrazit soubor
@@ -251,7 +251,9 @@ def create_df_pmc(
tmp_df = rocpd_data.process_rocpd_csv(tmp_df)
# Demangle original KernelNames
kernel_name_shortener(tmp_df, kernel_verbose)
# Skip for Standalone Roofline with -1 to keep full kernel names
if kernel_verbose >= 0:
kernel_name_shortener(tmp_df, kernel_verbose)
# NB:
# Ideally, the Node column should be added out of
+14 -4
Zobrazit soubor
@@ -466,7 +466,10 @@ def calc_ai_analyze(
def calc_ai_profile(
mspec: MachineSpecs, sort_type: str, ret_df: dict[str, pd.DataFrame]
mspec: MachineSpecs,
sort_type: str,
ret_df: dict[str, pd.DataFrame],
iteration_multiplexing: str,
) -> dict[str, Union[list[list[float]], list[str]]]:
"""Given counter data, calculate arithmetic intensity for each kernel
in the application. Leverage hard-coded equations to calculate AI values.
@@ -505,6 +508,10 @@ def calc_ai_profile(
next_kernel_name = df["Kernel_Name"][idx + 1] if not at_end else ""
kernel_name = df["Kernel_Name"][idx]
# Skip this kernel dispatch row if any counter value is n/a
if df.iloc[idx].isna().any():
continue
try:
total_flops += (
(
@@ -546,7 +553,8 @@ def calc_ai_profile(
except KeyError as e:
console_debug(
"roofline",
f"{kernel_name[:35]}: Skipped total_flops at index {idx} due to {e}",
f"{kernel_name[:35]}: Skipped total_flops at index \
{idx} due to {e}",
)
pass
try:
@@ -615,7 +623,8 @@ def calc_ai_profile(
except KeyError as e:
console_debug(
"roofline",
f"{kernel_name[:35]}: Skipped L1cache_data at index {idx} due to {e}",
f"{kernel_name[:35]}: Skipped L1cache_data at index \
{idx} due to {e}",
)
pass
@@ -629,7 +638,8 @@ def calc_ai_profile(
except KeyError as e:
console_debug(
"roofline",
f"{kernel_name[:35]}: Skipped L2cache_data at index {idx} due to {e}",
f"{kernel_name[:35]}: Skipped L2cache_data at index \
{idx} due to {e}",
)
pass
try:
+16
Zobrazit soubor
@@ -1502,6 +1502,7 @@ def impute_counters_iteration_multiplex(
}
# Collect imputed sub-groups as dataframes
subgroup_dfs = []
previous_fill_values = {}
for i in range(0, len(group), subgroup_size):
subgroup = group.iloc[i : i + subgroup_size]
@@ -1517,7 +1518,22 @@ def impute_counters_iteration_multiplex(
if fill_values:
subgroup = subgroup.fillna(fill_values)
# If this is the last subgroup and it still has missing values,
# use previous subgroup's fill values
# NOTE: This won't work if the first subgroup is itself incomplete
is_last_subgroup = (i + subgroup_size) >= len(group)
# First any() returns bool pd.Series for every column,
# second any() returns single bool
if (
is_last_subgroup
and previous_fill_values
and subgroup.isna().any().any()
):
# Use previous subgroup's fill values for remaining missing values
subgroup = subgroup.fillna(previous_fill_values)
subgroup_dfs.append(subgroup)
previous_fill_values = fill_values
# Concatenate all subgroups for this group
if subgroup_dfs:
+12 -2
Zobrazit soubor
@@ -75,6 +75,8 @@ config["COUNTER_LOGGING"] = False
config["METRIC_COMPARE"] = False
config["METRIC_LOGGING"] = False
arch_config = {}
num_kernels = 3
num_devices = 1
@@ -1326,6 +1328,7 @@ def test_roofline_missing_file_handling(binary_handler_profile_rocprof_compute):
try:
from roofline import Roofline
from utils.schema import Workload
from utils.specs import generate_machine_specs
class MockArgs:
@@ -1337,6 +1340,7 @@ def test_roofline_missing_file_handling(binary_handler_profile_rocprof_compute):
args = MockArgs()
mspec = generate_machine_specs(None, None)
workload = Workload()
workload_dir = test_utils.get_output_dir()
@@ -1351,7 +1355,9 @@ def test_roofline_missing_file_handling(binary_handler_profile_rocprof_compute):
roofline_instance = Roofline(args, mspec, run_parameters)
result = roofline_instance.cli_generate_plot("FP32")
result = roofline_instance.cli_generate_plot(
"FP32", workload, config, arch_config
)
assert result is None
@@ -1378,6 +1384,7 @@ def test_roofline_invalid_datatype_cli(binary_handler_profile_rocprof_compute):
try:
from roofline import Roofline
from utils.schema import Workload
from utils.specs import generate_machine_specs
class MockArgs:
@@ -1389,6 +1396,7 @@ def test_roofline_invalid_datatype_cli(binary_handler_profile_rocprof_compute):
args = MockArgs()
mspec = generate_machine_specs(None, None)
workload = Workload()
run_parameters = {
"workload_dir": test_utils.get_output_dir(),
@@ -1401,7 +1409,9 @@ def test_roofline_invalid_datatype_cli(binary_handler_profile_rocprof_compute):
roofline_instance = Roofline(args, mspec, run_parameters)
result = roofline_instance.cli_generate_plot("INVALID_DATATYPE")
result = roofline_instance.cli_generate_plot(
"INVALID_DATATYPE", workload, config, arch_config
)
assert result is None