[rocprofiler-compute] Fixes for roofline when used with iteration multiplexing (#2635)
* Added iteration_multiplex_impute_counters on PMC data: the GUI dataframe did not previously apply this in the build_layout method.
* Created a Workload() in profile-mode post-processing so that the standalone roofline HTML plot can be generated; this will be removed once the roofline plot is moved to the analyze phase in a future release.
* Added an iteration_multiplexing run parameter to the roofline object init so that the dataframe can be parsed accurately if the option was used during profiling; this avoids reading NaN values in calc_ai_profile for certain dispatches that did not get imputed.
* Cleaned up unused legacy code and adjusted method parameters to assist in moving roofline plotting to analyze mode in a future release.
* Updated the iteration multiplexing data imputation algorithm to impute counters for ungrouped dispatches at the end based on the previous group. However, this won't work if there are no dispatches that can be grouped (i.e. number of dispatches < number of counter buckets).
---------
Signed-off-by: Carrie Fallows <Carrie.Fallows@amd.com>
Co-authored-by: Vignesh Edithal <Vignesh.Edithal@amd.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
orang tua
fc4422d73b
melakukan
62dd4d114d
@@ -146,6 +146,12 @@ class webui_analysis(OmniAnalyze_Base):
|
||||
base_data[base_run].raw_pmc
|
||||
)
|
||||
|
||||
if self._profiling_config["iteration_multiplexing"] is not None:
|
||||
base_data[base_run].raw_pmc = self.iteration_multiplex_impute_counters(
|
||||
base_data[base_run].raw_pmc,
|
||||
policy=self._profiling_config["iteration_multiplexing"],
|
||||
)
|
||||
|
||||
# Apply filters to workload data
|
||||
console_debug("analysis", f"gui dispatch filter is {disp_filt}")
|
||||
console_debug("analysis", f"gui kernel filter is {kernel_filter}")
|
||||
@@ -224,6 +230,9 @@ class webui_analysis(OmniAnalyze_Base):
|
||||
"is_standalone": False,
|
||||
"roofline_data_type": self.__roofline_data_type,
|
||||
"kernel_filter": False,
|
||||
"iteration_multiplexing": self._profiling_config[
|
||||
"iteration_multiplexing"
|
||||
],
|
||||
}
|
||||
)
|
||||
roof_obj = soc[self.arch].roofline_obj
|
||||
|
||||
@@ -37,6 +37,7 @@ import yaml
|
||||
import config
|
||||
from roofline import Roofline
|
||||
from utils.amdsmi_interface import amdsmi_ctx, get_gpu_model, get_mem_max_clock
|
||||
from utils.file_io import create_df_pmc, load_profiling_config
|
||||
from utils.logger import (
|
||||
console_debug,
|
||||
console_error,
|
||||
@@ -45,15 +46,18 @@ from utils.logger import (
|
||||
demarcate,
|
||||
)
|
||||
from utils.mi_gpu_spec import mi_gpu_specs
|
||||
from utils.parser import BUILD_IN_VARS, SUPPORTED_DENOM
|
||||
from utils.parser import BUILD_IN_VARS, SUPPORTED_DENOM, apply_filters
|
||||
from utils.roofline_calc import validate_roofline_csv
|
||||
from utils.schema import Workload
|
||||
from utils.specs import MachineSpecs
|
||||
from utils.utils import (
|
||||
METRIC_ID_RE,
|
||||
add_counter_extra_config_input_yaml,
|
||||
convert_metric_id_to_panel_info,
|
||||
get_panel_alias,
|
||||
impute_counters_iteration_multiplex,
|
||||
is_tcc_channel_counter,
|
||||
merge_counters_spatial_multiplex,
|
||||
parse_sets_yaml,
|
||||
)
|
||||
|
||||
@@ -701,7 +705,32 @@ class OmniSoC_Base:
|
||||
)
|
||||
return
|
||||
|
||||
self.roofline_obj.post_processing()
|
||||
args = self.get_args()
|
||||
workload = Workload()
|
||||
workload.path = self.__args.path
|
||||
profiling_config = load_profiling_config(workload.path)
|
||||
workload.raw_pmc = create_df_pmc(
|
||||
raw_data_root_dir=workload.path,
|
||||
nodes=None,
|
||||
spatial_multiplexing=args.spatial_multiplexing,
|
||||
kernel_verbose=-1,
|
||||
verbose=args.verbose,
|
||||
config_dict=profiling_config,
|
||||
)
|
||||
|
||||
if args.spatial_multiplexing:
|
||||
workload.raw_pmc = merge_counters_spatial_multiplex(workload.raw_pmc)
|
||||
|
||||
if profiling_config["iteration_multiplexing"] is not None:
|
||||
workload.raw_pmc = impute_counters_iteration_multiplex(
|
||||
workload.raw_pmc,
|
||||
policy=profiling_config["iteration_multiplexing"],
|
||||
)
|
||||
filtered_pmc = apply_filters(
|
||||
workload, workload.path, is_gui=False, debug=False
|
||||
)
|
||||
|
||||
self.roofline_obj.post_processing(filtered_pmc)
|
||||
|
||||
@abstractmethod
|
||||
def analysis_setup(self, roofline_parameters: Optional[dict[str, Any]]) -> None:
|
||||
|
||||
@@ -25,7 +25,6 @@
|
||||
import argparse
|
||||
import textwrap
|
||||
from abc import abstractmethod
|
||||
from collections import OrderedDict
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
@@ -36,7 +35,7 @@ import plotly.graph_objects as go
|
||||
from dash import dcc, html
|
||||
from plotly.subplots import make_subplots
|
||||
|
||||
from utils import file_io, rocpd_data, schema
|
||||
from utils import schema
|
||||
from utils.logger import (
|
||||
console_debug,
|
||||
console_error,
|
||||
@@ -96,6 +95,7 @@ class Roofline:
|
||||
"is_standalone": False,
|
||||
"roofline_data_type": ["FP32"], # default to FP32
|
||||
"kernel_filter": False,
|
||||
"iteration_multiplexing": None,
|
||||
}
|
||||
)
|
||||
self.__ai_data: Optional[dict[str, Any]] = None
|
||||
@@ -116,6 +116,13 @@ class Roofline:
|
||||
hasattr(self.__args, "gpu_kernel") and self.__args.gpu_kernel
|
||||
):
|
||||
self.__run_parameters["kernel_filter"] = True
|
||||
if (
|
||||
hasattr(self.__args, "iteration_multiplexing")
|
||||
and self.__args.iteration_multiplexing is not None
|
||||
):
|
||||
self.__run_parameters["iteration_multiplexing"] = (
|
||||
self.__args.iteration_multiplexing
|
||||
)
|
||||
|
||||
def get_args(self) -> argparse.Namespace:
|
||||
return self.__args
|
||||
@@ -286,10 +293,6 @@ class Roofline:
|
||||
Generate a set of empirical roofline plots given a directory containing
|
||||
required profiling and benchmarking data.
|
||||
"""
|
||||
if (
|
||||
not isinstance(self.__run_parameters["workload_dir"], list)
|
||||
and self.__run_parameters["workload_dir"] != None
|
||||
):
|
||||
self.roof_setup()
|
||||
|
||||
console_debug("roofline", f"Path: {self.__run_parameters.get('workload_dir')}")
|
||||
@@ -300,7 +303,10 @@ class Roofline:
|
||||
)
|
||||
|
||||
self.__ai_data = calc_ai_profile(
|
||||
self.__mspec, self.__run_parameters.get("sort_type"), ret_df
|
||||
self.__mspec,
|
||||
self.__run_parameters.get("sort_type"),
|
||||
ret_df,
|
||||
self.__run_parameters["iteration_multiplexing"],
|
||||
)
|
||||
|
||||
msg = "AI at each mem level:"
|
||||
@@ -1133,14 +1139,17 @@ class Roofline:
|
||||
def cli_generate_plot(
|
||||
self,
|
||||
dtype: str,
|
||||
workload: Optional[schema.Workload] = None,
|
||||
config: Optional[dict[str, Any]] = None,
|
||||
arch_config: Optional[schema.ArchConfig] = None,
|
||||
workload: schema.Workload,
|
||||
config: dict[str, Any],
|
||||
arch_config: schema.ArchConfig,
|
||||
) -> Optional[str]:
|
||||
"""
|
||||
Plot CLI mode roofline analysis in terminal using plotext
|
||||
|
||||
:param dtype: The datatype to be profiled
|
||||
:param workload: Complete dataframe
|
||||
:param config: Profiling configuration from profiling_config.yaml
|
||||
:param arch_config: Archetype-specific configurations
|
||||
:type method: str
|
||||
:return: Build the current figure using plot.build(),
|
||||
or None if datatype is not valid for the architecture
|
||||
@@ -1200,9 +1209,6 @@ class Roofline:
|
||||
console_warning("roofline", "Skipping plot generation")
|
||||
return None
|
||||
|
||||
# if workload is detected, utilize Roofline yamls.
|
||||
# If not, fallback to legacy calc_ai
|
||||
if workload and config and arch_config:
|
||||
self.__ai_data = calc_ai_analyze(
|
||||
workload=workload,
|
||||
mspec=self.__mspec,
|
||||
@@ -1211,23 +1217,6 @@ class Roofline:
|
||||
arch_config=arch_config,
|
||||
)
|
||||
|
||||
else:
|
||||
pmc_perf_csv = base_path / "pmc_perf.csv"
|
||||
if not pmc_perf_csv.is_file():
|
||||
console_error("roofline", f"{pmc_perf_csv} does not exist")
|
||||
|
||||
t_df = OrderedDict()
|
||||
t_df["pmc_perf"] = pd.read_csv(pmc_perf_csv)
|
||||
|
||||
profiling_config = file_io.load_profiling_config(self.__args.path[0][0])
|
||||
if profiling_config.get("format_rocprof_output") == "rocpd":
|
||||
t_df["pmc_perf"] = rocpd_data.process_rocpd_csv(t_df["pmc_perf"])
|
||||
|
||||
t_df = self.validate_apply_kernel_filter(df=t_df, path_str=str(base_path))
|
||||
self.__ai_data = calc_ai_profile(
|
||||
self.__mspec, self.__run_parameters["sort_type"], t_df
|
||||
)
|
||||
|
||||
self.__ceiling_data = construct_roof(
|
||||
roofline_parameters=self.__run_parameters, dtype=dtype
|
||||
)
|
||||
@@ -1402,11 +1391,10 @@ class Roofline:
|
||||
return plt.build()
|
||||
|
||||
@demarcate
|
||||
def standalone_roofline(self) -> None:
|
||||
if (
|
||||
not isinstance(self.__run_parameters["workload_dir"], list)
|
||||
and self.__run_parameters["workload_dir"] != None
|
||||
):
|
||||
def standalone_roofline(
|
||||
self,
|
||||
df: dict[str, pd.DataFrame],
|
||||
) -> None:
|
||||
self.roof_setup()
|
||||
|
||||
# Change vL1D to an interpretable str, if required
|
||||
@@ -1414,26 +1402,18 @@ class Roofline:
|
||||
self.__run_parameters["mem_level"].remove("vL1D")
|
||||
self.__run_parameters["mem_level"].append("L1")
|
||||
|
||||
app_path = Path(str(self.__run_parameters["workload_dir"])) / "pmc_perf.csv"
|
||||
if not app_path.is_file():
|
||||
console_error("roofline", f"{app_path} does not exist")
|
||||
|
||||
t_df = OrderedDict()
|
||||
t_df["pmc_perf"] = pd.read_csv(app_path)
|
||||
|
||||
profiling_config = file_io.load_profiling_config(self.__args.path)
|
||||
if profiling_config.get("format_rocprof_output") == "rocpd":
|
||||
t_df["pmc_perf"] = rocpd_data.process_rocpd_csv(t_df["pmc_perf"])
|
||||
|
||||
self.empirical_roofline(ret_df=t_df)
|
||||
self.empirical_roofline(ret_df=df)
|
||||
|
||||
# NB: Currently the post_processing() method is the only one being used by
|
||||
# rocprofiler-compute, we include pre_processing() and profile() methods for
|
||||
# those who wish to borrow the roofline module
|
||||
@abstractmethod
|
||||
def post_processing(self) -> None:
|
||||
def post_processing(
|
||||
self,
|
||||
filtered_pmc: pd.DataFrame,
|
||||
) -> None:
|
||||
if self.__run_parameters["is_standalone"]:
|
||||
self.standalone_roofline()
|
||||
self.standalone_roofline(filtered_pmc)
|
||||
|
||||
def get_dtype(self) -> list[str]:
|
||||
return self.__run_parameters["roofline_data_type"]
|
||||
|
||||
@@ -251,6 +251,8 @@ def create_df_pmc(
|
||||
tmp_df = rocpd_data.process_rocpd_csv(tmp_df)
|
||||
|
||||
# Demangle original KernelNames
|
||||
# Skip for Standalone Roofline with -1 to keep full kernel names
|
||||
if kernel_verbose >= 0:
|
||||
kernel_name_shortener(tmp_df, kernel_verbose)
|
||||
|
||||
# NB:
|
||||
|
||||
@@ -466,7 +466,10 @@ def calc_ai_analyze(
|
||||
|
||||
|
||||
def calc_ai_profile(
|
||||
mspec: MachineSpecs, sort_type: str, ret_df: dict[str, pd.DataFrame]
|
||||
mspec: MachineSpecs,
|
||||
sort_type: str,
|
||||
ret_df: dict[str, pd.DataFrame],
|
||||
iteration_multiplexing: str,
|
||||
) -> dict[str, Union[list[list[float]], list[str]]]:
|
||||
"""Given counter data, calculate arithmetic intensity for each kernel
|
||||
in the application. Leverage hard-coded equations to calculate AI values.
|
||||
@@ -505,6 +508,10 @@ def calc_ai_profile(
|
||||
next_kernel_name = df["Kernel_Name"][idx + 1] if not at_end else ""
|
||||
kernel_name = df["Kernel_Name"][idx]
|
||||
|
||||
# Skip this kernel dispatch row if any counter value is n/a
|
||||
if df.iloc[idx].isna().any():
|
||||
continue
|
||||
|
||||
try:
|
||||
total_flops += (
|
||||
(
|
||||
@@ -546,7 +553,8 @@ def calc_ai_profile(
|
||||
except KeyError as e:
|
||||
console_debug(
|
||||
"roofline",
|
||||
f"{kernel_name[:35]}: Skipped total_flops at index {idx} due to {e}",
|
||||
f"{kernel_name[:35]}: Skipped total_flops at index \
|
||||
{idx} due to {e}",
|
||||
)
|
||||
pass
|
||||
try:
|
||||
@@ -615,7 +623,8 @@ def calc_ai_profile(
|
||||
except KeyError as e:
|
||||
console_debug(
|
||||
"roofline",
|
||||
f"{kernel_name[:35]}: Skipped L1cache_data at index {idx} due to {e}",
|
||||
f"{kernel_name[:35]}: Skipped L1cache_data at index \
|
||||
{idx} due to {e}",
|
||||
)
|
||||
pass
|
||||
|
||||
@@ -629,7 +638,8 @@ def calc_ai_profile(
|
||||
except KeyError as e:
|
||||
console_debug(
|
||||
"roofline",
|
||||
f"{kernel_name[:35]}: Skipped L2cache_data at index {idx} due to {e}",
|
||||
f"{kernel_name[:35]}: Skipped L2cache_data at index \
|
||||
{idx} due to {e}",
|
||||
)
|
||||
pass
|
||||
try:
|
||||
|
||||
@@ -1502,6 +1502,7 @@ def impute_counters_iteration_multiplex(
|
||||
}
|
||||
# Collect imputed sub-groups as dataframes
|
||||
subgroup_dfs = []
|
||||
previous_fill_values = {}
|
||||
for i in range(0, len(group), subgroup_size):
|
||||
subgroup = group.iloc[i : i + subgroup_size]
|
||||
|
||||
@@ -1517,7 +1518,22 @@ def impute_counters_iteration_multiplex(
|
||||
if fill_values:
|
||||
subgroup = subgroup.fillna(fill_values)
|
||||
|
||||
# If this is the last subgroup and it still has missing values,
|
||||
# use previous subgroup's fill values
|
||||
# NOTE: This won't work if the first subgroup is itself incomplete
|
||||
is_last_subgroup = (i + subgroup_size) >= len(group)
|
||||
# First any() returns bool pd.Series for every column,
|
||||
# second any() returns single bool
|
||||
if (
|
||||
is_last_subgroup
|
||||
and previous_fill_values
|
||||
and subgroup.isna().any().any()
|
||||
):
|
||||
# Use previous subgroup's fill values for remaining missing values
|
||||
subgroup = subgroup.fillna(previous_fill_values)
|
||||
|
||||
subgroup_dfs.append(subgroup)
|
||||
previous_fill_values = fill_values
|
||||
|
||||
# Concatenate all subgroups for this group
|
||||
if subgroup_dfs:
|
||||
|
||||
@@ -75,6 +75,8 @@ config["COUNTER_LOGGING"] = False
|
||||
config["METRIC_COMPARE"] = False
|
||||
config["METRIC_LOGGING"] = False
|
||||
|
||||
arch_config = {}
|
||||
|
||||
num_kernels = 3
|
||||
num_devices = 1
|
||||
|
||||
@@ -1326,6 +1328,7 @@ def test_roofline_missing_file_handling(binary_handler_profile_rocprof_compute):
|
||||
|
||||
try:
|
||||
from roofline import Roofline
|
||||
from utils.schema import Workload
|
||||
from utils.specs import generate_machine_specs
|
||||
|
||||
class MockArgs:
|
||||
@@ -1337,6 +1340,7 @@ def test_roofline_missing_file_handling(binary_handler_profile_rocprof_compute):
|
||||
|
||||
args = MockArgs()
|
||||
mspec = generate_machine_specs(None, None)
|
||||
workload = Workload()
|
||||
|
||||
workload_dir = test_utils.get_output_dir()
|
||||
|
||||
@@ -1351,7 +1355,9 @@ def test_roofline_missing_file_handling(binary_handler_profile_rocprof_compute):
|
||||
|
||||
roofline_instance = Roofline(args, mspec, run_parameters)
|
||||
|
||||
result = roofline_instance.cli_generate_plot("FP32")
|
||||
result = roofline_instance.cli_generate_plot(
|
||||
"FP32", workload, config, arch_config
|
||||
)
|
||||
|
||||
assert result is None
|
||||
|
||||
@@ -1378,6 +1384,7 @@ def test_roofline_invalid_datatype_cli(binary_handler_profile_rocprof_compute):
|
||||
|
||||
try:
|
||||
from roofline import Roofline
|
||||
from utils.schema import Workload
|
||||
from utils.specs import generate_machine_specs
|
||||
|
||||
class MockArgs:
|
||||
@@ -1389,6 +1396,7 @@ def test_roofline_invalid_datatype_cli(binary_handler_profile_rocprof_compute):
|
||||
|
||||
args = MockArgs()
|
||||
mspec = generate_machine_specs(None, None)
|
||||
workload = Workload()
|
||||
|
||||
run_parameters = {
|
||||
"workload_dir": test_utils.get_output_dir(),
|
||||
@@ -1401,7 +1409,9 @@ def test_roofline_invalid_datatype_cli(binary_handler_profile_rocprof_compute):
|
||||
|
||||
roofline_instance = Roofline(args, mspec, run_parameters)
|
||||
|
||||
result = roofline_instance.cli_generate_plot("INVALID_DATATYPE")
|
||||
result = roofline_instance.cli_generate_plot(
|
||||
"INVALID_DATATYPE", workload, config, arch_config
|
||||
)
|
||||
|
||||
assert result is None
|
||||
|
||||
|
||||
Reference in New Issue
Block a user