Add support for MI 100 with rocprofiler-sdk (#768)

* Add custom rocprofiler-sdk counter definitions file for MI 100

* Update CHANGELOG to mention that accumulation counters will not be
  collected when profiling on MI 100 using rocprofiler-sdk/rocprofv3

* Migrate accum_counters.yaml to code

[ROCm/rocprofiler-compute commit: a95a45d69a]
Этот коммит содержится в:
vedithal-amd
2025-06-26 09:03:18 -04:00
коммит произвёл GitHub
родитель 3a703cec00
Коммит 1d59cbb06d
7 изменённых файлов: 2974 добавлений и 200 удалений
+4 -1
Просмотреть файл
@@ -81,6 +81,7 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.
### Resolved issues
* Fixed MI 100 counters not being collected when rocprofv3 is used
* Fixed option specs-correction
* Fixed kernel name and kernel dispatch filtering when using rocprof v3
* Fixed not collecting TCC channel counters in rocprof v3
@@ -88,7 +89,9 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.
### Known issues
* Profiling on MI 100 will not work unless ROCPROF=rocprofv1 environment variable is explicitly provided
* On MI 100, accumulation counters will not be collected and the following metrics will not show up in analysis: Instruction Fetch Latency, Wavefront Occupancy, LDS Latency
* As a workaround, use the ROCPROF=rocprof environment variable to use rocprofv1 for profiling on MI 100
* GPU id filtering is not supported when using rocprof v3
* Analysis of previously collected workload data will not work due to sysinfo.csv schema change
-1
Просмотреть файл
@@ -33,7 +33,6 @@ from pathlib import Path
import pandas as pd
import config
from utils.logger import (
console_debug,
console_error,
-58
Просмотреть файл
@@ -1,58 +0,0 @@
rocprofiler-sdk:
counters-schema-version: 1
counters:
- name: SQ_IFETCH_LEVEL_ACCUM
description: 'SQ_IFETCH_LEVEL accumulation'
properties: []
definitions:
- architectures:
- gfx942
- gfx941
- gfx940
- gfx90a
- gfx950
expression: accumulate(SQ_IFETCH_LEVEL, HIGH_RES)
- name: SQ_INST_LEVEL_LDS_ACCUM
description: 'SQ_INST_LEVEL_LDS accumulation'
properties: []
definitions:
- architectures:
- gfx942
- gfx941
- gfx940
- gfx90a
- gfx950
expression: accumulate(SQ_INST_LEVEL_LDS, HIGH_RES)
- name: SQ_INST_LEVEL_SMEM_ACCUM
description: 'SQ_INST_LEVEL_SMEM accumulation'
properties: []
definitions:
- architectures:
- gfx942
- gfx941
- gfx940
- gfx90a
- gfx950
expression: accumulate(SQ_INST_LEVEL_SMEM, HIGH_RES)
- name: SQ_INST_LEVEL_VMEM_ACCUM
description: 'SQ_INST_LEVEL_VMEM accumulation'
properties: []
definitions:
- architectures:
- gfx942
- gfx941
- gfx940
- gfx90a
- gfx950
expression: accumulate(SQ_INST_LEVEL_VMEM, HIGH_RES)
- name: SQ_LEVEL_WAVES_ACCUM
description: 'SQ_LEVEL_WAVES accumulation'
properties: []
definitions:
- architectures:
- gfx942
- gfx941
- gfx940
- gfx90a
- gfx950
expression: accumulate(SQ_LEVEL_WAVES, HIGH_RES)
Разница между файлами не показана из-за своего большого размера Загрузить разницу
+68 -26
Просмотреть файл
@@ -47,13 +47,11 @@ from utils.mi_gpu_spec import mi_gpu_specs
from utils.parser import build_in_vars, supported_denom
from utils.utils import (
add_counter_extra_config_input_yaml,
add_counter_from_source_to_target_extra_config_input_yaml,
capture_subprocess_output,
convert_metric_id_to_panel_idx,
detect_rocprof,
get_base_spi_pipe_counter,
get_submodules,
is_counter_existed_in_extra_input_yaml,
is_spi_pipe_counter,
is_tcc_channel_counter,
using_v3,
@@ -495,6 +493,18 @@ class OmniSoC_Base:
if "Name:" in line:
counters, _ = self.parse_counters_text(line.split(":")[1].strip())
rocprof_counters.update(counters)
# Custom counter support for mi100 for rocprofv3
if self._mspec.gpu_model.lower() == "mi100":
counter_defs_path = (
config.rocprof_compute_home
/ "rocprof_compute_soc"
/ "profile_configs"
/ "gfx908_counter_defs.yaml"
)
with open(counter_defs_path, "r") as fp:
counter_defs_contents = fp.read()
counters, _ = self.parse_counters_text(counter_defs_contents)
rocprof_counters.update(counters)
elif str(rocprof_cmd) == "rocprofiler-sdk":
MAX_STR = 256
@@ -556,6 +566,18 @@ class OmniSoC_Base:
rocprof_counters.add(
ctypes.cast(name_args, ctypes.c_char_p).value.decode("utf-8")
)
# Custom counter support for mi100 for rocprofiler-sdk
if self._mspec.gpu_model.lower() == "mi100":
counter_defs_path = (
config.rocprof_compute_home
/ "rocprof_compute_soc"
/ "profile_configs"
/ "gfx908_counter_defs.yaml"
)
with open(counter_defs_path, "r") as fp:
counter_defs_contents = fp.read()
counters, _ = self.parse_counters_text(counter_defs_contents)
rocprof_counters.update(counters)
else:
console_error(
@@ -750,18 +772,6 @@ class OmniSoC_Base:
else:
# Output to files
with open(
str(
Path(config.rocprof_compute_home).joinpath(
"rocprof_compute_soc",
"profile_configs",
"accum_counters.yaml",
)
),
"r",
) as fp:
accum_counters_def = yaml.safe_load(fp)
for f in output_files:
file_name_txt = str(Path(workload_perfmon_dir).joinpath(f.file_name_txt))
file_name_yaml = str(
@@ -777,16 +787,49 @@ class OmniSoC_Base:
]:
pmc.append(ctr)
if using_v3():
if is_counter_existed_in_extra_input_yaml(
accum_counters_def, ctr
) and not is_counter_existed_in_extra_input_yaml(
counter_def, ctr
):
counter_def = (
add_counter_from_source_to_target_extra_config_input_yaml(
accum_counters_def, counter_def, ctr
# MI 100 accumulate counters don't work with rocprofiler-sdk
if self._mspec.gpu_model.lower() != "mi100":
# Add accumulation counters definitions
if ctr == "SQ_IFETCH_LEVEL":
counter_def = add_counter_extra_config_input_yaml(
counter_def,
"SQ_IFETCH_LEVEL_ACCUM",
"SQ_IFETCH_LEVEL accumulation",
"accumulate(SQ_IFETCH_LEVEL, HIGH_RES)",
[self.__arch],
)
elif ctr == "SQ_INST_LEVEL_LDS":
counter_def = add_counter_extra_config_input_yaml(
counter_def,
"SQ_INST_LEVEL_LDS_ACCUM",
"SQ_INST_LEVEL_LDS accumulation",
"accumulate(SQ_INST_LEVEL_LDS, HIGH_RES)",
[self.__arch],
)
elif ctr == "SQ_INST_LEVEL_SMEM":
counter_def = add_counter_extra_config_input_yaml(
counter_def,
"SQ_INST_LEVEL_SMEM_ACCUM",
"SQ_INST_LEVEL_SMEM accumulation",
"accumulate(SQ_INST_LEVEL_SMEM, HIGH_RES)",
[self.__arch],
)
elif ctr == "SQ_INST_LEVEL_VMEM":
counter_def = add_counter_extra_config_input_yaml(
counter_def,
"SQ_INST_LEVEL_VMEM_ACCUM",
"SQ_INST_LEVEL_VMEM accumulation",
"accumulate(SQ_INST_LEVEL_VMEM, HIGH_RES)",
[self.__arch],
)
elif ctr == "SQ_LEVEL_WAVES":
counter_def = add_counter_extra_config_input_yaml(
counter_def,
"SQ_LEVEL_WAVES_ACCUM",
"SQ_LEVEL_WAVES accumulation",
"accumulate(SQ_LEVEL_WAVES, HIGH_RES)",
[self.__arch],
)
)
# Add TCC channel counters definitions
if is_tcc_channel_counter(ctr):
counter_name = ctr.split("[")[0]
@@ -813,10 +856,9 @@ class OmniSoC_Base:
fd.close()
# Write counter definitions to file
if using_v3():
if counter_def:
with open(file_name_yaml, "w") as fp:
if counter_def:
fp.write(yaml.dump(counter_def, sort_keys=False))
fp.write(yaml.dump(counter_def, sort_keys=False))
# Add a timestamp file
# TODO: Does v3 need this?
+1 -1
Просмотреть файл
@@ -34,7 +34,7 @@ class gfx908_soc(OmniSoC_Base):
def __init__(self, args, mspec):
super().__init__(args, mspec)
self.set_arch("gfx908")
self.set_compatible_profilers(["rocprofv1"])
self.set_compatible_profilers(["rocprofv1", "rocprofv3", "rocprofiler-sdk"])
# Per IP block max number of simultaneous counters. GFX IP Blocks
self.set_perfmon_config(mi_gpu_specs.get_perfmon_config("gfx908"))
+60 -113
Просмотреть файл
@@ -62,21 +62,6 @@ def is_tcc_channel_counter(counter):
return counter.startswith("TCC") and counter.endswith("]")
def is_counter_existed_in_extra_input_yaml(data: dict, counter_name: str) -> bool:
    """
    Check if a counter with the given name exists in the rocprofiler-sdk counters.

    The lookup reads ``data["rocprofiler-sdk"]["counters"]`` and compares each
    entry's ``"name"`` field against ``counter_name``. Missing or empty
    sections are treated as "no counters" rather than raising.

    Args:
        data (dict): The loaded YAML dictionary. May be ``None`` or empty,
            as happens when the YAML document itself is empty.
        counter_name (str): The name of the counter to check.

    Returns:
        bool: True if the counter exists, False otherwise.
    """
    # An empty YAML document loads as None, and a key written without a value
    # also loads as None; fall back to empty containers so the lookup below
    # never calls .get() on None or iterates over None.
    sdk_section = (data or {}).get("rocprofiler-sdk") or {}
    counters = sdk_section.get("counters") or []
    return any(
        # Ignore malformed (non-mapping) entries instead of crashing on them.
        isinstance(counter, dict) and counter.get("name") == counter_name
        for counter in counters
    )
def add_counter_extra_config_input_yaml(
data: dict,
counter_name: str,
@@ -172,46 +157,6 @@ def extract_counter_info_extra_config_input_yaml(
return None
def add_counter_from_source_to_target_extra_config_input_yaml(
    source_data: dict, target_data: dict, counter_name: str
) -> dict:
    """
    Copy one counter definition from a source YAML dictionary into a target one.

    The counter is looked up in ``source_data`` by name. Its name, description
    and properties, together with the architectures and expression of its
    first definition entry only, are forwarded to
    ``add_counter_extra_config_input_yaml`` along with ``target_data``.

    Args:
        source_data (dict): Source YAML dictionary to extract from.
        target_data (dict): Target YAML dictionary to add to.
        counter_name (str): Name of the counter to copy.

    Returns:
        dict: The value returned by ``add_counter_extra_config_input_yaml``.

    Raises:
        ValueError: If the counter is not found in ``source_data`` or has no
            definitions.
    """
    source_counter = extract_counter_info_extra_config_input_yaml(
        source_data, counter_name
    )
    if not source_counter:
        raise ValueError(f"Counter '{counter_name}' not found in source data")

    definition_list = source_counter.get("definitions", [])
    if not definition_list:
        raise ValueError(f"Counter '{counter_name}' has no definitions")

    # Only the first definition entry is carried over to the target.
    first_definition = definition_list[0]

    return add_counter_extra_config_input_yaml(
        target_data,
        counter_name=source_counter.get("name"),
        description=source_counter.get("description", ""),
        expression=first_definition.get("expression", ""),
        architectures=first_definition.get("architectures", []),
        properties=source_counter.get("properties", []),
    )
def is_spi_pipe_counter(counter):
for pattern in spi_pipe_counter_regexs:
if re.match(pattern, counter):
@@ -806,57 +751,66 @@ def run_prof(
else:
options = ["-A", "absolute"] + options
new_env = None
new_env = os.environ.copy()
path_counter_config_yaml = path(fname).with_suffix(".yaml")
if using_v3() and path_counter_config_yaml.exists():
if using_v3():
# Default counter definitions
if rocprof_cmd == "rocprofiler-sdk":
counter_defs_path = (
path(options["ROCP_TOOL_LIBRARIES"])
.resolve()
.parent.parent.parent.joinpath(
"share", "rocprofiler-sdk", "counter_defs.yaml"
)
)
else:
counter_defs_path = (
path(shutil.which(rocprof_cmd))
.resolve()
.parent.parent.joinpath("share", "rocprofiler-sdk", "counter_defs.yaml")
)
# Custom counter definitions for MI 100
if mspec.gpu_model.lower() == "mi100":
counter_defs_path = (
config.rocprof_compute_home
/ "rocprof_compute_soc"
/ "profile_configs"
/ "gfx908_counter_defs.yaml"
)
# Read counter definitions
with open(counter_defs_path, "r") as file:
counter_defs = yaml.safe_load(file)
# Get extra counter definitions
with open(path_counter_config_yaml, "r") as file:
extra_counter_defs = yaml.safe_load(file)
if extra_counter_defs:
# Get default counter definitions path
if rocprof_cmd == "rocprofiler-sdk":
counter_defs_path = (
path(options["ROCP_TOOL_LIBRARIES"])
.resolve()
.parent.parent.parent.joinpath(
"share", "rocprofiler-sdk", "counter_defs.yaml"
)
)
else:
counter_defs_path = (
path(shutil.which(rocprof_cmd))
.resolve()
.parent.parent.joinpath(
"share", "rocprofiler-sdk", "counter_defs.yaml"
)
)
# Get default counter definitions
with open(counter_defs_path, "r") as file:
counter_defs = yaml.safe_load(file)
# Merge counter definitions
path_counter_config_yaml = path(fname).with_suffix(".yaml")
if path_counter_config_yaml.exists():
with open(path_counter_config_yaml, "r") as file:
extra_counter_defs = yaml.safe_load(file)
# Merge extra counter definitions
counter_defs["rocprofiler-sdk"]["counters"].extend(
extra_counter_defs["rocprofiler-sdk"]["counters"]
)
# Write merged counter definitions to a temporary file
tmp_dir = tempfile.mkdtemp(prefix="rocprof_counter_defs_", dir="/tmp")
tmpfile_path = path(tmp_dir) / "counter_defs.yaml"
with open(tmpfile_path, "w") as tmpfile:
yaml.dump(
counter_defs, tmpfile, default_flow_style=False, sort_keys=False
)
# Set the environment variable to point to the temporary file
if not new_env:
new_env = os.environ.copy()
new_env["ROCPROFILER_METRICS_PATH"] = str(path(tmp_dir))
console_debug(
f"Adding env var for extra counters: ROCPROFILER_METRICS_PATH={new_env['ROCPROFILER_METRICS_PATH']}"
)
# Write counter definitions to a temporary file
tmpfile_path = (
path(tempfile.mkdtemp(prefix="rocprof_counter_defs_", dir="/tmp"))
/ "counter_defs.yaml"
)
with open(tmpfile_path, "w") as tmpfile:
yaml.dump(counter_defs, tmpfile, default_flow_style=False, sort_keys=False)
# Set rocprofiler sdk counter definitions
new_env["ROCPROFILER_METRICS_PATH"] = str(tmpfile_path.parent)
console_debug(
f"Adding env var for counter definitions: ROCPROFILER_METRICS_PATH={new_env['ROCPROFILER_METRICS_PATH']}"
)
# set required env var for mi300
if mspec.gpu_model.lower() not in ("mi50", "mi60", "mi210", "mi250", "mi250x"):
if not new_env:
new_env = os.environ.copy()
# set required env var for >= mi300
if mspec.gpu_model.lower() not in (
"mi50",
"mi60",
"mi100",
"mi210",
"mi250",
"mi250x",
):
new_env["ROCPROFILER_INDIVIDUAL_XCC_MODE"] = "1"
is_timestamps = False
@@ -866,8 +820,6 @@ def run_prof(
if rocprof_cmd == "rocprofiler-sdk":
app_cmd = options.pop("APP_CMD")
if not new_env:
new_env = os.environ.copy()
for key, value in options.items():
new_env[key] = value
console_debug("rocprof sdk env vars: {}".format(new_env))
@@ -878,14 +830,9 @@ def run_prof(
else:
console_debug("rocprof command: {}".format([rocprof_cmd] + options))
# profile the app
if new_env:
success, output = capture_subprocess_output(
[rocprof_cmd] + options, new_env=new_env, profileMode=True
)
else:
success, output = capture_subprocess_output(
[rocprof_cmd] + options, profileMode=True
)
success, output = capture_subprocess_output(
[rocprof_cmd] + options, new_env=new_env, profileMode=True
)
time_2 = time.time()
console_debug(
@@ -894,8 +841,8 @@ def run_prof(
)
)
# Delete temporary files
if new_env and "ROCPROFILER_METRICS_PATH" in new_env:
# Delete counter definition temporary directory
if new_env.get("ROCPROFILER_METRICS_PATH"):
shutil.rmtree(new_env["ROCPROFILER_METRICS_PATH"], ignore_errors=True)
if not success:
@@ -959,7 +906,7 @@ def run_prof(
workload_dir + "/out/pmc_1/results_" + fbase + ".csv", index=False
)
if new_env and not using_v3() and not using_v1():
if not using_v3() and not using_v1():
# flatten tcc for applicable mi300 input
f = path(workload_dir + "/out/pmc_1/results_" + fbase + ".csv")
xcds = mi_gpu_specs.get_num_xcds(