Enable rocpd output format with rocprofiler sdk (#790)

* Add `rocpd` choice for `--format-rocprof-output` option
* Add rocpd_data.py which defines SQL queries to extract data from rocpd database
* Use sqlite3 package to read the database
* Add `--retain-rocpd-output` option in profile mode to retain raw
  rocpd database
* Add warning notice to say `--format-rocprof-output rocpd` will be
  default in future release

For rocpd output:
* Use only `pmc_perf.csv` instead of reading individual coll_level results csv files
* Post process csv files using pandas in analysis mode instead of profile mode
* Use ACCUM counters instead of SQ_ACCUM_PREV_HIRES

* Add test cases for rocpd output format
* Fix code formatting issues
* Update CHANGELOG

[ROCm/rocprofiler-compute commit: 03d27c0ba0]
This commit is contained in:
vedithal-amd
2025-07-28 11:02:28 -04:00
committato da GitHub
parent 97465e7448
commit 17e5892614
39 ha cambiato i file con 19428 aggiunte e 37 eliminazioni
@@ -6,6 +6,26 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.
### Added
* Add `rocpd` choice for `--format-rocprof-output` option in profile mode
* Add `--retain-rocpd-output` option in profile mode to save large raw rocpd databases in workload directory
### Changed
* Add notice for change in default output format to `rocpd` in a future release
* This is displayed when `--format-rocprof-output rocpd` is not used in profile mode
* When `--format-rocprof-output rocpd` is used, only pmc_perf.csv will be written to workload directory instead of multiple csv files.
### Resolved issues
### Known issues
### Removed
## ROCm Compute Profiler 3.2.0 for ROCm 7.0.0
### Added
* Support Roofline plot on CLI (single run)
* Stochastic (hardware-based) PC sampling has been enabled for AMD Instinct MI300X series and later accelerators.
@@ -249,7 +249,7 @@ Examples:
required=False,
metavar="",
dest="format_rocprof_output",
choices=["json", "csv"],
choices=["json", "csv", "rocpd"],
default="csv",
help="\t\t\tSet the format of output file of rocprof.",
)
@@ -280,6 +280,13 @@ Examples:
default="/opt/rocm/lib/librocprofiler-sdk.so",
help="\t\t\tSet the path to rocprofiler SDK library.",
)
profile_group.add_argument(
"--retain-rocpd-output",
required=False,
default=False,
action="store_true",
help="\t\t\tRetain the large raw rocpd database in workload directory.\n\t\t\tThis option requires --format-rocprof-output rocpd.",
)
## Roofline Command Line Options
roofline_group.add_argument(
@@ -128,11 +128,13 @@ class OmniAnalyze_Base:
if not normalization_filter:
for k, v in self._arch_configs.items():
parser.build_metric_value_string(
v.dfs, v.dfs_type, self.__args.normal_unit
v.dfs, v.dfs_type, self.__args.normal_unit, self._profiling_config
)
else:
for k, v in self._arch_configs.items():
parser.build_metric_value_string(v.dfs, v.dfs_type, normalization_filter)
parser.build_metric_value_string(
v.dfs, v.dfs_type, normalization_filter, self._profiling_config
)
args = self.__args
# Error checking for multiple runs and multiple kernel filters
@@ -47,6 +47,7 @@ class cli_analysis(OmniAnalyze_Base):
self.get_args().spatial_multiplexing,
self.get_args().kernel_verbose,
self.get_args().verbose,
self._profiling_config,
)
if self.get_args().spatial_multiplexing:
@@ -72,7 +73,11 @@ class cli_analysis(OmniAnalyze_Base):
# create the loaded table
parser.load_table_data(
workload=self._runs[d[0]], dir=d[0], is_gui=False, args=self.get_args()
workload=self._runs[d[0]],
dir=d[0],
is_gui=False,
args=self.get_args(),
config=self._profiling_config,
)
@demarcate
@@ -120,6 +120,7 @@ class webui_analysis(OmniAnalyze_Base):
self.get_args().spatial_multiplexing,
self.get_args().kernel_verbose,
self.get_args().verbose,
self._profiling_config,
)
if self.get_args().spatial_multiplexing:
@@ -168,6 +169,7 @@ class webui_analysis(OmniAnalyze_Base):
dir=self.dest_dir,
is_gui=True,
args=self.get_args(),
config=self._profiling_config,
)
# ~~~~~~~~~~~~~~~~~~~~~~~
@@ -300,6 +302,7 @@ class webui_analysis(OmniAnalyze_Base):
self.get_args().spatial_multiplexing,
self.get_args().kernel_verbose,
args.verbose,
self._profiling_config,
)
if self.get_args().spatial_multiplexing:
@@ -169,6 +169,14 @@ class RocProfCompute:
)
self.__args = parser.parse_args()
if (
"format_rocprof_output" in self.__args
and self.__args.format_rocprof_output != "rocpd"
):
console_warning(
f"The option --format-rocprof-output currently set to {self.__args.format_rocprof_output} will default to rocpd in a future release."
)
if self.__args.mode == None:
if self.__args.specs:
print(generate_machine_specs(self.__args))
@@ -22,6 +22,7 @@
# SOFTWARE.
##############################################################################el
import csv
import glob
import logging
import os
@@ -68,6 +69,30 @@ class RocProfCompute_Base:
@demarcate
def join_prof(self, out=None):
"""Manually join separated rocprof runs"""
if self.get_args().format_rocprof_output == "rocpd":
# Vertically concat (by rows) results_*.csv into pmc_perf.csv
result_files = glob.glob(self.get_args().path + "/results_*.csv")
if out is None:
out = self.__args.path + "/pmc_perf.csv"
with open(out, "w", newline="") as outfile:
writer = None
for file in result_files:
with open(file, "r", newline="") as infile:
reader = csv.reader(infile)
header = next(reader)
# Write header only once
if writer is None:
writer = csv.writer(outfile)
writer.writerow(header)
for row in reader:
writer.writerow(row)
console_debug(f"Created file: {out}")
# Delete results_*.csv files
for file in result_files:
os.remove(file)
console_debug(f"Deleted file: {file}")
return
# Set default output directory if not specified
if type(self.__args.path) == str:
if out is None:
@@ -412,6 +437,7 @@ class RocProfCompute_Base:
mspec=self._soc._mspec,
loglevel=self.get_args().loglevel,
format_rocprof_output=self.get_args().format_rocprof_output,
retain_rocpd_output=self.get_args().retain_rocpd_output,
)
end_run_prof = time.time()
actual_profiling_duration = end_run_prof - start_run_prof
@@ -43,11 +43,6 @@ class rocprof_v3_profiler(RocProfCompute_Base):
def get_profiler_options(self, fname, soc):
app_cmd = shlex.split(self.get_args().remaining)
trace_option = "--kernel-trace"
rocprof_out_format = "json"
if self.get_args().format_rocprof_output == "csv":
rocprof_out_format = "csv"
if self.get_args().kokkos_trace:
trace_option = "--kokkos-trace"
# NOTE: --kokkos-trace feature is incomplete and is disabled for now.
@@ -63,7 +58,7 @@ class rocprof_v3_profiler(RocProfCompute_Base):
self.get_args().path + "/" + "out",
trace_option,
"--output-format",
rocprof_out_format,
self.get_args().format_rocprof_output,
]
# Kernel filtering
if self.get_args().kernel:
@@ -55,13 +55,10 @@ class rocprofiler_sdk_profiler(RocProfCompute_Base):
"ROCP_TOOL_LIBRARIES": rocprofiler_sdk_tool_path,
"LD_LIBRARY_PATH": rocm_libdir,
"ROCPROF_KERNEL_TRACE": "1",
"ROCPROF_OUTPUT_FORMAT": "json",
"ROCPROF_OUTPUT_FORMAT": self.get_args().format_rocprof_output,
"ROCPROF_OUTPUT_PATH": self.get_args().path + "/out/pmc_1",
}
if self.get_args().format_rocprof_output == "csv":
options["ROCPROF_OUTPUT_FORMAT"] = "csv"
if self.get_args().kokkos_trace:
# NOTE: --kokkos-trace feature is incomplete and is disabled for now.
console_error(
@@ -61,6 +61,7 @@ class tui_analysis(OmniAnalyze_Base):
self.get_args().spatial_multiplexing,
self.get_args().kernel_verbose,
self.get_args().verbose,
self._profiling_config,
)
if self.get_args().spatial_multiplexing:
@@ -90,6 +91,7 @@ class tui_analysis(OmniAnalyze_Base):
dir=self.path,
is_gui=False,
args=self.get_args(),
config=self._profiling_config,
)
def initalize_runs(self, normalization_filter=None):
@@ -35,6 +35,7 @@ import plotext as plt
import plotly.graph_objects as go
from dash import dcc, html
from utils import file_io, rocpd_data
from utils.logger import (
console_debug,
console_error,
@@ -673,6 +674,9 @@ class Roofline:
console_error("roofline", "{} does not exist".format(pmc_perf_csv))
t_df = OrderedDict()
t_df["pmc_perf"] = pd.read_csv(pmc_perf_csv)
profiling_config = file_io.load_profiling_config(self.__args.path[0][0])
if profiling_config.get("format_rocprof_output") == "rocpd":
t_df["pmc_perf"] = rocpd_data.process_rocpd_csv(t_df["pmc_perf"])
color_scheme = {
"HBM": "blue+",
@@ -861,6 +865,9 @@ class Roofline:
console_error("roofline", "{} does not exist".format(app_path))
t_df = OrderedDict()
t_df["pmc_perf"] = pd.read_csv(app_path)
profiling_config = file_io.load_profiling_config(self.__args.path)
if profiling_config.get("format_rocprof_output") == "rocpd":
t_df["pmc_perf"] = rocpd_data.process_rocpd_csv(t_df["pmc_perf"])
self.empirical_roofline(ret_df=t_df)
@abstractmethod
@@ -31,7 +31,7 @@ import pandas as pd
import yaml
import config
from utils import schema
from utils import rocpd_data, schema
from utils.kernel_name_shortener import kernel_name_shortener
from utils.logger import console_debug, console_error, console_log, demarcate
@@ -95,9 +95,7 @@ def load_profiling_config(config_dir):
prof_config = yaml.safe_load(file)
return prof_config
except FileNotFoundError:
console_log(
f"Could not find profiling_config.yaml in {config_dir} for filtering analysis report"
)
console_log(f"Could not find profiling_config.yaml in {config_dir}")
return dict()
@@ -195,7 +193,7 @@ def create_df_kernel_top_stats(
@demarcate
def create_df_pmc(
raw_data_root_dir, nodes, spatial_multiplexing, kernel_verbose, verbose
raw_data_root_dir, nodes, spatial_multiplexing, kernel_verbose, verbose, config
):
"""
Load all raw pmc counters and join into one df.
@@ -214,6 +212,8 @@ def create_df_pmc(
f == schema.pmc_perf_file_prefix + ".csv"
):
tmp_df = pd.read_csv(str(Path(root).joinpath(f)))
if config.get("format_rocprof_output") == "rocpd":
tmp_df = rocpd_data.process_rocpd_csv(tmp_df)
# Demangle original KernelNames
kernel_name_shortener(tmp_df, kernel_verbose)
@@ -271,7 +271,7 @@ class CodeTransformer(ast.NodeTransformer):
return node
def build_eval_string(equation, coll_level):
def build_eval_string(equation, coll_level, config):
"""
Convert user defined equation string to eval executable string
For example,
@@ -314,7 +314,14 @@ def build_eval_string(equation, coll_level):
# use .get() to catch any potential KeyErrors
s = re.sub(r"raw_pmc_df\['(.*?)']", r'raw_pmc_df.get("\1")', s)
# apply coll_level
s = re.sub(r"raw_pmc_df", "raw_pmc_df.get('" + coll_level + "')", s)
if config.get("format_rocprof_output") == "rocpd":
# Replace SQ_ACCUM_PREV_HIRES with coll_level_ACCUM then ignore coll_level df
s = re.sub(f"SQ_ACCUM_PREV_HIRES", f"{coll_level}_ACCUM", s)
s = re.sub(
r"raw_pmc_df", "raw_pmc_df.get('" + schema.pmc_perf_file_prefix + "')", s
)
else:
s = re.sub(r"raw_pmc_df", "raw_pmc_df.get('" + coll_level + "')", s)
# print("--- build_eval_string, return: ", s)
return s
@@ -653,7 +660,7 @@ def build_dfs(archConfigs, filter_metrics, sys_info):
setattr(archConfigs, "metric_counters", metric_counters)
def build_metric_value_string(dfs, dfs_type, normal_unit):
def build_metric_value_string(dfs, dfs_type, normal_unit, profiling_config):
"""
Apply the real eval string to its field in the metric_table df.
"""
@@ -674,6 +681,7 @@ def build_metric_value_string(dfs, dfs_type, normal_unit):
df.at[row_idx_label, expr] = build_eval_string(
df.at[row_idx_label, expr],
df.at[row_idx_label, "coll_level"],
profiling_config,
)
elif expr.lower() == "unit" or expr.lower() == "units":
@@ -683,7 +691,7 @@ def build_metric_value_string(dfs, dfs_type, normal_unit):
@demarcate
def eval_metric(dfs, dfs_type, sys_info, raw_pmc_df, debug):
def eval_metric(dfs, dfs_type, sys_info, raw_pmc_df, debug, config):
"""
Execute the expr string for each metric in the df.
"""
@@ -784,7 +792,7 @@ def eval_metric(dfs, dfs_type, sys_info, raw_pmc_df, debug):
if "PER_XCD" not in key:
continue
# NB: assume all built-in vars from pmc_perf.csv for now
s = build_eval_string(value, schema.pmc_perf_file_prefix)
s = build_eval_string(value, schema.pmc_perf_file_prefix, config)
try:
ammolite__build_in[key] = eval(compile(s, "<string>", "eval"))
except TypeError:
@@ -801,7 +809,7 @@ def eval_metric(dfs, dfs_type, sys_info, raw_pmc_df, debug):
if "PER_XCD" in key:
continue
# NB: assume all built-in vars from pmc_perf.csv for now
s = build_eval_string(value, schema.pmc_perf_file_prefix)
s = build_eval_string(value, schema.pmc_perf_file_prefix, config)
try:
ammolite__build_in[key] = eval(compile(s, "<string>", "eval"))
except TypeError:
@@ -1437,7 +1445,7 @@ def load_kernel_top(workload, dir, args):
@demarcate
def load_table_data(workload, dir, is_gui, args, skipKernelTop=False):
def load_table_data(workload, dir, is_gui, args, config, skipKernelTop=False):
"""
- Load data for all "raw_csv_table"
- Load dat for "pc_sampling_table"
@@ -1452,6 +1460,7 @@ def load_table_data(workload, dir, is_gui, args, skipKernelTop=False):
workload.sys_info.iloc[0],
apply_filters(workload, dir, is_gui, args.debug),
args.debug,
config,
)
@@ -0,0 +1,94 @@
import csv
import sqlite3
from contextlib import closing
from utils.logger import console_error
# From schema definition in source/share/rocprofiler-sdk-rocpd/data_views.sql in rocprofiler-sdk repository
# Each selected column is aliased to the name used as the CSV header; the result
# is in "long" form, i.e. one row per (dispatch, counter) pair carrying
# Counter_Name / Counter_Value.
COUNTERS_COLLECTION_QUERY = """
SELECT
agent_id as GPU_ID,
dispatch_id as Dispatch_ID,
grid_size as Grid_Size,
workgroup_size as Workgroup_Size,
lds_block_size as LDS_Per_Workgroup,
scratch_size as Scratch_Per_Workitem,
vgpr_count as Arch_VGPR,
accum_vgpr_count as Accum_VGPR,
sgpr_count as SGPR,
kernel_name as Kernel_Name,
start as Start_Timestamp,
end as End_Timestamp,
kernel_id as Kernel_ID,
counter_name as Counter_Name,
value as Counter_Value
FROM counters_collection
"""


def convert_db_to_csv(
    db_path: str,
    csv_file_path: str,
) -> None:
    """
    Read rocpd database and write to CSV file

    Runs COUNTERS_COLLECTION_QUERY against the SQLite database at ``db_path``
    and streams the result into ``csv_file_path``. The first CSV row is the
    header, taken from the column aliases of the query; every following row is
    one result row of the query.

    Args:
        db_path: Path of the rocpd SQLite database to read.
        csv_file_path: Path of the CSV file to create (overwritten if present).

    Any SQLite or file-system error is reported through ``console_error``.
    NOTE(review): whether ``console_error`` terminates the process is decided
    in utils.logger and is not visible from here.
    """
    # Read counters_collection view from the database and write to CSV
    try:
        with closing(sqlite3.connect(db_path)) as conn:
            with closing(conn.execute(COUNTERS_COLLECTION_QUERY)) as cursor:
                # Pin the encoding: the CSV is read back later with pandas,
                # whose default is UTF-8, so the platform locale must not
                # decide how kernel names are written.
                with open(
                    csv_file_path, "w", newline="", encoding="utf-8"
                ) as csvfile:
                    writer = csv.writer(csvfile)
                    writer.writerow(
                        [description[0] for description in cursor.description]
                    )
                    # Stream rows straight from the cursor; nothing is
                    # materialized in memory.
                    writer.writerows(cursor)
    except (sqlite3.Error, OSError) as e:
        # sqlite3.Error is the base class of every sqlite3 exception
        # (DatabaseError included); OSError is what IOError aliases.
        console_error(f"Error converting database to CSV: {e}")
def process_rocpd_csv(df):
    """
    Merge counters across unique dispatches from the input dataframe and return processed dataframe.

    The input is expected in "long" form: one row per (dispatch, counter) pair
    with ``Counter_Name`` / ``Counter_Value`` columns next to the dispatch
    metadata columns. The result is in "wide" form: one row per group, with
    every counter name turned into its own column.

    For each group:
      * metadata columns take the value of the group's first row;
      * if a counter name occurs more than once, the last occurrence wins;
      * ``End_Timestamp`` is the median of (End_Timestamp - Start_Timestamp)
        over the group's rows and ``Start_Timestamp`` is set to 0.0.

    Afterwards ``GPU_ID`` is replaced by its dense rank starting at 0 and
    ``Dispatch_ID`` is renumbered 0..N-1 in output order.

    Args:
        df: pandas DataFrame in the long form described above.

    Returns:
        A new pandas DataFrame in wide form. For an empty input an empty
        DataFrame with the fixed (non-counter) columns is returned.
    """
    # Only import pandas if needed
    import pandas as pd

    # Columns that identify one dispatch of one kernel configuration
    group_keys = [
        "Dispatch_ID",
        "Kernel_Name",
        "Grid_Size",
        "Workgroup_Size",
        "LDS_Per_Workgroup",
    ]
    # Metadata copied verbatim from the first row of each group
    static_columns = [
        "GPU_ID",
        "Grid_Size",
        "Workgroup_Size",
        "LDS_Per_Workgroup",
        "Scratch_Per_Workitem",
        "Arch_VGPR",
        "Accum_VGPR",
        "SGPR",
        "Kernel_Name",
        "Kernel_ID",
    ]

    # Without any rows there is nothing to group and the column look-ups
    # further down would raise KeyError, so return a well-formed empty frame.
    if df.empty:
        return pd.DataFrame(
            columns=[
                *static_columns,
                "End_Timestamp",
                "Start_Timestamp",
                "Dispatch_ID",
            ]
        )

    data = []
    # Group by unique kernel and merge into a single row
    for _, group_df in df.groupby(group_keys):
        row = {column: group_df[column].iloc[0] for column in static_columns}
        # Each counter will become its own column
        row.update(dict(zip(group_df["Counter_Name"], group_df["Counter_Value"])))
        # Replace end timestamp with median of durations of group, start timestamp is set to 0
        row["End_Timestamp"] = (
            group_df["End_Timestamp"] - group_df["Start_Timestamp"]
        ).median()
        row["Start_Timestamp"] = 0.0
        data.append(row)

    result = pd.DataFrame(data)
    # Rank GPU IDs, map lowest number to 0, next to 1, etc.
    result["GPU_ID"] = result["GPU_ID"].rank(method="dense").astype(int) - 1
    # Reset dispatch IDs
    result["Dispatch_ID"] = range(len(result))
    return result
@@ -45,6 +45,7 @@ import pandas as pd
import yaml
import config
from utils import rocpd_data
from utils.logger import (
console_debug,
console_error,
@@ -707,9 +708,14 @@ def parse_text(text_file):
def run_prof(
fname, profiler_options, workload_dir, mspec, loglevel, format_rocprof_output
fname,
profiler_options,
workload_dir,
mspec,
loglevel,
format_rocprof_output,
retain_rocpd_output=False,
):
time_0 = time.time()
fbase = path(fname).stem
console_debug("pmc file: %s" % path(fname).name)
@@ -831,7 +837,29 @@ def run_prof(
results_files = []
if rocprof_cmd.endswith("v2"):
if format_rocprof_output == "rocpd":
if rocprof_cmd == "rocprofiler-sdk" or rocprof_cmd.endswith("v3"):
# Write results_fbase.csv
rocpd_data.convert_db_to_csv(
glob.glob(workload_dir + "/out/pmc_1/*/*.db")[0],
workload_dir + f"/results_{fbase}.csv",
)
if retain_rocpd_output:
shutil.copyfile(
glob.glob(workload_dir + "/out/pmc_1/*/*.db")[0],
workload_dir + "/" + fbase + ".db",
)
console_warning(
f"Retaining large raw rocpd database: {workload_dir}/{fbase}.db"
)
# Remove temp directory
shutil.rmtree(workload_dir + "/" + "out")
return
else:
console_error(
"rocpd output format is only supported with rocprofiler-sdk or rocprofv3."
)
elif rocprof_cmd.endswith("v2"):
# rocprofv2 has separate csv files for each process
results_files = glob.glob(workload_dir + "/out/pmc_1/results_*.csv")
@@ -1058,7 +1086,6 @@ def process_rocprofv3_output(rocprof_output, workload_dir, is_timestamps):
else:
# when the input is not for timestamps, and counter csv file is not generated, we assume failed rocprof run and will completely bypass the file generation and merging for current pmc
results_files_csv = []
else:
console_error("The output file of rocprofv3 can only support json or csv!!!")
@@ -38,6 +38,7 @@ indirs = [
"tests/workloads/vcopy/MI200",
"tests/workloads/vcopy/MI300A_A1",
"tests/workloads/vcopy/MI300X_A1",
"tests/workloads/vcopy/MI300X_A1_rocpd",
"tests/workloads/vcopy/MI350",
]
@@ -266,7 +267,11 @@ def test_dispatch_5(binary_handler_analyze_rocprof_compute):
@pytest.mark.misc
def test_gpu_ids(binary_handler_analyze_rocprof_compute):
for dir in indirs:
if dir.endswith("MI350"):
# if dir.endswith("MI350") or dir.endswith("MI300X_A1_rocpd"):
if dir in (
"tests/workloads/vcopy/MI350",
"tests/workloads/vcopy/MI300X_A1_rocpd",
):
gpu_id = "0"
else:
gpu_id = "2"
@@ -783,12 +788,12 @@ def test_parser_error_handling():
from utils.parser import build_eval_string, calc_builtin_var, update_denom_string
try:
build_eval_string("AVG(SQ_WAVES)", None)
build_eval_string("AVG(SQ_WAVES)", None, config={})
assert False, "Should have raised exception for None coll_level"
except Exception as e:
assert "coll_level can not be None" in str(e)
assert build_eval_string("", "pmc_perf") == ""
assert build_eval_string("", "pmc_perf", config={}) == ""
assert update_denom_string("", "per_wave") == ""
class MockSysInfo:
@@ -813,12 +818,12 @@ def test_parser_error_handling():
from utils.parser import build_eval_string, calc_builtin_var, update_denom_string
try:
build_eval_string("AVG(SQ_WAVES)", None)
build_eval_string("AVG(SQ_WAVES)", None, config={})
assert False, "Should have raised exception for None coll_level"
except Exception as e:
assert "coll_level can not be None" in str(e)
assert build_eval_string("", "pmc_perf") == ""
assert build_eval_string("", "pmc_perf", config={}) == ""
assert update_denom_string("", "per_wave") == ""
class MockSysInfo:
@@ -943,7 +948,7 @@ def test_analyze_with_debug_mode(binary_handler_analyze_rocprof_compute):
}
try:
eval_metric(mock_dfs, mock_dfs_type, sys_info, raw_pmc_df, debug=True)
eval_metric(mock_dfs, mock_dfs_type, sys_info, raw_pmc_df, debug=True, config={})
except Exception as e:
pass
@@ -45,6 +45,13 @@ def test_analyze_vcopy_MI200(binary_handler_analyze_rocprof_compute):
assert code == 0
def test_analyze_vcopy_MI300_rocpd(binary_handler_analyze_rocprof_compute):
    """Analyze mode must exit with code 0 on the rocpd-format vcopy workload."""
    analyze_args = ["analyze", "--path", "tests/workloads/vcopy/MI300X_A1_rocpd"]
    exit_code = binary_handler_analyze_rocprof_compute(analyze_args)
    assert exit_code == 0
def test_analyze_ipblocks_TCP_MI300X_A1(binary_handler_analyze_rocprof_compute):
code = binary_handler_analyze_rocprof_compute(
["analyze", "--path", "tests/workloads/ipblocks_TCP/MI300X_A1"]
@@ -572,6 +572,21 @@ def test_path(binary_handler_profile_rocprof_compute):
test_utils.clean_output_dir(config["cleanup"], workload_dir)
@pytest.mark.misc
def test_path_rocpd(binary_handler_profile_rocprof_compute):
    """Profile with ``--format-rocprof-output rocpd`` and check the outputs.

    Expects pmc_perf.csv to exist in the workload directory, the chosen format
    to be recorded in profiling_config.yaml, and pmc_perf.csv to contain the
    ``Counter_Name`` text.
    """
    workload_dir = test_utils.get_output_dir()
    binary_handler_profile_rocprof_compute(
        config, workload_dir, ["--format-rocprof-output", "rocpd"]
    )

    pmc_perf_csv = Path(workload_dir) / "pmc_perf.csv"
    assert pmc_perf_csv.exists()

    # The selected output format must be persisted for analysis mode
    format_recorded = test_utils.check_file_pattern(
        "format_rocprof_output: rocpd", f"{workload_dir}/profiling_config.yaml"
    )
    assert format_recorded

    # The CSV is expected to carry a Counter_Name column in rocpd mode
    has_counter_column = test_utils.check_file_pattern(
        "Counter_Name", f"{workload_dir}/pmc_perf.csv"
    )
    assert has_counter_column

    test_utils.clean_output_dir(config["cleanup"], workload_dir)
@pytest.mark.misc
def test_roof_kernel_names(binary_handler_profile_rocprof_compute):
if soc in ("MI100"):
@@ -711,6 +726,45 @@ def test_roof_file_validation(binary_handler_profile_rocprof_compute):
test_utils.clean_output_dir(config["cleanup"], workload_dir)
@pytest.mark.misc
def test_roof_rocpd(binary_handler_profile_rocprof_compute):
    """Run a roofline-only profile with rocpd output and check the outputs.

    Expects pmc_perf.csv and roofline.csv to exist in the workload directory,
    the chosen format to be recorded in profiling_config.yaml, and
    pmc_perf.csv to contain the ``Counter_Name`` text.
    """
    if soc == "MI100":
        # roofline is not supported on MI100; skip like the other roofline
        # tests in this file instead of failing on the missing roofline.csv
        pytest.skip("Skipping roofline test for MI100")

    workload_dir = test_utils.get_output_dir()
    options = ["--device", "0", "--roof-only", "--format-rocprof-output", "rocpd"]
    binary_handler_profile_rocprof_compute(config, workload_dir, options, roof=True)

    assert (Path(workload_dir) / "pmc_perf.csv").exists()
    assert (Path(workload_dir) / "roofline.csv").exists()
    assert test_utils.check_file_pattern(
        "format_rocprof_output: rocpd", f"{workload_dir}/profiling_config.yaml"
    )
    assert test_utils.check_file_pattern("Counter_Name", f"{workload_dir}/pmc_perf.csv")

    test_utils.clean_output_dir(config["cleanup"], workload_dir)
@pytest.mark.misc
def test_roofline_kernel_names_validation_error(binary_handler_profile_rocprof_compute):
    """
    Test validate_parameters() error: --roof-only is required for --kernel-names
    This should trigger console_error("--roof-only is required for --kernel-names")
    """
    # `soc in ("MI100")` would be a substring test against a plain string
    # (the parentheses do not make a tuple); compare for equality instead.
    if soc == "MI100":
        # roofline is not supported on MI100
        # pytest.skip() raises, so nothing after it runs
        pytest.skip("Skipping roofline test for MI100")

    options = ["--device", "0", "--kernel-names"]  # missing --roof-only
    workload_dir = test_utils.get_output_dir()

    returncode = binary_handler_profile_rocprof_compute(
        config, workload_dir, options, check_success=False, roof=True
    )

    # The run is expected to be rejected, i.e. a non-zero exit status
    assert returncode != 0
    test_utils.clean_output_dir(config["cleanup"], workload_dir)
@pytest.mark.misc
def test_roofline_workload_dir_not_set_error():
"""
@@ -1612,7 +1666,7 @@ def test_comprehensive_error_paths():
assert result == 16
try:
build_eval_string("test", None)
build_eval_string("test", None, config={})
assert False, "Should raise exception for None coll_level"
except Exception as e:
assert "coll_level can not be None" in str(e)
@@ -0,0 +1,399 @@
[profiling] pre-processing using rocprofv3 profiler
[gen_sysinfo]
Incomplete class definition for gfx942. Expecting populated max_mclk but detected None.
Incomplete class definition for gfx942. Expecting populated cur_mclk but detected None.
Missing specs fields for gfx942
starting "run_profiling" and about to start rocprof's workload
[profiling] performing profiling using rocprofv3 profiler
Rocprofiler-Compute version: 3.2.0
Profiler choice: rocprofv3
Path: /app/workloads/vcopy/MI300X_A1
Target: MI300X_A1
Command: sample/vcopy -n 1048576 -b 256 -i 3
Kernel Selection: None
Dispatch Selection: None
Hardware Blocks: []
Report Sections: []
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Collecting Performance Counters
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[Run 1/12][Approximate profiling time left: pending first measurement...]
[profiling] Current input file: /app/workloads/vcopy/MI300X_A1/perfmon/SQ_IFETCH_LEVEL.txt
pmc file: SQ_IFETCH_LEVEL.txt
Adding env var for counter definitions: ROCPROFILER_METRICS_PATH=/tmp/rocprof_counter_defs_w2ranh0p
rocprof command: ['rocprofv3', '-A', 'absolute', '-i', '/app/workloads/vcopy/MI300X_A1/perfmon/SQ_IFETCH_LEVEL.txt', '-d', '/app/workloads/vcopy/MI300X_A1/out', '--kernel-trace', '--output-format', 'rocpd', '--', 'sample/vcopy', '-n', '1048576', '-b', '256', '-i', '3']
[subprocess] Running: rocprofv3 -A absolute -i /app/workloads/vcopy/MI300X_A1/perfmon/SQ_IFETCH_LEVEL.txt -d /app/workloads/vcopy/MI300X_A1/out --kernel-trace --output-format rocpd -- sample/vcopy -n 1048576 -b 256 -i 3
|-> [rocprofv3] W20250710 21:33:25.713947 140172743004992 simple_timer.cpp:55] [rocprofv3] tool initialization :: 0.166549 sec
|-> [rocprofv3] W20250710 21:33:25.714215 140172743004992 simple_timer.cpp:55] [rocprofv3] 'sample/vcopy -n 1048576 -b 256 -i 3' :: 0.000000 sec
|-> [rocprofv3] W20250710 21:33:25.941891 140172743004992 tool.cpp:2150] HSA version 8.18.0 initialized (instance=0)
|-> [rocprofv3] W20250710 21:33:26.335688 140172743004992 simple_timer.cpp:55] [rocprofv3] 'sample/vcopy -n 1048576 -b 256 -i 3' :: 0.621474 sec
|-> [rocprofv3] W20250710 21:33:26.351137 140172743004992 generateRocpd.cpp:580] writing SQL database for process 3554 on node 3224294684
|-> [rocprofv3] E20250710 21:33:26.351912 140172743004992 generateRocpd.cpp:603] Opened result file: /app/workloads/vcopy/MI300X_A1/out/pmc_1/7f6eaef84eaf/3554_results.db (UUID=00031de8-fbce-7bce-9912-6a8f9645ae7f)
|-> [rocprofv3] W20250710 21:33:26.391902 140172743004992 simple_timer.cpp:55] SQLite3 generation :: rocpd_string :: 0.014565 sec
|-> [rocprofv3] W20250710 21:33:26.392284 140172743004992 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_node :: 0.000363 sec
|-> [rocprofv3] W20250710 21:33:26.393046 140172743004992 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_process :: 0.000752 sec
|-> [rocprofv3] W20250710 21:33:26.406145 140172743004992 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_agent :: 0.012446 sec
|-> [rocprofv3] W20250710 21:33:27.791204 140172743004992 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_pmc :: 1.385042 sec
|-> [rocprofv3] W20250710 21:33:27.792304 140172743004992 simple_timer.cpp:55] SQLite3 generation :: rocpd kernel info :: 0.001065 sec
|-> [rocprofv3] W20250710 21:33:27.792315 140172743004992 simple_timer.cpp:55] SQLite3 generation :: rocpd_region :: 0.000002 sec
|-> [rocprofv3] W20250710 21:33:27.798288 140172743004992 simple_timer.cpp:55] SQLite3 generation :: rocpd_kernel_dispatch :: 0.005969 sec
|-> [rocprofv3] W20250710 21:33:27.827005 140172743004992 simple_timer.cpp:55] SQLite3 generation :: rocpd_pmc_event :: 0.028702 sec
|-> [rocprofv3] W20250710 21:33:27.827022 140172743004992 simple_timer.cpp:55] SQLite3 generation :: rocpd_memory_copy :: 0.000000 sec
|-> [rocprofv3] W20250710 21:33:27.827026 140172743004992 simple_timer.cpp:55] SQLite3 generation :: rocpd_memory_allocate :: 0.000000 sec
|-> [rocprofv3] W20250710 21:33:27.827218 140172743004992 simple_timer.cpp:55] SQLite3 generation :: SQL indexing :: 0.000190 sec
|-> [rocprofv3] W20250710 21:33:27.827762 140172743004992 simple_timer.cpp:55] SQLite3 generation :: total :: 1.476625 sec
|-> [rocprofv3] W20250710 21:33:27.832297 140172743004992 simple_timer.cpp:55] [rocprofv3] tool finalization :: 1.495785 sec
|-> [rocprofv3] vcopy testing on GCD 0
|-> [rocprofv3] Finished allocating vectors on the CPU
Finishing subprocess of fname /app/workloads/vcopy/MI300X_A1/perfmon/SQ_IFETCH_LEVEL.txt, the time it takes was 0 m 2.7194788455963135 sec
The time of run_prof of /app/workloads/vcopy/MI300X_A1/perfmon/SQ_IFETCH_LEVEL.txt is 0 m 3.8476221561431885 sec
[Run 2/12][Approximate profiling time left: 38 seconds]...
[profiling] Current input file: /app/workloads/vcopy/MI300X_A1/perfmon/SQ_INST_LEVEL_LDS.txt
pmc file: SQ_INST_LEVEL_LDS.txt
Adding env var for counter definitions: ROCPROFILER_METRICS_PATH=/tmp/rocprof_counter_defs_wp1tt6p6
rocprof command: ['rocprofv3', '-A', 'absolute', '-i', '/app/workloads/vcopy/MI300X_A1/perfmon/SQ_INST_LEVEL_LDS.txt', '-d', '/app/workloads/vcopy/MI300X_A1/out', '--kernel-trace', '--output-format', 'rocpd', '--', 'sample/vcopy', '-n', '1048576', '-b', '256', '-i', '3']
[subprocess] Running: rocprofv3 -A absolute -i /app/workloads/vcopy/MI300X_A1/perfmon/SQ_INST_LEVEL_LDS.txt -d /app/workloads/vcopy/MI300X_A1/out --kernel-trace --output-format rocpd -- sample/vcopy -n 1048576 -b 256 -i 3
|-> [rocprofv3] W20250710 21:33:29.552773 140238077793088 simple_timer.cpp:55] [rocprofv3] tool initialization :: 0.168769 sec
|-> [rocprofv3] W20250710 21:33:29.553044 140238077793088 simple_timer.cpp:55] [rocprofv3] 'sample/vcopy -n 1048576 -b 256 -i 3' :: 0.000000 sec
|-> [rocprofv3] W20250710 21:33:29.736798 140238077793088 tool.cpp:2150] HSA version 8.18.0 initialized (instance=0)
|-> [rocprofv3] W20250710 21:33:30.154960 140238077793088 simple_timer.cpp:55] [rocprofv3] 'sample/vcopy -n 1048576 -b 256 -i 3' :: 0.601916 sec
|-> [rocprofv3] W20250710 21:33:30.170439 140238077793088 generateRocpd.cpp:580] writing SQL database for process 3561 on node 3224294684
|-> [rocprofv3] E20250710 21:33:30.171201 140238077793088 generateRocpd.cpp:603] Opened result file: /app/workloads/vcopy/MI300X_A1/out/pmc_1/7f6eaef84eaf/3561_results.db (UUID=00031de9-0ace-7ace-9f47-5cc55ac49931)
|-> [rocprofv3] W20250710 21:33:30.211098 140238077793088 simple_timer.cpp:55] SQLite3 generation :: rocpd_string :: 0.014766 sec
|-> [rocprofv3] W20250710 21:33:30.211463 140238077793088 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_node :: 0.000347 sec
|-> [rocprofv3] W20250710 21:33:30.212224 140238077793088 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_process :: 0.000751 sec
|-> [rocprofv3] W20250710 21:33:30.224678 140238077793088 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_agent :: 0.011904 sec
|-> [rocprofv3] W20250710 21:33:31.620420 140238077793088 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_pmc :: 1.395725 sec
|-> [rocprofv3] W20250710 21:33:31.621558 140238077793088 simple_timer.cpp:55] SQLite3 generation :: rocpd kernel info :: 0.001104 sec
|-> [rocprofv3] W20250710 21:33:31.621569 140238077793088 simple_timer.cpp:55] SQLite3 generation :: rocpd_region :: 0.000002 sec
|-> [rocprofv3] W20250710 21:33:31.627417 140238077793088 simple_timer.cpp:55] SQLite3 generation :: rocpd_kernel_dispatch :: 0.005843 sec
|-> [rocprofv3] W20250710 21:33:31.658664 140238077793088 simple_timer.cpp:55] SQLite3 generation :: rocpd_pmc_event :: 0.031232 sec
|-> [rocprofv3] W20250710 21:33:31.658684 140238077793088 simple_timer.cpp:55] SQLite3 generation :: rocpd_memory_copy :: 0.000000 sec
|-> [rocprofv3] W20250710 21:33:31.658688 140238077793088 simple_timer.cpp:55] SQLite3 generation :: rocpd_memory_allocate :: 0.000001 sec
|-> [rocprofv3] W20250710 21:33:31.658887 140238077793088 simple_timer.cpp:55] SQLite3 generation :: SQL indexing :: 0.000198 sec
|-> [rocprofv3] W20250710 21:33:31.659456 140238077793088 simple_timer.cpp:55] SQLite3 generation :: total :: 1.489016 sec
|-> [rocprofv3] W20250710 21:33:31.664027 140238077793088 simple_timer.cpp:55] [rocprofv3] tool finalization :: 1.508410 sec
|-> [rocprofv3] vcopy testing on GCD 0
|-> [rocprofv3] Finished allocating vectors on the CPU
Finishing subprocess of fname /app/workloads/vcopy/MI300X_A1/perfmon/SQ_INST_LEVEL_LDS.txt, the time it takes was 0 m 2.7174251079559326 sec
The time of run_prof of /app/workloads/vcopy/MI300X_A1/perfmon/SQ_INST_LEVEL_LDS.txt is 0 m 3.833453416824341 sec
[Run 3/12][Approximate profiling time left: 34 seconds]...
[profiling] Current input file: /app/workloads/vcopy/MI300X_A1/perfmon/SQ_INST_LEVEL_SMEM.txt
pmc file: SQ_INST_LEVEL_SMEM.txt
Adding env var for counter definitions: ROCPROFILER_METRICS_PATH=/tmp/rocprof_counter_defs_dl9hsbp8
rocprof command: ['rocprofv3', '-A', 'absolute', '-i', '/app/workloads/vcopy/MI300X_A1/perfmon/SQ_INST_LEVEL_SMEM.txt', '-d', '/app/workloads/vcopy/MI300X_A1/out', '--kernel-trace', '--output-format', 'rocpd', '--', 'sample/vcopy', '-n', '1048576', '-b', '256', '-i', '3']
[subprocess] Running: rocprofv3 -A absolute -i /app/workloads/vcopy/MI300X_A1/perfmon/SQ_INST_LEVEL_SMEM.txt -d /app/workloads/vcopy/MI300X_A1/out --kernel-trace --output-format rocpd -- sample/vcopy -n 1048576 -b 256 -i 3
|-> [rocprofv3] W20250710 21:33:33.415380 140711413728064 simple_timer.cpp:55] [rocprofv3] tool initialization :: 0.170888 sec
|-> [rocprofv3] W20250710 21:33:33.415655 140711413728064 simple_timer.cpp:55] [rocprofv3] 'sample/vcopy -n 1048576 -b 256 -i 3' :: 0.000000 sec
|-> [rocprofv3] W20250710 21:33:33.649099 140711413728064 tool.cpp:2150] HSA version 8.18.0 initialized (instance=0)
|-> [rocprofv3] W20250710 21:33:34.063816 140711413728064 simple_timer.cpp:55] [rocprofv3] 'sample/vcopy -n 1048576 -b 256 -i 3' :: 0.648161 sec
|-> [rocprofv3] W20250710 21:33:34.079514 140711413728064 generateRocpd.cpp:580] writing SQL database for process 3568 on node 3224294684
|-> [rocprofv3] E20250710 21:33:34.080280 140711413728064 generateRocpd.cpp:603] Opened result file: /app/workloads/vcopy/MI300X_A1/out/pmc_1/7f6eaef84eaf/3568_results.db (UUID=00031de9-19e2-79e2-befc-6c52d81fce09)
|-> [rocprofv3] W20250710 21:33:34.120796 140711413728064 simple_timer.cpp:55] SQLite3 generation :: rocpd_string :: 0.014830 sec
|-> [rocprofv3] W20250710 21:33:34.121112 140711413728064 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_node :: 0.000299 sec
|-> [rocprofv3] W20250710 21:33:34.121859 140711413728064 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_process :: 0.000738 sec
|-> [rocprofv3] W20250710 21:33:34.134532 140711413728064 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_agent :: 0.012161 sec
|-> [rocprofv3] W20250710 21:33:35.536345 140711413728064 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_pmc :: 1.401796 sec
|-> [rocprofv3] W20250710 21:33:35.537554 140711413728064 simple_timer.cpp:55] SQLite3 generation :: rocpd kernel info :: 0.001175 sec
|-> [rocprofv3] W20250710 21:33:35.537565 140711413728064 simple_timer.cpp:55] SQLite3 generation :: rocpd_region :: 0.000002 sec
|-> [rocprofv3] W20250710 21:33:35.543474 140711413728064 simple_timer.cpp:55] SQLite3 generation :: rocpd_kernel_dispatch :: 0.005905 sec
|-> [rocprofv3] W20250710 21:33:35.574172 140711413728064 simple_timer.cpp:55] SQLite3 generation :: rocpd_pmc_event :: 0.030682 sec
|-> [rocprofv3] W20250710 21:33:35.574191 140711413728064 simple_timer.cpp:55] SQLite3 generation :: rocpd_memory_copy :: 0.000000 sec
|-> [rocprofv3] W20250710 21:33:35.574194 140711413728064 simple_timer.cpp:55] SQLite3 generation :: rocpd_memory_allocate :: 0.000001 sec
|-> [rocprofv3] W20250710 21:33:35.574389 140711413728064 simple_timer.cpp:55] SQLite3 generation :: SQL indexing :: 0.000193 sec
|-> [rocprofv3] W20250710 21:33:35.574956 140711413728064 simple_timer.cpp:55] SQLite3 generation :: total :: 1.495442 sec
|-> [rocprofv3] W20250710 21:33:35.579445 140711413728064 simple_timer.cpp:55] [rocprofv3] tool finalization :: 1.514702 sec
|-> [rocprofv3] vcopy testing on GCD 0
|-> [rocprofv3] Finished allocating vectors on the CPU
Finishing subprocess of fname /app/workloads/vcopy/MI300X_A1/perfmon/SQ_INST_LEVEL_SMEM.txt, the time it takes was 0 m 2.7634034156799316 sec
The time of run_prof of /app/workloads/vcopy/MI300X_A1/perfmon/SQ_INST_LEVEL_SMEM.txt is 0 m 3.9122095108032227 sec
[Run 4/12][Approximate profiling time left: 30 seconds]...
[profiling] Current input file: /app/workloads/vcopy/MI300X_A1/perfmon/SQ_INST_LEVEL_VMEM.txt
pmc file: SQ_INST_LEVEL_VMEM.txt
Adding env var for counter definitions: ROCPROFILER_METRICS_PATH=/tmp/rocprof_counter_defs_20mmgyh1
rocprof command: ['rocprofv3', '-A', 'absolute', '-i', '/app/workloads/vcopy/MI300X_A1/perfmon/SQ_INST_LEVEL_VMEM.txt', '-d', '/app/workloads/vcopy/MI300X_A1/out', '--kernel-trace', '--output-format', 'rocpd', '--', 'sample/vcopy', '-n', '1048576', '-b', '256', '-i', '3']
[subprocess] Running: rocprofv3 -A absolute -i /app/workloads/vcopy/MI300X_A1/perfmon/SQ_INST_LEVEL_VMEM.txt -d /app/workloads/vcopy/MI300X_A1/out --kernel-trace --output-format rocpd -- sample/vcopy -n 1048576 -b 256 -i 3
|-> [rocprofv3] W20250710 21:33:36.918470 140619354510144 simple_timer.cpp:55] [rocprofv3] tool initialization :: 0.104106 sec
|-> [rocprofv3] W20250710 21:33:36.918753 140619354510144 simple_timer.cpp:55] [rocprofv3] 'sample/vcopy -n 1048576 -b 256 -i 3' :: 0.000000 sec
|-> [rocprofv3] W20250710 21:33:37.102945 140619354510144 tool.cpp:2150] HSA version 8.18.0 initialized (instance=0)
|-> [rocprofv3] W20250710 21:33:37.299363 140619354510144 simple_timer.cpp:55] [rocprofv3] 'sample/vcopy -n 1048576 -b 256 -i 3' :: 0.380610 sec
|-> [rocprofv3] W20250710 21:33:37.314710 140619354510144 generateRocpd.cpp:580] writing SQL database for process 3575 on node 3224294684
|-> [rocprofv3] E20250710 21:33:37.315474 140619354510144 generateRocpd.cpp:603] Opened result file: /app/workloads/vcopy/MI300X_A1/out/pmc_1/7f6eaef84eaf/3575_results.db (UUID=00031de9-27d0-77d0-99c1-6bed5f3fcae1)
|-> [rocprofv3] W20250710 21:33:37.350755 140619354510144 simple_timer.cpp:55] SQLite3 generation :: rocpd_string :: 0.010948 sec
|-> [rocprofv3] W20250710 21:33:37.351102 140619354510144 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_node :: 0.000328 sec
|-> [rocprofv3] W20250710 21:33:37.351614 140619354510144 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_process :: 0.000503 sec
|-> [rocprofv3] W20250710 21:33:37.364570 140619354510144 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_agent :: 0.012404 sec
|-> [rocprofv3] W20250710 21:33:38.033478 140619354510144 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_pmc :: 0.668889 sec
|-> [rocprofv3] W20250710 21:33:38.034553 140619354510144 simple_timer.cpp:55] SQLite3 generation :: rocpd kernel info :: 0.001043 sec
|-> [rocprofv3] W20250710 21:33:38.034565 140619354510144 simple_timer.cpp:55] SQLite3 generation :: rocpd_region :: 0.000002 sec
|-> [rocprofv3] W20250710 21:33:38.040348 140619354510144 simple_timer.cpp:55] SQLite3 generation :: rocpd_kernel_dispatch :: 0.005778 sec
|-> [rocprofv3] W20250710 21:33:38.053614 140619354510144 simple_timer.cpp:55] SQLite3 generation :: rocpd_pmc_event :: 0.013255 sec
|-> [rocprofv3] W20250710 21:33:38.053627 140619354510144 simple_timer.cpp:55] SQLite3 generation :: rocpd_memory_copy :: 0.000000 sec
|-> [rocprofv3] W20250710 21:33:38.053630 140619354510144 simple_timer.cpp:55] SQLite3 generation :: rocpd_memory_allocate :: 0.000001 sec
|-> [rocprofv3] W20250710 21:33:38.053815 140619354510144 simple_timer.cpp:55] SQLite3 generation :: SQL indexing :: 0.000183 sec
|-> [rocprofv3] W20250710 21:33:38.054183 140619354510144 simple_timer.cpp:55] SQLite3 generation :: total :: 0.739472 sec
|-> [rocprofv3] W20250710 21:33:38.057863 140619354510144 simple_timer.cpp:55] [rocprofv3] tool finalization :: 0.757852 sec
|-> [rocprofv3] vcopy testing on GCD 0
|-> [rocprofv3] Finished allocating vectors on the CPU
Finishing subprocess of fname /app/workloads/vcopy/MI300X_A1/perfmon/SQ_INST_LEVEL_VMEM.txt, the time it takes was 0 m 1.6674315929412842 sec
The time of run_prof of /app/workloads/vcopy/MI300X_A1/perfmon/SQ_INST_LEVEL_VMEM.txt is 0 m 2.43190336227417 sec
[Run 5/12][Approximate profiling time left: 24 seconds]...
[profiling] Current input file: /app/workloads/vcopy/MI300X_A1/perfmon/SQ_LEVEL_WAVES.txt
pmc file: SQ_LEVEL_WAVES.txt
Adding env var for counter definitions: ROCPROFILER_METRICS_PATH=/tmp/rocprof_counter_defs_4_rrldsg
rocprof command: ['rocprofv3', '-A', 'absolute', '-i', '/app/workloads/vcopy/MI300X_A1/perfmon/SQ_LEVEL_WAVES.txt', '-d', '/app/workloads/vcopy/MI300X_A1/out', '--kernel-trace', '--output-format', 'rocpd', '--', 'sample/vcopy', '-n', '1048576', '-b', '256', '-i', '3']
[subprocess] Running: rocprofv3 -A absolute -i /app/workloads/vcopy/MI300X_A1/perfmon/SQ_LEVEL_WAVES.txt -d /app/workloads/vcopy/MI300X_A1/out --kernel-trace --output-format rocpd -- sample/vcopy -n 1048576 -b 256 -i 3
|-> [rocprofv3] W20250710 21:33:39.561464 139705878659904 simple_timer.cpp:55] [rocprofv3] tool initialization :: 0.135631 sec
|-> [rocprofv3] W20250710 21:33:39.561722 139705878659904 simple_timer.cpp:55] [rocprofv3] 'sample/vcopy -n 1048576 -b 256 -i 3' :: 0.000000 sec
|-> [rocprofv3] W20250710 21:33:39.744744 139705878659904 tool.cpp:2150] HSA version 8.18.0 initialized (instance=0)
|-> [rocprofv3] W20250710 21:33:40.044125 139705878659904 simple_timer.cpp:55] [rocprofv3] 'sample/vcopy -n 1048576 -b 256 -i 3' :: 0.482403 sec
|-> [rocprofv3] W20250710 21:33:40.059697 139705878659904 generateRocpd.cpp:580] writing SQL database for process 3582 on node 3224294684
|-> [rocprofv3] E20250710 21:33:40.060465 139705878659904 generateRocpd.cpp:603] Opened result file: /app/workloads/vcopy/MI300X_A1/out/pmc_1/7f6eaef84eaf/3582_results.db (UUID=00031de9-3205-7205-869c-ec5f02149d7e)
|-> [rocprofv3] W20250710 21:33:40.097027 139705878659904 simple_timer.cpp:55] SQLite3 generation :: rocpd_string :: 0.012617 sec
|-> [rocprofv3] W20250710 21:33:40.097312 139705878659904 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_node :: 0.000266 sec
|-> [rocprofv3] W20250710 21:33:40.097962 139705878659904 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_process :: 0.000641 sec
|-> [rocprofv3] W20250710 21:33:40.110168 139705878659904 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_agent :: 0.011707 sec
|-> [rocprofv3] W20250710 21:33:41.116521 139705878659904 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_pmc :: 1.006337 sec
|-> [rocprofv3] W20250710 21:33:41.117628 139705878659904 simple_timer.cpp:55] SQLite3 generation :: rocpd kernel info :: 0.001074 sec
|-> [rocprofv3] W20250710 21:33:41.117640 139705878659904 simple_timer.cpp:55] SQLite3 generation :: rocpd_region :: 0.000002 sec
|-> [rocprofv3] W20250710 21:33:41.123520 139705878659904 simple_timer.cpp:55] SQLite3 generation :: rocpd_kernel_dispatch :: 0.005876 sec
|-> [rocprofv3] W20250710 21:33:41.143166 139705878659904 simple_timer.cpp:55] SQLite3 generation :: rocpd_pmc_event :: 0.019632 sec
|-> [rocprofv3] W20250710 21:33:41.143182 139705878659904 simple_timer.cpp:55] SQLite3 generation :: rocpd_memory_copy :: 0.000000 sec
|-> [rocprofv3] W20250710 21:33:41.143186 139705878659904 simple_timer.cpp:55] SQLite3 generation :: rocpd_memory_allocate :: 0.000001 sec
|-> [rocprofv3] W20250710 21:33:41.143382 139705878659904 simple_timer.cpp:55] SQLite3 generation :: SQL indexing :: 0.000194 sec
|-> [rocprofv3] W20250710 21:33:41.143922 139705878659904 simple_timer.cpp:55] SQLite3 generation :: total :: 1.084224 sec
|-> [rocprofv3] W20250710 21:33:41.148481 139705878659904 simple_timer.cpp:55] [rocprofv3] tool finalization :: 1.103637 sec
|-> [rocprofv3] vcopy testing on GCD 0
|-> [rocprofv3] Finished allocating vectors on the CPU
Finishing subprocess of fname /app/workloads/vcopy/MI300X_A1/perfmon/SQ_LEVEL_WAVES.txt, the time it takes was 0 m 2.1514015197753906 sec
The time of run_prof of /app/workloads/vcopy/MI300X_A1/perfmon/SQ_LEVEL_WAVES.txt is 0 m 3.113880157470703 sec
[Run 6/12][Approximate profiling time left: 20 seconds]...
[profiling] Current input file: /app/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_0.txt
pmc file: pmc_perf_0.txt
Adding env var for counter definitions: ROCPROFILER_METRICS_PATH=/tmp/rocprof_counter_defs_b7cf79sp
rocprof command: ['rocprofv3', '-A', 'absolute', '-i', '/app/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_0.txt', '-d', '/app/workloads/vcopy/MI300X_A1/out', '--kernel-trace', '--output-format', 'rocpd', '--', 'sample/vcopy', '-n', '1048576', '-b', '256', '-i', '3']
[subprocess] Running: rocprofv3 -A absolute -i /app/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_0.txt -d /app/workloads/vcopy/MI300X_A1/out --kernel-trace --output-format rocpd -- sample/vcopy -n 1048576 -b 256 -i 3
|-> [rocprofv3] W20250710 21:33:42.470522 139873853399872 simple_timer.cpp:55] [rocprofv3] tool initialization :: 0.104173 sec
|-> [rocprofv3] W20250710 21:33:42.470776 139873853399872 simple_timer.cpp:55] [rocprofv3] 'sample/vcopy -n 1048576 -b 256 -i 3' :: 0.000000 sec
|-> [rocprofv3] W20250710 21:33:42.655197 139873853399872 tool.cpp:2150] HSA version 8.18.0 initialized (instance=0)
|-> [rocprofv3] W20250710 21:33:42.849277 139873853399872 simple_timer.cpp:55] [rocprofv3] 'sample/vcopy -n 1048576 -b 256 -i 3' :: 0.378502 sec
|-> [rocprofv3] W20250710 21:33:42.865054 139873853399872 generateRocpd.cpp:580] writing SQL database for process 3589 on node 3224294684
|-> [rocprofv3] E20250710 21:33:42.865845 139873853399872 generateRocpd.cpp:603] Opened result file: /app/workloads/vcopy/MI300X_A1/out/pmc_1/7f6eaef84eaf/3589_results.db (UUID=00031de9-3d82-7d82-96dc-2dcf0a0631dc)
|-> [rocprofv3] W20250710 21:33:42.903272 139873853399872 simple_timer.cpp:55] SQLite3 generation :: rocpd_string :: 0.011102 sec
|-> [rocprofv3] W20250710 21:33:42.903557 139873853399872 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_node :: 0.000267 sec
|-> [rocprofv3] W20250710 21:33:42.904080 139873853399872 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_process :: 0.000512 sec
|-> [rocprofv3] W20250710 21:33:42.916637 139873853399872 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_agent :: 0.012050 sec
|-> [rocprofv3] W20250710 21:33:43.580464 139873853399872 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_pmc :: 0.663812 sec
|-> [rocprofv3] W20250710 21:33:43.581638 139873853399872 simple_timer.cpp:55] SQLite3 generation :: rocpd kernel info :: 0.001143 sec
|-> [rocprofv3] W20250710 21:33:43.581649 139873853399872 simple_timer.cpp:55] SQLite3 generation :: rocpd_region :: 0.000002 sec
|-> [rocprofv3] W20250710 21:33:43.587455 139873853399872 simple_timer.cpp:55] SQLite3 generation :: rocpd_kernel_dispatch :: 0.005801 sec
|-> [rocprofv3] W20250710 21:33:43.599109 139873853399872 simple_timer.cpp:55] SQLite3 generation :: rocpd_pmc_event :: 0.011640 sec
|-> [rocprofv3] W20250710 21:33:43.599125 139873853399872 simple_timer.cpp:55] SQLite3 generation :: rocpd_memory_copy :: 0.000000 sec
|-> [rocprofv3] W20250710 21:33:43.599128 139873853399872 simple_timer.cpp:55] SQLite3 generation :: rocpd_memory_allocate :: 0.000001 sec
|-> [rocprofv3] W20250710 21:33:43.599317 139873853399872 simple_timer.cpp:55] SQLite3 generation :: SQL indexing :: 0.000186 sec
|-> [rocprofv3] W20250710 21:33:43.599679 139873853399872 simple_timer.cpp:55] SQLite3 generation :: total :: 0.734625 sec
|-> [rocprofv3] W20250710 21:33:43.603395 139873853399872 simple_timer.cpp:55] [rocprofv3] tool finalization :: 0.753253 sec
|-> [rocprofv3] vcopy testing on GCD 0
|-> [rocprofv3] Finished allocating vectors on the CPU
Finishing subprocess of fname /app/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_0.txt, the time it takes was 0 m 1.6624979972839355 sec
The time of run_prof of /app/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_0.txt is 0 m 2.4318082332611084 sec
[Run 7/12][Approximate profiling time left: 16 seconds]...
[profiling] Current input file: /app/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_1.txt
pmc file: pmc_perf_1.txt
Adding env var for counter definitions: ROCPROFILER_METRICS_PATH=/tmp/rocprof_counter_defs_i5zi8g1c
rocprof command: ['rocprofv3', '-A', 'absolute', '-i', '/app/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_1.txt', '-d', '/app/workloads/vcopy/MI300X_A1/out', '--kernel-trace', '--output-format', 'rocpd', '--', 'sample/vcopy', '-n', '1048576', '-b', '256', '-i', '3']
[subprocess] Running: rocprofv3 -A absolute -i /app/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_1.txt -d /app/workloads/vcopy/MI300X_A1/out --kernel-trace --output-format rocpd -- sample/vcopy -n 1048576 -b 256 -i 3
|-> [rocprofv3] W20250710 21:33:44.921375 140329810051904 simple_timer.cpp:55] [rocprofv3] tool initialization :: 0.104611 sec
|-> [rocprofv3] W20250710 21:33:44.921646 140329810051904 simple_timer.cpp:55] [rocprofv3] 'sample/vcopy -n 1048576 -b 256 -i 3' :: 0.000000 sec
|-> [rocprofv3] W20250710 21:33:45.127446 140329810051904 tool.cpp:2150] HSA version 8.18.0 initialized (instance=0)
|-> [rocprofv3] W20250710 21:33:45.322745 140329810051904 simple_timer.cpp:55] [rocprofv3] 'sample/vcopy -n 1048576 -b 256 -i 3' :: 0.401100 sec
|-> [rocprofv3] W20250710 21:33:45.338315 140329810051904 generateRocpd.cpp:580] writing SQL database for process 3596 on node 3224294684
|-> [rocprofv3] E20250710 21:33:45.339069 140329810051904 generateRocpd.cpp:603] Opened result file: /app/workloads/vcopy/MI300X_A1/out/pmc_1/7f6eaef84eaf/3596_results.db (UUID=00031de9-4713-7713-aca4-263b5bf68657)
|-> [rocprofv3] W20250710 21:33:45.374491 140329810051904 simple_timer.cpp:55] SQLite3 generation :: rocpd_string :: 0.010790 sec
|-> [rocprofv3] W20250710 21:33:45.374878 140329810051904 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_node :: 0.000369 sec
|-> [rocprofv3] W20250710 21:33:45.375436 140329810051904 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_process :: 0.000547 sec
|-> [rocprofv3] W20250710 21:33:45.388444 140329810051904 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_agent :: 0.012406 sec
|-> [rocprofv3] W20250710 21:33:46.060238 140329810051904 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_pmc :: 0.671777 sec
|-> [rocprofv3] W20250710 21:33:46.061333 140329810051904 simple_timer.cpp:55] SQLite3 generation :: rocpd kernel info :: 0.001063 sec
|-> [rocprofv3] W20250710 21:33:46.061344 140329810051904 simple_timer.cpp:55] SQLite3 generation :: rocpd_region :: 0.000002 sec
|-> [rocprofv3] W20250710 21:33:46.067223 140329810051904 simple_timer.cpp:55] SQLite3 generation :: rocpd_kernel_dispatch :: 0.005874 sec
|-> [rocprofv3] W20250710 21:33:46.078638 140329810051904 simple_timer.cpp:55] SQLite3 generation :: rocpd_pmc_event :: 0.011402 sec
|-> [rocprofv3] W20250710 21:33:46.078650 140329810051904 simple_timer.cpp:55] SQLite3 generation :: rocpd_memory_copy :: 0.000000 sec
|-> [rocprofv3] W20250710 21:33:46.078653 140329810051904 simple_timer.cpp:55] SQLite3 generation :: rocpd_memory_allocate :: 0.000001 sec
|-> [rocprofv3] W20250710 21:33:46.078838 140329810051904 simple_timer.cpp:55] SQLite3 generation :: SQL indexing :: 0.000182 sec
|-> [rocprofv3] W20250710 21:33:46.079192 140329810051904 simple_timer.cpp:55] SQLite3 generation :: total :: 0.740877 sec
|-> [rocprofv3] W20250710 21:33:46.082891 140329810051904 simple_timer.cpp:55] [rocprofv3] tool finalization :: 0.759142 sec
|-> [rocprofv3] vcopy testing on GCD 0
|-> [rocprofv3] Finished allocating vectors on the CPU
Finishing subprocess of fname /app/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_1.txt, the time it takes was 0 m 1.6950774192810059 sec
The time of run_prof of /app/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_1.txt is 0 m 2.480154514312744 sec
[Run 8/12][Approximate profiling time left: 12 seconds]...
[profiling] Current input file: /app/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_2.txt
pmc file: pmc_perf_2.txt
Adding env var for counter definitions: ROCPROFILER_METRICS_PATH=/tmp/rocprof_counter_defs_tv_uqwoh
rocprof command: ['rocprofv3', '-A', 'absolute', '-i', '/app/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_2.txt', '-d', '/app/workloads/vcopy/MI300X_A1/out', '--kernel-trace', '--output-format', 'rocpd', '--', 'sample/vcopy', '-n', '1048576', '-b', '256', '-i', '3']
[subprocess] Running: rocprofv3 -A absolute -i /app/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_2.txt -d /app/workloads/vcopy/MI300X_A1/out --kernel-trace --output-format rocpd -- sample/vcopy -n 1048576 -b 256 -i 3
|-> [rocprofv3] W20250710 21:33:47.392035 140613043073856 simple_timer.cpp:55] [rocprofv3] tool initialization :: 0.103348 sec
|-> [rocprofv3] W20250710 21:33:47.392296 140613043073856 simple_timer.cpp:55] [rocprofv3] 'sample/vcopy -n 1048576 -b 256 -i 3' :: 0.000000 sec
|-> [rocprofv3] W20250710 21:33:47.575114 140613043073856 tool.cpp:2150] HSA version 8.18.0 initialized (instance=0)
|-> [rocprofv3] W20250710 21:33:47.769792 140613043073856 simple_timer.cpp:55] [rocprofv3] 'sample/vcopy -n 1048576 -b 256 -i 3' :: 0.377496 sec
|-> [rocprofv3] W20250710 21:33:47.785435 140613043073856 generateRocpd.cpp:580] writing SQL database for process 3603 on node 3224294684
|-> [rocprofv3] E20250710 21:33:47.786191 140613043073856 generateRocpd.cpp:603] Opened result file: /app/workloads/vcopy/MI300X_A1/out/pmc_1/7f6eaef84eaf/3603_results.db (UUID=00031de9-50bd-70bd-ae26-e3dd40c28d4c)
|-> [rocprofv3] W20250710 21:33:47.820358 140613043073856 simple_timer.cpp:55] SQLite3 generation :: rocpd_string :: 0.010878 sec
|-> [rocprofv3] W20250710 21:33:47.820688 140613043073856 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_node :: 0.000313 sec
|-> [rocprofv3] W20250710 21:33:47.821208 140613043073856 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_process :: 0.000510 sec
|-> [rocprofv3] W20250710 21:33:47.833615 140613043073856 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_agent :: 0.011855 sec
|-> [rocprofv3] W20250710 21:33:48.485409 140613043073856 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_pmc :: 0.651779 sec
|-> [rocprofv3] W20250710 21:33:48.486280 140613043073856 simple_timer.cpp:55] SQLite3 generation :: rocpd kernel info :: 0.000839 sec
|-> [rocprofv3] W20250710 21:33:48.486291 140613043073856 simple_timer.cpp:55] SQLite3 generation :: rocpd_region :: 0.000002 sec
|-> [rocprofv3] W20250710 21:33:48.492148 140613043073856 simple_timer.cpp:55] SQLite3 generation :: rocpd_kernel_dispatch :: 0.005851 sec
|-> [rocprofv3] W20250710 21:33:48.503742 140613043073856 simple_timer.cpp:55] SQLite3 generation :: rocpd_pmc_event :: 0.011578 sec
|-> [rocprofv3] W20250710 21:33:48.503758 140613043073856 simple_timer.cpp:55] SQLite3 generation :: rocpd_memory_copy :: 0.000000 sec
|-> [rocprofv3] W20250710 21:33:48.503762 140613043073856 simple_timer.cpp:55] SQLite3 generation :: rocpd_memory_allocate :: 0.000000 sec
|-> [rocprofv3] W20250710 21:33:48.503952 140613043073856 simple_timer.cpp:55] SQLite3 generation :: SQL indexing :: 0.000188 sec
|-> [rocprofv3] W20250710 21:33:48.504347 140613043073856 simple_timer.cpp:55] SQLite3 generation :: total :: 0.718912 sec
|-> [rocprofv3] W20250710 21:33:48.508036 140613043073856 simple_timer.cpp:55] [rocprofv3] tool finalization :: 0.737530 sec
|-> [rocprofv3] vcopy testing on GCD 0
|-> [rocprofv3] Finished allocating vectors on the CPU
Finishing subprocess of fname /app/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_2.txt, the time it takes was 0 m 1.6442956924438477 sec
The time of run_prof of /app/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_2.txt is 0 m 2.423243761062622 sec
[Run 9/12][Approximate profiling time left: 9 seconds]...
[profiling] Current input file: /app/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_3.txt
pmc file: pmc_perf_3.txt
Adding env var for counter definitions: ROCPROFILER_METRICS_PATH=/tmp/rocprof_counter_defs_d6edxy7z
rocprof command: ['rocprofv3', '-A', 'absolute', '-i', '/app/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_3.txt', '-d', '/app/workloads/vcopy/MI300X_A1/out', '--kernel-trace', '--output-format', 'rocpd', '--', 'sample/vcopy', '-n', '1048576', '-b', '256', '-i', '3']
[subprocess] Running: rocprofv3 -A absolute -i /app/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_3.txt -d /app/workloads/vcopy/MI300X_A1/out --kernel-trace --output-format rocpd -- sample/vcopy -n 1048576 -b 256 -i 3
|-> [rocprofv3] W20250710 21:33:49.825653 139970175718208 simple_timer.cpp:55] [rocprofv3] tool initialization :: 0.103156 sec
|-> [rocprofv3] W20250710 21:33:49.825903 139970175718208 simple_timer.cpp:55] [rocprofv3] 'sample/vcopy -n 1048576 -b 256 -i 3' :: 0.000000 sec
|-> [rocprofv3] W20250710 21:33:50.011954 139970175718208 tool.cpp:2150] HSA version 8.18.0 initialized (instance=0)
|-> [rocprofv3] W20250710 21:33:50.207081 139970175718208 simple_timer.cpp:55] [rocprofv3] 'sample/vcopy -n 1048576 -b 256 -i 3' :: 0.381179 sec
|-> [rocprofv3] W20250710 21:33:50.222580 139970175718208 generateRocpd.cpp:580] writing SQL database for process 3610 on node 3224294684
|-> [rocprofv3] E20250710 21:33:50.223351 139970175718208 generateRocpd.cpp:603] Opened result file: /app/workloads/vcopy/MI300X_A1/out/pmc_1/7f6eaef84eaf/3610_results.db (UUID=00031de9-5a3e-7a3e-8845-b724df0c5328)
|-> [rocprofv3] W20250710 21:33:50.277220 139970175718208 simple_timer.cpp:55] SQLite3 generation :: rocpd_string :: 0.030272 sec
|-> [rocprofv3] W20250710 21:33:50.277672 139970175718208 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_node :: 0.000428 sec
|-> [rocprofv3] W20250710 21:33:50.278189 139970175718208 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_process :: 0.000509 sec
|-> [rocprofv3] W20250710 21:33:50.290370 139970175718208 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_agent :: 0.011683 sec
|-> [rocprofv3] W20250710 21:33:50.954577 139970175718208 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_pmc :: 0.664191 sec
|-> [rocprofv3] W20250710 21:33:50.955670 139970175718208 simple_timer.cpp:55] SQLite3 generation :: rocpd kernel info :: 0.001063 sec
|-> [rocprofv3] W20250710 21:33:50.955682 139970175718208 simple_timer.cpp:55] SQLite3 generation :: rocpd_region :: 0.000002 sec
|-> [rocprofv3] W20250710 21:33:50.961629 139970175718208 simple_timer.cpp:55] SQLite3 generation :: rocpd_kernel_dispatch :: 0.005942 sec
|-> [rocprofv3] W20250710 21:33:50.973364 139970175718208 simple_timer.cpp:55] SQLite3 generation :: rocpd_pmc_event :: 0.011722 sec
|-> [rocprofv3] W20250710 21:33:50.973378 139970175718208 simple_timer.cpp:55] SQLite3 generation :: rocpd_memory_copy :: 0.000000 sec
|-> [rocprofv3] W20250710 21:33:50.973381 139970175718208 simple_timer.cpp:55] SQLite3 generation :: rocpd_memory_allocate :: 0.000000 sec
|-> [rocprofv3] W20250710 21:33:50.973569 139970175718208 simple_timer.cpp:55] SQLite3 generation :: SQL indexing :: 0.000187 sec
|-> [rocprofv3] W20250710 21:33:50.973960 139970175718208 simple_timer.cpp:55] SQLite3 generation :: total :: 0.751380 sec
|-> [rocprofv3] W20250710 21:33:50.977619 139970175718208 simple_timer.cpp:55] [rocprofv3] tool finalization :: 0.769843 sec
|-> [rocprofv3] vcopy testing on GCD 0
|-> [rocprofv3] Finished allocating vectors on the CPU
Finishing subprocess of fname /app/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_3.txt, the time it takes was 0 m 1.6794917583465576 sec
The time of run_prof of /app/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_3.txt is 0 m 2.4662797451019287 sec
[Run 10/12][Approximate profiling time left: 5 seconds]...
[profiling] Current input file: /app/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_4.txt
pmc file: pmc_perf_4.txt
Adding env var for counter definitions: ROCPROFILER_METRICS_PATH=/tmp/rocprof_counter_defs_7pmy1aeg
rocprof command: ['rocprofv3', '-A', 'absolute', '-i', '/app/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_4.txt', '-d', '/app/workloads/vcopy/MI300X_A1/out', '--kernel-trace', '--output-format', 'rocpd', '--', 'sample/vcopy', '-n', '1048576', '-b', '256', '-i', '3']
[subprocess] Running: rocprofv3 -A absolute -i /app/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_4.txt -d /app/workloads/vcopy/MI300X_A1/out --kernel-trace --output-format rocpd -- sample/vcopy -n 1048576 -b 256 -i 3
|-> [rocprofv3] W20250710 21:33:52.291220 140318446199616 simple_timer.cpp:55] [rocprofv3] tool initialization :: 0.102577 sec
|-> [rocprofv3] W20250710 21:33:52.291470 140318446199616 simple_timer.cpp:55] [rocprofv3] 'sample/vcopy -n 1048576 -b 256 -i 3' :: 0.000000 sec
|-> [rocprofv3] W20250710 21:33:52.492822 140318446199616 tool.cpp:2150] HSA version 8.18.0 initialized (instance=0)
|-> [rocprofv3] W20250710 21:33:52.684795 140318446199616 simple_timer.cpp:55] [rocprofv3] 'sample/vcopy -n 1048576 -b 256 -i 3' :: 0.393325 sec
|-> [rocprofv3] W20250710 21:33:52.700354 140318446199616 generateRocpd.cpp:580] writing SQL database for process 3617 on node 3224294684
|-> [rocprofv3] E20250710 21:33:52.701113 140318446199616 generateRocpd.cpp:603] Opened result file: /app/workloads/vcopy/MI300X_A1/out/pmc_1/7f6eaef84eaf/3617_results.db (UUID=00031de9-63e2-73e2-b1de-4720cbe72cf4)
|-> [rocprofv3] W20250710 21:33:52.757125 140318446199616 simple_timer.cpp:55] SQLite3 generation :: rocpd_string :: 0.010891 sec
|-> [rocprofv3] W20250710 21:33:52.757435 140318446199616 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_node :: 0.000290 sec
|-> [rocprofv3] W20250710 21:33:52.757948 140318446199616 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_process :: 0.000505 sec
|-> [rocprofv3] W20250710 21:33:52.771027 140318446199616 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_agent :: 0.012490 sec
|-> [rocprofv3] W20250710 21:33:53.432385 140318446199616 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_pmc :: 0.661342 sec
|-> [rocprofv3] W20250710 21:33:53.433363 140318446199616 simple_timer.cpp:55] SQLite3 generation :: rocpd kernel info :: 0.000947 sec
|-> [rocprofv3] W20250710 21:33:53.433374 140318446199616 simple_timer.cpp:55] SQLite3 generation :: rocpd_region :: 0.000002 sec
|-> [rocprofv3] W20250710 21:33:53.439104 140318446199616 simple_timer.cpp:55] SQLite3 generation :: rocpd_kernel_dispatch :: 0.005725 sec
|-> [rocprofv3] W20250710 21:33:53.450240 140318446199616 simple_timer.cpp:55] SQLite3 generation :: rocpd_pmc_event :: 0.011123 sec
|-> [rocprofv3] W20250710 21:33:53.450253 140318446199616 simple_timer.cpp:55] SQLite3 generation :: rocpd_memory_copy :: 0.000000 sec
|-> [rocprofv3] W20250710 21:33:53.450257 140318446199616 simple_timer.cpp:55] SQLite3 generation :: rocpd_memory_allocate :: 0.000001 sec
|-> [rocprofv3] W20250710 21:33:53.450440 140318446199616 simple_timer.cpp:55] SQLite3 generation :: SQL indexing :: 0.000181 sec
|-> [rocprofv3] W20250710 21:33:53.450806 140318446199616 simple_timer.cpp:55] SQLite3 generation :: total :: 0.750452 sec
|-> [rocprofv3] W20250710 21:33:53.454365 140318446199616 simple_timer.cpp:55] [rocprofv3] tool finalization :: 0.768517 sec
|-> [rocprofv3] vcopy testing on GCD 0
|-> [rocprofv3] Finished allocating vectors on the CPU
Finishing subprocess of fname /app/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_4.txt, the time it takes was 0 m 1.689640760421753 sec
The time of run_prof of /app/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_4.txt is 0 m 2.470271110534668 sec
[Run 11/12][Approximate profiling time left: 2 seconds]...
[profiling] Current input file: /app/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_5.txt
pmc file: pmc_perf_5.txt
Adding env var for counter definitions: ROCPROFILER_METRICS_PATH=/tmp/rocprof_counter_defs_i1z59usc
rocprof command: ['rocprofv3', '-A', 'absolute', '-i', '/app/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_5.txt', '-d', '/app/workloads/vcopy/MI300X_A1/out', '--kernel-trace', '--output-format', 'rocpd', '--', 'sample/vcopy', '-n', '1048576', '-b', '256', '-i', '3']
[subprocess] Running: rocprofv3 -A absolute -i /app/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_5.txt -d /app/workloads/vcopy/MI300X_A1/out --kernel-trace --output-format rocpd -- sample/vcopy -n 1048576 -b 256 -i 3
|-> [rocprofv3] W20250710 21:33:54.741603 140680114004800 simple_timer.cpp:55] [rocprofv3] tool initialization :: 0.103912 sec
|-> [rocprofv3] W20250710 21:33:54.741856 140680114004800 simple_timer.cpp:55] [rocprofv3] 'sample/vcopy -n 1048576 -b 256 -i 3' :: 0.000000 sec
|-> [rocprofv3] W20250710 21:33:54.934468 140680114004800 tool.cpp:2150] HSA version 8.18.0 initialized (instance=0)
|-> [rocprofv3] W20250710 21:33:55.127192 140680114004800 simple_timer.cpp:55] [rocprofv3] 'sample/vcopy -n 1048576 -b 256 -i 3' :: 0.385336 sec
|-> [rocprofv3] W20250710 21:33:55.142986 140680114004800 generateRocpd.cpp:580] writing SQL database for process 3624 on node 3224294684
|-> [rocprofv3] E20250710 21:33:55.143809 140680114004800 generateRocpd.cpp:603] Opened result file: /app/workloads/vcopy/MI300X_A1/out/pmc_1/7f6eaef84eaf/3624_results.db (UUID=00031de9-6d70-7d70-801f-957d2f8ca86a)
|-> [rocprofv3] W20250710 21:33:55.177382 140680114004800 simple_timer.cpp:55] SQLite3 generation :: rocpd_string :: 0.010768 sec
|-> [rocprofv3] W20250710 21:33:55.177741 140680114004800 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_node :: 0.000341 sec
|-> [rocprofv3] W20250710 21:33:55.178266 140680114004800 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_process :: 0.000516 sec
|-> [rocprofv3] W20250710 21:33:55.190617 140680114004800 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_agent :: 0.011831 sec
|-> [rocprofv3] W20250710 21:33:55.867254 140680114004800 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_pmc :: 0.676620 sec
|-> [rocprofv3] W20250710 21:33:55.868249 140680114004800 simple_timer.cpp:55] SQLite3 generation :: rocpd kernel info :: 0.000964 sec
|-> [rocprofv3] W20250710 21:33:55.868260 140680114004800 simple_timer.cpp:55] SQLite3 generation :: rocpd_region :: 0.000002 sec
|-> [rocprofv3] W20250710 21:33:55.874104 140680114004800 simple_timer.cpp:55] SQLite3 generation :: rocpd_kernel_dispatch :: 0.005839 sec
|-> [rocprofv3] W20250710 21:33:55.885401 140680114004800 simple_timer.cpp:55] SQLite3 generation :: rocpd_pmc_event :: 0.011285 sec
|-> [rocprofv3] W20250710 21:33:55.885413 140680114004800 simple_timer.cpp:55] SQLite3 generation :: rocpd_memory_copy :: 0.000000 sec
|-> [rocprofv3] W20250710 21:33:55.885417 140680114004800 simple_timer.cpp:55] SQLite3 generation :: rocpd_memory_allocate :: 0.000001 sec
|-> [rocprofv3] W20250710 21:33:55.885599 140680114004800 simple_timer.cpp:55] SQLite3 generation :: SQL indexing :: 0.000180 sec
|-> [rocprofv3] W20250710 21:33:55.885979 140680114004800 simple_timer.cpp:55] SQLite3 generation :: total :: 0.742993 sec
|-> [rocprofv3] W20250710 21:33:55.889649 140680114004800 simple_timer.cpp:55] [rocprofv3] tool finalization :: 0.761385 sec
|-> [rocprofv3] vcopy testing on GCD 0
|-> [rocprofv3] Finished allocating vectors on the CPU
Finishing subprocess of fname /app/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_5.txt, the time it takes was 0 m 1.6850097179412842 sec
The time of run_prof of /app/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_5.txt is 0 m 2.4484713077545166 sec
[Run 12/12][Approximate profiling time left: 0 seconds]...
[profiling] Current input file: /app/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_6.txt
pmc file: pmc_perf_6.txt
Adding env var for counter definitions: ROCPROFILER_METRICS_PATH=/tmp/rocprof_counter_defs_8km5kicc
rocprof command: ['rocprofv3', '-A', 'absolute', '-i', '/app/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_6.txt', '-d', '/app/workloads/vcopy/MI300X_A1/out', '--kernel-trace', '--output-format', 'rocpd', '--', 'sample/vcopy', '-n', '1048576', '-b', '256', '-i', '3']
[subprocess] Running: rocprofv3 -A absolute -i /app/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_6.txt -d /app/workloads/vcopy/MI300X_A1/out --kernel-trace --output-format rocpd -- sample/vcopy -n 1048576 -b 256 -i 3
|-> [rocprofv3] W20250710 21:33:57.212530 139907727749952 simple_timer.cpp:55] [rocprofv3] tool initialization :: 0.103224 sec
|-> [rocprofv3] W20250710 21:33:57.212781 139907727749952 simple_timer.cpp:55] [rocprofv3] 'sample/vcopy -n 1048576 -b 256 -i 3' :: 0.000000 sec
|-> [rocprofv3] W20250710 21:33:57.411536 139907727749952 tool.cpp:2150] HSA version 8.18.0 initialized (instance=0)
|-> [rocprofv3] W20250710 21:33:57.602202 139907727749952 simple_timer.cpp:55] [rocprofv3] 'sample/vcopy -n 1048576 -b 256 -i 3' :: 0.389421 sec
|-> [rocprofv3] W20250710 21:33:57.617817 139907727749952 generateRocpd.cpp:580] writing SQL database for process 3631 on node 3224294684
|-> [rocprofv3] E20250710 21:33:57.618592 139907727749952 generateRocpd.cpp:603] Opened result file: /app/workloads/vcopy/MI300X_A1/out/pmc_1/7f6eaef84eaf/3631_results.db (UUID=00031de9-771a-771a-95de-02702fbe6904)
|-> [rocprofv3] W20250710 21:33:57.682260 139907727749952 simple_timer.cpp:55] SQLite3 generation :: rocpd_string :: 0.010946 sec
|-> [rocprofv3] W20250710 21:33:57.682562 139907727749952 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_node :: 0.000283 sec
|-> [rocprofv3] W20250710 21:33:57.683093 139907727749952 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_process :: 0.000521 sec
|-> [rocprofv3] W20250710 21:33:57.695274 139907727749952 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_agent :: 0.011651 sec
|-> [rocprofv3] W20250710 21:33:58.358124 139907727749952 simple_timer.cpp:55] SQLite3 generation :: rocpd_info_pmc :: 0.662833 sec
|-> [rocprofv3] W20250710 21:33:58.358983 139907727749952 simple_timer.cpp:55] SQLite3 generation :: rocpd kernel info :: 0.000831 sec
|-> [rocprofv3] W20250710 21:33:58.358994 139907727749952 simple_timer.cpp:55] SQLite3 generation :: rocpd_region :: 0.000002 sec
|-> [rocprofv3] W20250710 21:33:58.364913 139907727749952 simple_timer.cpp:55] SQLite3 generation :: rocpd_kernel_dispatch :: 0.005914 sec
|-> [rocprofv3] W20250710 21:33:58.372546 139907727749952 simple_timer.cpp:55] SQLite3 generation :: rocpd_pmc_event :: 0.007617 sec
|-> [rocprofv3] W20250710 21:33:58.372562 139907727749952 simple_timer.cpp:55] SQLite3 generation :: rocpd_memory_copy :: 0.000000 sec
|-> [rocprofv3] W20250710 21:33:58.372566 139907727749952 simple_timer.cpp:55] SQLite3 generation :: rocpd_memory_allocate :: 0.000001 sec
|-> [rocprofv3] W20250710 21:33:58.372750 139907727749952 simple_timer.cpp:55] SQLite3 generation :: SQL indexing :: 0.000182 sec
|-> [rocprofv3] W20250710 21:33:58.373145 139907727749952 simple_timer.cpp:55] SQLite3 generation :: total :: 0.755328 sec
|-> [rocprofv3] W20250710 21:33:58.376827 139907727749952 simple_timer.cpp:55] [rocprofv3] tool finalization :: 0.773574 sec
|-> [rocprofv3] vcopy testing on GCD 0
|-> [rocprofv3] Finished allocating vectors on the CPU
Finishing subprocess of fname /app/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_6.txt, the time it takes was 0 m 1.6893463134765625 sec
The time of run_prof of /app/workloads/vcopy/MI300X_A1/perfmon/pmc_perf_6.txt is 0 m 2.4789364337921143 sec
finished "run_profiling" and finished rocprof's workload, time taken was 0 m 34.34300780296326 sec
[profiling] performing post-processing using rocprofv3 profiler
Created file: /app/workloads/vcopy/MI300X_A1/pmc_perf.csv
Deleted file: /app/workloads/vcopy/MI300X_A1/results_pmc_perf_6.csv
Deleted file: /app/workloads/vcopy/MI300X_A1/results_SQ_IFETCH_LEVEL.csv
Deleted file: /app/workloads/vcopy/MI300X_A1/results_pmc_perf_2.csv
Deleted file: /app/workloads/vcopy/MI300X_A1/results_SQ_INST_LEVEL_VMEM.csv
Deleted file: /app/workloads/vcopy/MI300X_A1/results_SQ_INST_LEVEL_LDS.csv
Deleted file: /app/workloads/vcopy/MI300X_A1/results_SQ_INST_LEVEL_SMEM.csv
Deleted file: /app/workloads/vcopy/MI300X_A1/results_pmc_perf_5.csv
Deleted file: /app/workloads/vcopy/MI300X_A1/results_pmc_perf_3.csv
Deleted file: /app/workloads/vcopy/MI300X_A1/results_pmc_perf_0.csv
Deleted file: /app/workloads/vcopy/MI300X_A1/results_pmc_perf_4.csv
Deleted file: /app/workloads/vcopy/MI300X_A1/results_pmc_perf_1.csv
Deleted file: /app/workloads/vcopy/MI300X_A1/results_SQ_LEVEL_WAVES.csv
time taken for "post_processing" was 0 seconds
[profiling] perform SoC post processing for gfx942
[roofline] Skipping roofline
File diff soppresso perché una o più righe sono troppo lunghe
File diff soppresso perché una o più righe sono troppo lunghe
File diff soppresso perché una o più righe sono troppo lunghe
@@ -0,0 +1,5 @@
pmc: SQ_INST_LEVEL_VMEM SQ_INST_LEVEL_VMEM_ACCUM SQ_LDS_UNALIGNED_STALL SQ_INSTS_VALU_MFMA_F32 SQ_INSTS SQ_WAIT_ANY SQC_ICACHE_MISSES SQ_INSTS_VALU_MFMA_BF16 TA_TOTAL_WAVEFRONTS_sum TA_BUFFER_READ_WAVEFRONTS_sum TCP_TCC_RW_READ_REQ_sum TCP_UTCL1_TRANSLATION_MISS_sum TCP_TCC_CC_READ_REQ_sum TCP_UTCL1_PERMISSION_MISS_sum TCC_EA0_RDREQ_LEVEL_sum TCC_ALL_TC_OP_INV_EVICT_sum TCC_EA0_WRREQ_64B_sum TCC_MISS_sum CPC_ME1_DC0_SPI_BUSY CPC_CPC_STAT_BUSY
gpu:
range:
kernel:
@@ -0,0 +1,10 @@
rocprofiler-sdk:
counters-schema-version: 1
counters:
- name: SQ_INST_LEVEL_VMEM_ACCUM
description: SQ_INST_LEVEL_VMEM accumulation
properties: []
definitions:
- architectures:
- gfx942
expression: accumulate(SQ_INST_LEVEL_VMEM, HIGH_RES)
@@ -0,0 +1,5 @@
pmc: SQ_LEVEL_WAVES SQ_LEVEL_WAVES_ACCUM SQ_INSTS_VMEM SQC_TC_DATA_READ_REQ SQ_INSTS_VALU_MFMA_F64 SQ_INSTS_VALU_MFMA_F16 SQ_ACTIVE_INST_FLAT SQC_DCACHE_HITS TA_BUFFER_ATOMIC_WAVEFRONTS_sum TA_ADDR_STALLED_BY_TC_CYCLES_sum TD_SPI_STALL_sum TCP_TCC_UC_ATOMIC_REQ_sum TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum TCP_TOTAL_READ_sum TCP_TOTAL_CACHE_ACCESSES_sum TCC_WRITE[99] TCC_REQ[28] TCC_REQ[31] TCC_WRITE[100] TCC_WRITE[112] TCC_WRITE[75] TCC_WRITE[114] TCC_EA0_WRREQ_DRAM_sum TCC_REQ[9] TCC_WRITE[39] TCC_REQ[122] TCC_WRITE[76] TCC_NORMAL_EVICT_sum TCC_WRITE[66] TCC_REQ[90] TCC_WRITE[61] TCC_REQ[45] TCC_WRITE[74] TCC_WRITE[108] TCC_REQ[76] TCC_WRITE[85] TCC_WRITE[111] TCC_REQ[46] TCC_WRITE[33] TCC_WRITE[0] TCC_WRITE[113] TCC_WRITE[86] TCC_REQ[37] TCC_REQ[77] TCC_REQ[13] TCC_WRITE[55] TCC_WRITE[101] TCC_REQ[27] TCC_REQ[89] TCC_WRITE[126] TCC_WRITE[34] TCC_REQ[112] TCC_WRITE[105] TCC_WRITE[2] TCC_REQ[39] TCC_WRITE[31] TCC_REQ[18] TCC_REQ[120] TCC_WRITE[15] TCC_WRITE[77] TCC_WRITE[68] TCC_REQ[51] TCC_WRITE[115] TCC_WRITE[63] TCC_REQ[23] TCC_REQ[32] TCC_REQ[85] TCC_REQ[95] TCC_WRITE[8] TCC_WRITE[98] TCC_WRITE[53] TCC_WRITE[51] TCC_WRITE[67] TCC_WRITE[1] TCC_REQ[110] TCC_REQ[65] TCC_WRITE[7] TCC_REQ[107] TCC_WRITE[89] TCC_WRITE[123] TCC_WRITE[23] TCC_REQ[116] TCC_WRITE[110] TCC_REQ[36] TCC_WRITE[107] TCC_REQ[74] TCC_REQ[12] TCC_WRITE[124] TCC_REQ[71] TCC_WRITE[4] TCC_WRITE[71] TCC_WRITE[103] TCC_WRITE[93] TCC_REQ[6] TCC_WRITE[6] TCC_REQ[96] TCC_REQ[99] TCC_WRITE[37] TCC_WRITE[22] TCC_WRITE[12] TCC_WRITE[122] TCC_REQ[127] TCC_REQ[68] TCC_WRITE[82] TCC_REQ[30] TCC_WRITE[45] TCC_REQ[38] TCC_WRITE[11] TCC_REQ[35] TCC_REQ[126] TCC_REQ[69] TCC_REQ[98] TCC_WRITE[3] TCC_WRITE[38] TCC_REQ[15] TCC_WRITE[49] TCC_WRITE[72] TCC_WRITE[116] TCC_REQ[59] TCC_WRITE[47] TCC_REQ[40] TCC_REQ[78] TCC_WRITE[70] TCC_WRITE[88] TCC_WRITE[17] TCC_WRITE[48] TCC_REQ[123] TCC_WRITE[102] TCC_REQ[53] TCC_REQ[19] TCC_REQ[108] TCC_REQ[11] TCC_REQ[64] TCC_WRITE[95] TCC_REQ[58] TCC_REQ[106] TCC_WRITE[81] 
TCC_WRITE[32] TCC_REQ[117] TCC_REQ[24] TCC_REQ[82] TCC_WRITE[65] TCC_WRITE[24] TCC_WRITE[19] TCC_WRITE[60] TCC_WRITE[58] TCC_REQ[66] TCC_WRITE[30] TCC_WRITE[118] TCC_REQ[80] TCC_REQ[94] TCC_REQ[20] TCC_REQ[102] TCC_WRITE[5] TCC_REQ[33] TCC_REQ[3] TCC_WRITE[25] TCC_REQ[118] TCC_REQ[52] TCC_REQ[17] TCC_REQ[8] TCC_WRITE[56] TCC_REQ[125] TCC_WRITE[97] TCC_WRITE[90] TCC_WRITE[20] TCC_WRITE[64] TCC_REQ[54] TCC_REQ[4] TCC_WRITE[84] TCC_REQ[29] TCC_REQ[92] TCC_REQ[101] TCC_WRITE[13] TCC_WRITE[96] TCC_REQ[7] TCC_WRITE[80] TCC_WRITE[46] TCC_REQ[62] TCC_WRITE[83] TCC_REQ[49] TCC_REQ[88] TCC_WRITE[104] TCC_REQ[84] TCC_REQ[41] TCC_REQ[44] TCC_REQ[60] TCC_REQ[119] TCC_REQ[0] TCC_REQ[55] TCC_REQ[26] TCC_REQ[81] TCC_WRITE[59] TCC_WRITE[42] TCC_WRITE[119] TCC_REQ[109] TCC_WRITE[50] TCC_WRITE[121] TCC_WRITE[18] TCC_REQ[70] TCC_WRITE[125] TCC_REQ[48] TCC_REQ[16] TCC_REQ[72] TCC_REQ[91] TCC_REQ[22] TCC_WRITE[40] TCC_REQ[75] TCC_WRITE[79] TCC_REQ[93] TCC_WRITE[78] TCC_WRITE[92] TCC_REQ[25] TCC_WRITE[9] TCC_REQ[43] TCC_WRITE[28] TCC_REQ[124] TCC_WRITE[52] TCC_WRITE[27] TCC_WRITE[87] TCC_WRITE[29] TCC_WRITE[73] TCC_WRITE[44] TCC_WRITE[41] TCC_REQ[73] TCC_REQ[111] TCC_REQ[87] TCC_REQ[86] TCC_REQ[5] TCC_WRITE[120] TCC_REQ[114] TCC_REQ[1] TCC_REQ[67] TCC_REQ[42] TCC_WRITE[10] TCC_REQ[2] TCC_WRITE[62] TCC_WRITE[106] TCC_REQ[97] TCC_REQ[57] TCC_WRITE[54] TCC_WRITE[117] TCC_WRITE[109] TCC_WRITE[43] TCC_REQ[115] TCC_REQ[100] TCC_REQ[121] TCC_WRITE[127] TCC_REQ[10] TCC_REQ[61] TCC_REQ[50] TCC_REQ[56] TCC_WRITE[94] TCC_REQ[103] TCC_REQ[21] TCC_WRITE[36] TCC_REQ[47] TCC_WRITE[26] TCC_REQ[79] TCC_WRITE[14] TCC_WRITE[91] TCC_WRITE[35] TCC_WRITE[21] TCC_WRITE[69] TCC_REQ[34] TCC_REQ[14] TCC_WRITE[57] TCC_REQ[105] TCC_REQ[83] TCC_REQ[113] TCC_REQ[63] TCC_REQ[104] TCC_WRITE[16] CPC_CPC_STAT_IDLE CPC_UTCL1_STALL_ON_TRANSLATION CPF_CPF_TCIU_IDLE
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQ_ACTIVE_INST_VMEM SQC_DCACHE_REQ_READ_1 SQ_ACTIVE_INST_ANY SQC_TC_DATA_ATOMIC_REQ SQ_INSTS_LDS SQ_LDS_IDX_ACTIVE SQ_LDS_MEM_VIOLATIONS SQ_INSTS_SENDMSG TA_BUFFER_COALESCED_READ_CYCLES_sum TA_FLAT_WRITE_WAVEFRONTS_sum TCP_GATE_EN2_sum TCP_TCC_WRITE_REQ_sum TCP_UTCL1_REQUEST_sum TCP_TCC_ATOMIC_WITH_RET_REQ_sum TCC_TOO_MANY_EA_WRREQS_STALL_sum TCC_HIT_sum TCC_NORMAL_WRITEBACK_sum TCC_UC_REQ_sum
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQ_INSTS_VALU_INT64 SQ_INSTS_SALU SQ_INSTS_VALU_MFMA_F8 SQ_INSTS_VALU_MUL_F64 SQ_WAVES SQ_WAVES_RESTORED SQC_DCACHE_MISSES_DUPLICATE SQ_INSTS_VSKIPPED TA_FLAT_READ_WAVEFRONTS_sum TA_BUFFER_WAVEFRONTS_sum TCP_TOTAL_WRITEBACK_INVALIDATES_sum TCP_TOTAL_ACCESSES_sum TCP_TCR_TCP_STALL_CYCLES_sum TCP_TCC_NC_WRITE_REQ_sum TCC_EA0_RDREQ_sum TCC_WRITEBACK_sum TCC_READ_sum TCC_EA0_RD_UNCACHED_32B_sum
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQC_TC_INST_REQ SQ_ACTIVE_INST_LDS SQC_DCACHE_MISSES SQ_BUSY_CU_CYCLES SQ_INSTS_VALU_ADD_F32 SQ_INSTS_VALU_MFMA_I8 SQ_LDS_ADDR_CONFLICT SQ_IFETCH TA_DATA_STALLED_BY_TC_CYCLES_sum TA_TA_BUSY_sum TCP_TCC_RW_ATOMIC_REQ_sum TCP_GATE_EN1_sum TCP_TA_TCP_STATE_READ_sum TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum TCC_BUSY_sum TCC_BUBBLE_sum TCC_WRITE_sum TCC_STREAMING_REQ_sum
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQ_INSTS_VALU_FMA_F64 SQ_INSTS_VALU_ADD_F16 SQ_INSTS_VALU_MUL_F32 SQ_INSTS_VALU_MFMA_MOPS_BF16 SQC_DCACHE_REQ_READ_8 SQC_ICACHE_HITS SQ_ACTIVE_INST_VALU SQ_INSTS_VALU_FMA_F16 TCP_TCC_CC_ATOMIC_REQ_sum TCP_TCC_CC_WRITE_REQ_sum TCP_TOTAL_WRITE_sum TCC_EA0_RDREQ_32B_sum TCC_EA0_WR_UNCACHED_32B_sum TCC_NC_REQ_sum TCC_EA0_WRREQ_sum
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQ_WAIT_INST_ANY SQ_INSTS_GDS SQ_INSTS_VALU_ADD_F64 SQ_INSTS_VALU_MFMA_MOPS_F64 SQ_INSTS_VALU_MFMA_MOPS_F16 SQ_INSTS_VALU_INT32 SQC_DCACHE_REQ_READ_2 SQC_DCACHE_REQ TCC_EA0_ATOMIC_sum TCC_CC_REQ_sum TCC_ATOMIC_sum TCC_PROBE_sum
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQ_INSTS_MFMA SQ_INSTS_VALU_TRANS_F16 SQ_BUSY_CYCLES SQC_DCACHE_ATOMIC SQ_INSTS_VALU_CVT SQC_TC_DATA_WRITE_REQ SQ_INSTS_VALU_FMA_F32 SQ_INSTS_BRANCH TCC_EA0_ATOMIC_LEVEL_sum TCC_EA0_RDREQ_DRAM_sum TCC_REQ_sum TCC_ALL_TC_OP_WB_WRITEBACK_sum
gpu:
range:
kernel:
@@ -0,0 +1,5 @@
pmc: SQ_INSTS_VALU_MFMA_MOPS_F8 SQ_LDS_ATOMIC_RETURN SQ_INSTS_VALU_MFMA_MOPS_I8 TCC_RW_REQ_sum TCC_EA0_WRREQ_LEVEL_sum
gpu:
range:
kernel:
@@ -0,0 +1,40 @@
config_dir: /app/src/rocprof_compute_soc/analysis_configs
device: -1
dispatch: null
filter_blocks: {}
format_rocprof_output: rocpd
hip_trace: false
join_type: grid
kernel: null
kernel_names: false
kokkos_trace: false
list_metrics: null
loglevel: 10
mem_level: ALL
mode: profile
name: vcopy
no_roof: true
path: /app/workloads/vcopy/MI300X_A1
pc_sampling_interval: 1048576
pc_sampling_method: stochastic
quiet: false
remaining:
- --
- sample/vcopy
- -n
- '1048576'
- -b
- '256'
- -i
- '3'
retain_rocpd_output: false
rocprofiler_sdk_library_path: /opt/rocm/lib/librocprofiler-sdk.so
roof_only: false
roofline_data_type:
- FP32
sort: kernels
spatial_multiplexing: null
specs: false
subpath: gpu
target: null
verbose: 1
@@ -0,0 +1,2 @@
workload_name,command,ip_blocks,timestamp,version,hostname,cpu_model,sbios,linux_distro,linux_kernel_version,amd_gpu_kernel_version,cpu_memory,gpu_memory,rocm_version,vbios,compute_partition,memory_partition,gpu_series,gpu_model,gpu_arch,gpu_chip_id,gpu_l1,gpu_l2,cu_per_gpu,simd_per_cu,se_per_gpu,wave_size,workgroup_max_size,max_waves_per_cu,max_sclk,max_mclk,cur_sclk,cur_mclk,total_l2_chan,lds_banks_per_cu,sqc_per_gpu,pipes_per_gpu,num_xcd,num_hbm_channels
vcopy,sample/vcopy -n 1048576 -b 256 -i 3,SQ|LDS|SQC|TA|TD|TCP|TCC|SPI|CPC|CPF,Thu Jul 10 21:33:20 2025 (UTC),3,7f6eaef84eaf,AMD EPYC 9354 32-Core Processor,"American Megatrends International, LLC.1.8",Ubuntu 22.04.5 LTS,5.15.0-70-generic,,1584988420,,7.0.0,113-M3000100-103,SPX,NPS1,MI300,MI300X_A1,gfx942,29857,32,4096,304,4,32,64,1024,32,2100,,2100,,128,32,160,4,8,128
1 workload_name command ip_blocks timestamp version hostname cpu_model sbios linux_distro linux_kernel_version amd_gpu_kernel_version cpu_memory gpu_memory rocm_version vbios compute_partition memory_partition gpu_series gpu_model gpu_arch gpu_chip_id gpu_l1 gpu_l2 cu_per_gpu simd_per_cu se_per_gpu wave_size workgroup_max_size max_waves_per_cu max_sclk max_mclk cur_sclk cur_mclk total_l2_chan lds_banks_per_cu sqc_per_gpu pipes_per_gpu num_xcd num_hbm_channels
2 vcopy sample/vcopy -n 1048576 -b 256 -i 3 SQ|LDS|SQC|TA|TD|TCP|TCC|SPI|CPC|CPF Thu Jul 10 21:33:20 2025 (UTC) 3 7f6eaef84eaf AMD EPYC 9354 32-Core Processor American Megatrends International, LLC.1.8 Ubuntu 22.04.5 LTS 5.15.0-70-generic 1584988420 7.0.0 113-M3000100-103 SPX NPS1 MI300 MI300X_A1 gfx942 29857 32 4096 304 4 32 64 1024 32 2100 2100 128 32 160 4 8 128