Adding att v3 support (#84)
* Adding att v3 support * misc fix * bug fix * Python linting workflow and rules * fix regex * Adding temporary args * fix temporary args * fix format * remove att_perfcounters from test input * Review comments (#163) Co-authored-by: Giovanni Baraldi <gbaraldi@amd.com> * Revert "Review comments (#163)" This reverts commit 9ef0f8e5a4489d5581255e1b70ced2aef5c1c1d0. * Address review comments 2 * review changes * review comments * review * cmake alias * review * review * review * review * Enabling percounter in v3 script * review * formatting * formatting --------- Co-authored-by: Jonathan R. Madsen <jonathanrmadsen@gmail.com> Co-authored-by: Baraldi, Giovanni <Giovanni.Baraldi@amd.com> Co-authored-by: Giovanni Baraldi <gbaraldi@amd.com>
This commit is contained in:
committed by
GitHub
parent
72a27feb04
commit
d4a51e4102
@@ -0,0 +1,16 @@
|
||||
[flake8]
|
||||
max-line-length =
|
||||
90
|
||||
ignore =
|
||||
E203,
|
||||
E501,
|
||||
W503,
|
||||
W605,
|
||||
exclude =
|
||||
.git,
|
||||
__pycache__,
|
||||
scripts,
|
||||
external,
|
||||
tests,
|
||||
build,
|
||||
build-*
|
||||
@@ -0,0 +1,39 @@
|
||||
name: Python Linting
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
push:
|
||||
branches: [ amd-staging, amd-mainline ]
|
||||
paths:
|
||||
- '**.py'
|
||||
pull_request:
|
||||
paths:
|
||||
- '**.py'
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
linting:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: ['3.8', '3.10', '3.12']
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Set up Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
|
||||
- name: Lint with flake8
|
||||
run: |
|
||||
# stop the build if there are Python syntax errors or undefined names
|
||||
flake8 source --count --show-source --statistics --select=E9,F63,F7,F82
|
||||
# flake8 options are defined in .flake8
|
||||
flake8 source --count --show-source --statistics
|
||||
@@ -4,6 +4,7 @@ clang-tidy>=15.0.0,<19.0.0
|
||||
cmake>=3.21.0
|
||||
cmake-format
|
||||
dataclasses
|
||||
flake8
|
||||
numpy
|
||||
otf2
|
||||
pandas
|
||||
|
||||
+272
-2
@@ -26,6 +26,7 @@ import os
|
||||
import sys
|
||||
import argparse
|
||||
import subprocess
|
||||
import re
|
||||
|
||||
|
||||
class dotdict(dict):
|
||||
@@ -87,6 +88,95 @@ def strtobool(val):
|
||||
raise ValueError(f"invalid truth value {val} (type={val_type})")
|
||||
|
||||
|
||||
def search_path(path_list):
|
||||
supported_option = []
|
||||
lib_att_pattern = r"libatt_decoder_(trace|summary|debug|testing)\.so"
|
||||
file_list = []
|
||||
|
||||
for path in path_list:
|
||||
for root, dirs, files in os.walk(path, topdown=True):
|
||||
file_list.extend(files)
|
||||
break
|
||||
for itr in file_list:
|
||||
_match = re.match(lib_att_pattern, itr)
|
||||
if _match:
|
||||
lst = re.findall("trace|debug|summary|testing", itr)
|
||||
supported_option.extend(lst)
|
||||
return set(supported_option)
|
||||
|
||||
|
||||
def check_att_capability(args):
|
||||
|
||||
path = []
|
||||
ROCPROFV3_DIR = os.path.dirname(os.path.realpath(__file__))
|
||||
ROCM_DIR = os.path.dirname(ROCPROFV3_DIR)
|
||||
support_input = {}
|
||||
tmp_parser = argparse.ArgumentParser(add_help=False)
|
||||
tmp_parser.add_argument(
|
||||
"--att-library-path",
|
||||
default=os.environ.get(
|
||||
"ATT_LIBRARY_PATH", os.environ.get("LD_LIBRARY_PATH", None)
|
||||
),
|
||||
type=str,
|
||||
required=False,
|
||||
)
|
||||
|
||||
tmp_parser.add_argument(
|
||||
"-i",
|
||||
"--input",
|
||||
default=None,
|
||||
type=str,
|
||||
required=False,
|
||||
)
|
||||
|
||||
tmp_data = {}
|
||||
att_args, unparsed_args = tmp_parser.parse_known_args(args)
|
||||
tmp_keys = list(att_args.__dict__.keys())
|
||||
|
||||
for itr in tmp_keys:
|
||||
if has_set_attr(att_args, itr):
|
||||
tmp_data[itr] = getattr(att_args, itr)
|
||||
|
||||
data = dotdict(tmp_data)
|
||||
if data.input:
|
||||
# If index of a pass in input file is a key in the support_input dict, then that pass has att-library-path arg
|
||||
args_list = parse_input(data.input)
|
||||
for index, itr in enumerate(args_list):
|
||||
if itr.att_library_path:
|
||||
library_path = []
|
||||
if ":" in itr.att_library_path:
|
||||
library_path.extend(itr.att_library_path.split(":"))
|
||||
else:
|
||||
library_path.append(itr.att_library_path)
|
||||
support = search_path(library_path)
|
||||
# If the att-library-path in the input file for a pass is valid, then the value of index key in the dict, support_input, is updated to that valid path
|
||||
if support:
|
||||
support_input[index] = set(support)
|
||||
else:
|
||||
# If the att-library-path in the input file for a pass is invalid, then the value of index key in the dict, support_input, is empty
|
||||
support_input[index] = []
|
||||
if data.att_library_path:
|
||||
if ":" in data.att_library_path:
|
||||
path.extend(data.att_library_path.split(":"))
|
||||
else:
|
||||
path.append(data.att_library_path)
|
||||
else:
|
||||
path.append(f"{ROCM_DIR}/lib")
|
||||
path.append(f"{ROCM_DIR}/lib64")
|
||||
|
||||
support = search_path(set(path))
|
||||
if support:
|
||||
if len(path) == 1:
|
||||
os.environ["ATT_LIBRARY_PATH"] = path[0]
|
||||
os.environ["ROCPROF_ATT_LIBRARY_PATH"] = path[0]
|
||||
else:
|
||||
os.environ["ATT_LIBRARY_PATH"] = ":".join(path)
|
||||
os.environ["ROCPROF_ATT_LIBRARY_PATH"] = ":".join(path)
|
||||
return support, support_input
|
||||
|
||||
return None, support_input
|
||||
|
||||
|
||||
class booleanArgAction(argparse.Action):
|
||||
def __call__(self, parser, args, value, option_string=None):
|
||||
setattr(args, self.dest, strtobool(value))
|
||||
@@ -473,6 +563,14 @@ For MPI applications (or other job launchers such as SLURM), place rocprofv3 ins
|
||||
default=os.environ.get("ROCPROF_PRELOAD", "").split(":"),
|
||||
nargs="*",
|
||||
)
|
||||
|
||||
advanced_options.add_argument(
|
||||
"--att-library-path",
|
||||
default=os.environ.get(
|
||||
"ATT_LIBRARY_PATH", os.environ.get("LD_LIBRARY_PATH", None)
|
||||
),
|
||||
help="ATT library path to find decoder library",
|
||||
)
|
||||
# below is available for CI because LD_PRELOADing a library linked to a sanitizer library
|
||||
# causes issues in apps where HIP is part of shared library.
|
||||
add_parser_bool_argument(
|
||||
@@ -494,7 +592,66 @@ For MPI applications (or other job launchers such as SLURM), place rocprofv3 ins
|
||||
app_args = args[(idx + 1) :]
|
||||
break
|
||||
|
||||
return (parser.parse_args(rocp_args), app_args)
|
||||
supported_list, is_support_input = check_att_capability(rocp_args)
|
||||
if supported_list or len(is_support_input) != 0:
|
||||
choice_list = []
|
||||
for keys, values in is_support_input.items():
|
||||
choice_list.extend(values)
|
||||
if supported_list:
|
||||
choice_list.extend(list(supported_list))
|
||||
|
||||
att_options = parser.add_argument_group("Advanced Thread Trace")
|
||||
|
||||
add_parser_bool_argument(
|
||||
att_options,
|
||||
"--advanced-thread-trace",
|
||||
help="Enable ATT",
|
||||
)
|
||||
|
||||
att_options.add_argument(
|
||||
"--att-target-cu",
|
||||
help="ATT target compute unit",
|
||||
default=None,
|
||||
type=int,
|
||||
)
|
||||
|
||||
att_options.add_argument(
|
||||
"--att-simd-select",
|
||||
help="Select ATT SIMD",
|
||||
default=None,
|
||||
type=str,
|
||||
)
|
||||
|
||||
att_options.add_argument(
|
||||
"--att-buffer-size",
|
||||
help="Buffer Size",
|
||||
default=None,
|
||||
type=str,
|
||||
)
|
||||
|
||||
att_options.add_argument(
|
||||
"--att-shader-engine-mask",
|
||||
help="att shader engine mask",
|
||||
default=None,
|
||||
type=str,
|
||||
)
|
||||
|
||||
att_options.add_argument(
|
||||
"--att-parse",
|
||||
type=str.lower,
|
||||
default=None,
|
||||
help="Select ATT Parse method from the choices",
|
||||
choices=set(choice_list),
|
||||
)
|
||||
|
||||
add_parser_bool_argument(
|
||||
att_options,
|
||||
"--att-serialize-all",
|
||||
default=False,
|
||||
help="Serialize all kernels",
|
||||
)
|
||||
|
||||
return (parser.parse_args(rocp_args), app_args, supported_list, is_support_input)
|
||||
|
||||
|
||||
def parse_yaml(yaml_file):
|
||||
@@ -1048,6 +1205,77 @@ def run(app_args, args, **kwargs):
|
||||
update_env("ROCPROF_PC_SAMPLING_METHOD", args.pc_sampling_method)
|
||||
update_env("ROCPROF_PC_SAMPLING_INTERVAL", args.pc_sampling_interval)
|
||||
|
||||
if args.advanced_thread_trace:
|
||||
|
||||
def int_auto(num_str):
|
||||
if "0x" in num_str:
|
||||
return int(num_str, 16)
|
||||
else:
|
||||
return int(num_str, 10)
|
||||
|
||||
if args.pmc or (
|
||||
args.pc_sampling_beta_enabled
|
||||
or args.pc_sampling_unit
|
||||
or args.pc_sampling_method
|
||||
or args.pc_sampling_interval
|
||||
):
|
||||
fatal_error(
|
||||
"Advanced thread trace cannot be enabled with counter collection or pc sampling"
|
||||
)
|
||||
|
||||
if not args.att_parse:
|
||||
fatal_error("provide the parser choice")
|
||||
|
||||
update_env("ROCPROF_ADVANCED_THREAD_TRACE", True, overwrite=True)
|
||||
update_env("ROCPROF_ATT_CAPABILITY", args.att_parse, overwrite=True)
|
||||
|
||||
if args.att_target_cu:
|
||||
update_env("ROCPROF_ATT_PARAM_TARGET_CU", args.att_target_cu, overwrite=True)
|
||||
|
||||
if args.att_shader_engine_mask:
|
||||
update_env(
|
||||
"ROCPROF_ATT_PARAM_SHADER_ENGINE_MASK",
|
||||
int_auto(args.att_shader_engine_mask),
|
||||
overwrite=True,
|
||||
)
|
||||
if args.att_buffer_size:
|
||||
update_env(
|
||||
"ROCPROF_ATT_PARAM_BUFFER_SIZE",
|
||||
int_auto(args.att_buffer_size),
|
||||
overwrite=True,
|
||||
)
|
||||
if args.att_simd_select:
|
||||
update_env(
|
||||
"ROCPROF_ATT_PARAM_SIMD_SELECT",
|
||||
int_auto(args.att_simd_select),
|
||||
overwrite=True,
|
||||
)
|
||||
if args.att_serialize_all:
|
||||
update_env(
|
||||
"ROCPROF_ATT_PARAM_SERIALIZE_ALL",
|
||||
args.att_serialize_all,
|
||||
overwrite=True,
|
||||
)
|
||||
|
||||
if args.att_library_path:
|
||||
|
||||
update_env(
|
||||
"ROCPROF_ATT_LIBRARY_PATH",
|
||||
args.att_library_path,
|
||||
overwrite=True,
|
||||
)
|
||||
update_env(
|
||||
"ATT_LIBRARY_PATH",
|
||||
args.att_library_path,
|
||||
overwrite=True,
|
||||
)
|
||||
if args.att_percounters:
|
||||
update_env(
|
||||
"ROCPROF_ATT_PARAM_PERFCOUNTERS",
|
||||
" ".join(args.att_perfcounters),
|
||||
overwrite=True,
|
||||
)
|
||||
|
||||
if use_execv:
|
||||
# does not return
|
||||
os.execvpe(app_args[0], app_args, env=app_env)
|
||||
@@ -1061,9 +1289,45 @@ def run(app_args, args, **kwargs):
|
||||
return exit_code
|
||||
|
||||
|
||||
def check_att_path_parse_method(args, index, support_att_input, att_parse_supported):
|
||||
|
||||
if not att_parse_supported:
|
||||
if index not in support_att_input.keys():
|
||||
fatal_error(
|
||||
f"Advanced_thread_trace enabled but no decoder library found in cmdline/env paths and att_library_path not set for pass-{index + 1}"
|
||||
)
|
||||
elif not support_att_input[index]:
|
||||
fatal_error(
|
||||
f"Advanced_thread_trace enabled but no decoder library found in att_library_path for pass-{index + 1}"
|
||||
)
|
||||
else:
|
||||
if args.att_parse and args.att_parse not in support_att_input[index]:
|
||||
fatal_error(
|
||||
f"Advanced_thread_trace enabled but decoder library for requested parse method not found in att_library_path for pass-{index + 1}"
|
||||
)
|
||||
else:
|
||||
if index in support_att_input.keys() and not support_att_input[index]:
|
||||
fatal_error(
|
||||
f"Advanced_thread_trace enabled but no decoder library found in att_library_path for pass-{index + 1}"
|
||||
)
|
||||
|
||||
elif index not in support_att_input.keys():
|
||||
if args.att_parse and args.att_parse not in att_parse_supported:
|
||||
fatal_error(
|
||||
"Advanced_thread_trace enabled but decoder library for requested parse method not found"
|
||||
)
|
||||
else:
|
||||
if args.att_parse and args.att_parse not in support_att_input[index]:
|
||||
fatal_error(
|
||||
f"Advanced_thread_trace enabled but decoder library for requested parse method not found for pass-{index + 1}"
|
||||
)
|
||||
|
||||
|
||||
def main(argv=None):
|
||||
|
||||
cmd_args, app_args = parse_arguments(argv)
|
||||
# att_parse_supported is valid path for decoder in env or commandline arg
|
||||
# support_att_input is a dict, where key is a pass index with value being a valid decoder path
|
||||
cmd_args, app_args, att_parse_supported, support_att_input = parse_arguments(argv)
|
||||
inp_args = (
|
||||
parse_input(cmd_args.input) if getattr(cmd_args, "input") else [dotdict({})]
|
||||
)
|
||||
@@ -1073,10 +1337,16 @@ def main(argv=None):
|
||||
pass_idx = None
|
||||
if has_set_attr(args, "pmc") and len(args.pmc) > 0:
|
||||
pass_idx = 1
|
||||
if args.advanced_thread_trace:
|
||||
check_att_path_parse_method(args, 0, support_att_input, att_parse_supported)
|
||||
run(app_args, args, pass_id=pass_idx)
|
||||
else:
|
||||
for idx, itr in enumerate(inp_args):
|
||||
args = get_args(cmd_args, itr)
|
||||
if args.advanced_thread_trace:
|
||||
check_att_path_parse_method(
|
||||
args, idx, support_att_input, att_parse_supported
|
||||
)
|
||||
run(
|
||||
app_args,
|
||||
args,
|
||||
|
||||
@@ -23,7 +23,6 @@
|
||||
# THE SOFTWARE.
|
||||
|
||||
import ctypes
|
||||
import pathlib
|
||||
import os
|
||||
import io
|
||||
import csv
|
||||
|
||||
@@ -805,6 +805,11 @@ Properties
|
||||
- **``pc_sampling_interval``** *(integer)*: pc sampling interval.
|
||||
- **``pc-sampling-beta-enabled``** *(boolean)*: enable pc
|
||||
sampling support; beta version.
|
||||
- **``att_filenames``** *(object)*
|
||||
- **``key``** *(integer)*: Dispatch id.
|
||||
- **``value``** *(array)*: An array of ATT filenames.
|
||||
- **``code_object_snapshot_filenames``** *(array)*: Code
|
||||
object snapshot filename.
|
||||
|
||||
.. code-block:: shell
|
||||
|
||||
|
||||
@@ -567,6 +567,24 @@
|
||||
"pc_sample_comments": {
|
||||
"type": "array",
|
||||
"description": "Comments matching assembly instructions from pc_sample_instructions array. If debug symbols are available, comments provide instructions to source-line mapping. Otherwise, a comment is an empty string."
|
||||
},
|
||||
"att_filenames" : {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"key": {
|
||||
"type": "integer",
|
||||
"description": "Dispatch id."
|
||||
},
|
||||
"value": {
|
||||
"type": "array",
|
||||
"description": "An array of ATT filenames."
|
||||
}
|
||||
|
||||
}
|
||||
},
|
||||
"code_object_snapshot_filenames": {
|
||||
"type": "array",
|
||||
"description": "Code object snapshot filename"
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
@@ -25,6 +25,7 @@
|
||||
#include "statistics.hpp"
|
||||
#include "timestamps.hpp"
|
||||
|
||||
#include "lib/common/filesystem.hpp"
|
||||
#include "lib/common/string_entry.hpp"
|
||||
#include "lib/common/utility.hpp"
|
||||
|
||||
@@ -37,6 +38,7 @@ namespace rocprofiler
|
||||
{
|
||||
namespace tool
|
||||
{
|
||||
namespace fs = common::filesystem;
|
||||
json_output::json_output(const output_config& cfg,
|
||||
std::string_view filename,
|
||||
JSONOutputArchive::Options _opts)
|
||||
@@ -115,11 +117,18 @@ write_json(json_output& json_ar,
|
||||
json_ar(cereal::make_nvp("counters", tool_metadata.get_counter_info()));
|
||||
|
||||
{
|
||||
auto callback_name_info = tool_metadata.callback_names;
|
||||
auto buffer_name_info = tool_metadata.buffer_names;
|
||||
auto counter_dims = tool_metadata.get_counter_dimension_info();
|
||||
auto marker_msg_data = tool_metadata.marker_messages.get();
|
||||
|
||||
auto callback_name_info = tool_metadata.callback_names;
|
||||
auto buffer_name_info = tool_metadata.buffer_names;
|
||||
auto counter_dims = tool_metadata.get_counter_dimension_info();
|
||||
auto marker_msg_data = tool_metadata.marker_messages.get();
|
||||
auto code_object_load_info = tool_metadata.get_code_object_load_info();
|
||||
auto att_filenames = tool_metadata.get_att_filenames();
|
||||
auto code_object_snapshot_filenames = std::vector<std::string>{};
|
||||
code_object_snapshot_filenames.reserve(code_object_load_info.size());
|
||||
for(auto info : code_object_load_info)
|
||||
{
|
||||
code_object_snapshot_filenames.emplace_back(fs::path(info.name).filename());
|
||||
}
|
||||
json_ar.setNextName("strings");
|
||||
json_ar.startNode();
|
||||
json_ar(cereal::make_nvp("callback_records", callback_name_info));
|
||||
@@ -131,7 +140,8 @@ write_json(json_output& json_ar,
|
||||
json_ar(
|
||||
cereal::make_nvp("pc_sample_instructions", tool_metadata.get_pc_sample_instructions()));
|
||||
json_ar(cereal::make_nvp("pc_sample_comments", tool_metadata.get_pc_sample_comments()));
|
||||
|
||||
json_ar(cereal::make_nvp("att_filenames", att_filenames));
|
||||
json_ar(cereal::make_nvp("code_object_snapshot_filenames", code_object_snapshot_filenames));
|
||||
{
|
||||
auto _extern_corr_id_strings = std::map<size_t, std::string>{};
|
||||
if(cfg.kernel_rename)
|
||||
|
||||
@@ -22,12 +22,15 @@
|
||||
|
||||
#include "metadata.hpp"
|
||||
|
||||
#include "lib/common/filesystem.hpp"
|
||||
#include "lib/common/string_entry.hpp"
|
||||
#include "lib/output/agent_info.hpp"
|
||||
#include "lib/output/host_symbol_info.hpp"
|
||||
#include "lib/output/kernel_symbol_info.hpp"
|
||||
#include "lib/rocprofiler-sdk-att/att_lib_wrapper.hpp"
|
||||
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
#include <rocprofiler-sdk/cxx/details/tokenize.hpp>
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
@@ -36,6 +39,7 @@ namespace rocprofiler
|
||||
{
|
||||
namespace tool
|
||||
{
|
||||
namespace fs = common::filesystem;
|
||||
namespace
|
||||
{
|
||||
rocprofiler_status_t
|
||||
@@ -218,6 +222,34 @@ metadata::get_code_object(uint64_t code_obj_id) const
|
||||
});
|
||||
}
|
||||
|
||||
code_object_load_info_vec_t
|
||||
metadata::get_code_object_load_info() const
|
||||
{
|
||||
auto _data = code_object_load.rlock([](const auto& _data_v) {
|
||||
auto _info = std::vector<rocprofiler::att_wrapper::CodeobjLoadInfo>{};
|
||||
_info.reserve(_data_v.size());
|
||||
for(const auto& itr : _data_v)
|
||||
_info.emplace_back(itr);
|
||||
return _info;
|
||||
});
|
||||
|
||||
return _data;
|
||||
}
|
||||
|
||||
std::vector<std::string>
|
||||
metadata::get_att_filenames() const
|
||||
{
|
||||
auto data = std::vector<std::string>{};
|
||||
for(auto filenames : att_filenames)
|
||||
{
|
||||
for(auto file : filenames.second.second)
|
||||
{
|
||||
data.emplace_back(fs::path(file).filename());
|
||||
}
|
||||
}
|
||||
return data;
|
||||
}
|
||||
|
||||
const kernel_symbol_info*
|
||||
metadata::get_kernel_symbol(uint64_t kernel_id) const
|
||||
{
|
||||
|
||||
@@ -32,6 +32,7 @@
|
||||
#include "lib/common/demangle.hpp"
|
||||
#include "lib/common/logging.hpp"
|
||||
#include "lib/common/synchronized.hpp"
|
||||
#include "lib/rocprofiler-sdk-att/att_lib_wrapper.hpp"
|
||||
|
||||
#include <rocprofiler-sdk/agent.h>
|
||||
#include <rocprofiler-sdk/buffer_tracing.h>
|
||||
@@ -74,8 +75,11 @@ using marker_message_ordered_map_t = std::map<uint64_t, std::string>;
|
||||
using string_entry_map_t = std::unordered_map<size_t, std::unique_ptr<std::string>>;
|
||||
using counter_dimension_vec_t = std::vector<rocprofiler_record_dimension_info_t>;
|
||||
using external_corr_id_set_t = std::unordered_set<uint64_t>;
|
||||
using code_obj_decoder_t = rocprofiler::sdk::codeobj::disassembly::CodeobjAddressTranslate;
|
||||
using instruction_t = rocprofiler::sdk::codeobj::disassembly::Instruction;
|
||||
using code_obj_decoder_t = rocprofiler::sdk::codeobj::disassembly::CodeobjAddressTranslate;
|
||||
using instruction_t = rocprofiler::sdk::codeobj::disassembly::Instruction;
|
||||
using att_agent_filenames_t = std::pair<rocprofiler_agent_id_t, std::vector<std::string>>;
|
||||
using att_filenames_map_t = std::unordered_map<rocprofiler_dispatch_id_t, att_agent_filenames_t>;
|
||||
using code_object_load_info_vec_t = std::vector<rocprofiler::att_wrapper::CodeobjLoadInfo>;
|
||||
template <typename Tp>
|
||||
using synced_map = common::Synchronized<Tp, true>;
|
||||
|
||||
@@ -94,14 +98,16 @@ struct metadata
|
||||
agent_counter_info_map_t agent_counter_info = {};
|
||||
agent_pc_sample_config_info_map_t agent_pc_sample_config_info = {};
|
||||
|
||||
sdk::buffer_name_info buffer_names = {};
|
||||
sdk::callback_name_info callback_names = {};
|
||||
synced_map<code_object_data_map_t> code_objects = {};
|
||||
synced_map<kernel_symbol_data_map_t> kernel_symbols = {};
|
||||
synced_map<marker_message_map_t> marker_messages = {};
|
||||
synced_map<string_entry_map_t> string_entries = {};
|
||||
synced_map<external_corr_id_set_t> external_corr_ids = {};
|
||||
synced_map<host_function_info_map_t> host_functions = {};
|
||||
sdk::buffer_name_info buffer_names = {};
|
||||
sdk::callback_name_info callback_names = {};
|
||||
synced_map<code_object_data_map_t> code_objects = {};
|
||||
synced_map<kernel_symbol_data_map_t> kernel_symbols = {};
|
||||
synced_map<marker_message_map_t> marker_messages = {};
|
||||
synced_map<string_entry_map_t> string_entries = {};
|
||||
synced_map<external_corr_id_set_t> external_corr_ids = {};
|
||||
synced_map<host_function_info_map_t> host_functions = {};
|
||||
synced_map<code_object_load_info_vec_t> code_object_load = {};
|
||||
att_filenames_map_t att_filenames = {};
|
||||
|
||||
metadata() = default;
|
||||
metadata(inprocess);
|
||||
@@ -122,6 +128,7 @@ struct metadata
|
||||
const tool_counter_info* get_counter_info(rocprofiler_counter_id_t id) const;
|
||||
const counter_dimension_info_vec_t* get_counter_dimension_info(uint64_t instance_id) const;
|
||||
|
||||
std::vector<std::string> get_att_filenames() const;
|
||||
code_object_data_vec_t get_code_objects() const;
|
||||
kernel_symbol_data_vec_t get_kernel_symbols() const;
|
||||
host_function_data_vec_t get_host_symbols() const;
|
||||
@@ -135,6 +142,7 @@ struct metadata
|
||||
std::string_view get_comment(int64_t index) const { return instruction_comment.at(index); }
|
||||
int64_t get_instruction_index(rocprofiler_pc_t record);
|
||||
void add_decoder(rocprofiler_code_object_info_t* obj_data_v);
|
||||
code_object_load_info_vec_t get_code_object_load_info() const;
|
||||
|
||||
template <typename Tp>
|
||||
Tp get_marker_messages(Tp&&);
|
||||
|
||||
@@ -29,6 +29,7 @@ find_package(
|
||||
lib/cmake/amd_comgr)
|
||||
|
||||
add_library(rocprofiler-sdk-att-parser STATIC)
|
||||
add_library(rocprofiler-sdk::rocprofiler-sdk-att-parser ALIAS rocprofiler-sdk-att-parser)
|
||||
target_sources(rocprofiler-sdk-att-parser PRIVATE ${ATT_TOOL_SOURCE_FILES})
|
||||
|
||||
target_link_libraries(
|
||||
|
||||
@@ -86,8 +86,8 @@ get_shader_id(const std::string& name)
|
||||
auto run_pos = name.rfind('_');
|
||||
if(run_pos == std::string::npos) throw std::runtime_error("Invalid name");
|
||||
|
||||
std::string_view stripped = name.substr(0, run_pos);
|
||||
auto se_number_pos = stripped.rfind('_');
|
||||
std::string stripped = name.substr(0, run_pos);
|
||||
auto se_number_pos = stripped.rfind('_');
|
||||
if(se_number_pos == std::string::npos || se_number_pos + 1 >= stripped.size())
|
||||
throw std::runtime_error("Invalid name");
|
||||
|
||||
@@ -97,15 +97,11 @@ get_shader_id(const std::string& name)
|
||||
std::vector<tool_att_capability_t>
|
||||
query_att_decode_capability()
|
||||
{
|
||||
std::vector<tool_att_capability_t> ret;
|
||||
auto ret = std::vector<tool_att_capability_t>{};
|
||||
|
||||
for(auto& [cap, libname] : get_lib_names())
|
||||
{
|
||||
if(auto handle = dlopen(libname, RTLD_NOW | RTLD_LOCAL))
|
||||
{
|
||||
dlclose(handle);
|
||||
ret.push_back(cap);
|
||||
}
|
||||
if(DL(libname).handle != 0) ret.push_back(cap);
|
||||
}
|
||||
|
||||
return ret;
|
||||
|
||||
@@ -20,12 +20,19 @@
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
#include "dl.hpp"
|
||||
#include "lib/rocprofiler-sdk-att/dl.hpp"
|
||||
#include "lib/common/environment.hpp"
|
||||
#include "lib/common/filesystem.hpp"
|
||||
#include "lib/common/logging.hpp"
|
||||
|
||||
#include <rocprofiler-sdk/cxx/details/tokenize.hpp>
|
||||
|
||||
#include <dlfcn.h>
|
||||
#include <atomic>
|
||||
#include <cassert>
|
||||
#include <cstdlib>
|
||||
#include <mutex>
|
||||
#include <set>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
|
||||
@@ -33,9 +40,26 @@ namespace rocprofiler
|
||||
{
|
||||
namespace att_wrapper
|
||||
{
|
||||
DL::DL(const char* dlname)
|
||||
namespace fs = ::rocprofiler::common::filesystem;
|
||||
|
||||
fs::path
|
||||
get_search_path(std::string path_name)
|
||||
{
|
||||
handle = dlopen(dlname, RTLD_NOW | RTLD_LOCAL);
|
||||
if(fs::exists(path_name)) return fs::path(path_name);
|
||||
return "";
|
||||
}
|
||||
|
||||
DL::DL(const char* libname)
|
||||
{
|
||||
auto paths = rocprofiler::common::get_env("ROCPROF_ATT_LIBRARY_PATH", "");
|
||||
if(paths.empty()) return;
|
||||
auto path_set = rocprofiler::sdk::parse::tokenize(paths, ":");
|
||||
|
||||
for(auto&& name : path_set)
|
||||
{
|
||||
handle = dlopen((get_search_path(name) / libname).string().c_str(), RTLD_LAZY | RTLD_LOCAL);
|
||||
if(handle) break;
|
||||
}
|
||||
if(!handle) return;
|
||||
|
||||
att_parse_data_fn =
|
||||
|
||||
@@ -35,7 +35,7 @@ class DL
|
||||
using StatusFn = decltype(rocprofiler_att_decoder_get_status_string);
|
||||
|
||||
public:
|
||||
DL(const char* dlname);
|
||||
DL(const char* libname);
|
||||
~DL();
|
||||
|
||||
ParseFn* att_parse_data_fn = nullptr;
|
||||
|
||||
@@ -6,14 +6,15 @@ project(rocprofiler-att-parser-tests LANGUAGES CXX)
|
||||
add_executable(att-parser-tool-v3)
|
||||
target_link_libraries(
|
||||
att-parser-tool-v3
|
||||
PRIVATE rocprofiler-sdk-att-parser rocprofiler-sdk::rocprofiler-sdk-json
|
||||
PRIVATE rocprofiler-sdk::rocprofiler-sdk-att-parser
|
||||
rocprofiler-sdk::rocprofiler-sdk-json
|
||||
rocprofiler-sdk::rocprofiler-sdk-common-library)
|
||||
target_sources(att-parser-tool-v3 PRIVATE standalone_tool_main.cpp)
|
||||
|
||||
add_executable(att-decoder-test)
|
||||
target_link_libraries(
|
||||
att-decoder-test
|
||||
PRIVATE rocprofiler-sdk-att-parser
|
||||
PRIVATE rocprofiler-sdk::rocprofiler-sdk-att-parser
|
||||
rocprofiler-sdk::rocprofiler-sdk-json
|
||||
rocprofiler-sdk::rocprofiler-sdk-common-library
|
||||
rocprofiler-sdk::rocprofiler-sdk-glog
|
||||
@@ -22,6 +23,7 @@ target_link_libraries(
|
||||
GTest::gtest_main)
|
||||
target_sources(att-decoder-test PRIVATE att_decoder_test.cpp)
|
||||
|
||||
set(env-att-lib "ROCPROF_ATT_LIBRARY_PATH=${CMAKE_CURRENT_BINARY_DIR}/../lib")
|
||||
add_library(att_decoder_testing SHARED)
|
||||
target_sources(att_decoder_testing PRIVATE dummy_decoder.cpp)
|
||||
|
||||
@@ -31,6 +33,7 @@ if(NOT ROCPROFILER_MEMCHECK)
|
||||
SOURCES att_decoder_test.cpp
|
||||
TEST_LIST att-decoder-test_TESTS
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
|
||||
set_tests_properties(${att-decoder-test_TESTS} PROPERTIES TIMEOUT 10 LABELS
|
||||
"unittests")
|
||||
set_tests_properties(
|
||||
${att-decoder-test_TESTS} PROPERTIES ENVIRONMENT "${env-att-lib}" TIMEOUT 10
|
||||
LABELS "unittests")
|
||||
endif()
|
||||
|
||||
@@ -6,7 +6,7 @@ project(rocprofiler-att-parser-waitcnt-tests LANGUAGES CXX)
|
||||
add_executable(att-decoder-waitcnt-test)
|
||||
target_link_libraries(
|
||||
att-decoder-waitcnt-test
|
||||
PRIVATE rocprofiler-sdk-att-parser
|
||||
PRIVATE rocprofiler-sdk::rocprofiler-sdk-att-parser
|
||||
rocprofiler-sdk::rocprofiler-sdk-json
|
||||
rocprofiler-sdk::rocprofiler-sdk-common-library
|
||||
rocprofiler-sdk::rocprofiler-sdk-glog
|
||||
|
||||
@@ -23,7 +23,8 @@ target_link_libraries(
|
||||
rocprofiler-sdk::rocprofiler-sdk-perfetto
|
||||
rocprofiler-sdk::rocprofiler-sdk-otf2
|
||||
rocprofiler-sdk::rocprofiler-sdk-dw
|
||||
rocprofiler-sdk::rocprofiler-sdk-amd-comgr)
|
||||
rocprofiler-sdk::rocprofiler-sdk-amd-comgr
|
||||
rocprofiler-sdk::rocprofiler-sdk-att-parser)
|
||||
|
||||
set_target_properties(
|
||||
rocprofiler-sdk-tool
|
||||
|
||||
@@ -144,6 +144,68 @@ get_kernel_filter_range(const std::string& kernel_filter)
|
||||
return range_set;
|
||||
}
|
||||
|
||||
std::vector<att_perfcounter>
|
||||
parse_att_counters(std::string line)
|
||||
{
|
||||
auto counters = std::vector<att_perfcounter>{};
|
||||
|
||||
if(line.empty()) return counters;
|
||||
|
||||
// strip the comment
|
||||
if(auto pos = line.find('#'); pos != std::string::npos) line = line.substr(0, pos);
|
||||
|
||||
// trim line for any white spaces after comment strip
|
||||
trim(line);
|
||||
|
||||
// check to see if comment stripping + trim resulted in empty line
|
||||
if(line.empty()) return counters;
|
||||
|
||||
handle_special_chars(line);
|
||||
|
||||
auto extract_counter_name_and_simd_mask = [](std::string& input) {
|
||||
std::string counter_name = "";
|
||||
auto ret = att_perfcounter{};
|
||||
|
||||
size_t pos = input.find(':');
|
||||
|
||||
if(pos != std::string::npos)
|
||||
{
|
||||
ret.counter_name = input.substr(0, pos);
|
||||
ret.simd_mask = std::stoi(input.substr(pos + 1), nullptr, 16);
|
||||
}
|
||||
else
|
||||
counter_name = input;
|
||||
return ret;
|
||||
};
|
||||
|
||||
// regex to check if string is of the form "counter_name:simd_mask"
|
||||
std::regex pattern(R"([a-zA-Z0-9_]+(:0x[0-9a-fA-F]+)?)");
|
||||
std::set<std::string> unique_counters;
|
||||
|
||||
auto input_ss = std::stringstream{line};
|
||||
while(true)
|
||||
{
|
||||
auto counter = std::string{};
|
||||
input_ss >> counter;
|
||||
if(counter.empty()) break;
|
||||
|
||||
// check if the counter string matches the pattern
|
||||
if(!std::regex_match(counter, pattern))
|
||||
{
|
||||
ROCP_FATAL << "Invalid counter format for ATT: " << counter
|
||||
<< ". Expected format : Counter_name:optional_simd_mask(hexadecimal)";
|
||||
}
|
||||
|
||||
// Consider only those counters where combination of counter name and simd mask is unique
|
||||
if(unique_counters.insert(counter).second == false) continue;
|
||||
|
||||
auto res = extract_counter_name_and_simd_mask(counter);
|
||||
counters.emplace_back(res);
|
||||
}
|
||||
|
||||
return counters;
|
||||
}
|
||||
|
||||
std::set<std::string>
|
||||
parse_counters(std::string line)
|
||||
{
|
||||
@@ -193,6 +255,8 @@ config::config()
|
||||
, kernel_filter_range{get_kernel_filter_range(
|
||||
get_env("ROCPROF_KERNEL_FILTER_RANGE", std::string{}))}
|
||||
, counters{parse_counters(get_env("ROCPROF_COUNTERS", std::string{}))}
|
||||
, att_param_perfcounters{
|
||||
parse_att_counters(get_env("ROCPROF_ATT_PARAM_PERFCOUNTERS", std::string{}))}
|
||||
{
|
||||
if(kernel_filter_include.empty()) kernel_filter_include = std::string{".*"};
|
||||
|
||||
|
||||
@@ -29,6 +29,7 @@
|
||||
#include "lib/common/units.hpp"
|
||||
#include "lib/output/format_path.hpp"
|
||||
#include "lib/output/output_config.hpp"
|
||||
#include "lib/rocprofiler-sdk-att/att_lib_wrapper.hpp"
|
||||
|
||||
#include <rocprofiler-sdk/cxx/serialization.hpp>
|
||||
|
||||
@@ -63,6 +64,12 @@ get_config();
|
||||
std::string
|
||||
format_name(std::string_view _name, const config& = get_config<>());
|
||||
|
||||
struct att_perfcounter
|
||||
{
|
||||
std::string counter_name = {};
|
||||
uint32_t simd_mask = 0xf;
|
||||
};
|
||||
|
||||
struct config : output_config
|
||||
{
|
||||
using base_type = output_config;
|
||||
@@ -101,13 +108,20 @@ struct config : output_config
|
||||
bool list_metrics = get_env("ROCPROF_LIST_METRICS", false);
|
||||
bool list_metrics_output_file = get_env("ROCPROF_OUTPUT_LIST_METRICS_FILE", false);
|
||||
bool pc_sampling_host_trap = false;
|
||||
bool advanced_thread_trace = get_env("ROCPROF_ADVANCED_THREAD_TRACE", false);
|
||||
size_t pc_sampling_interval = get_env("ROCPROF_PC_SAMPLING_INTERVAL", 1);
|
||||
bool att_serialize_all = get_env("ROCPROF_ATT_PARAM_SERIALIZE_ALL", false);
|
||||
rocprofiler_pc_sampling_method_t pc_sampling_method_value = ROCPROFILER_PC_SAMPLING_METHOD_NONE;
|
||||
rocprofiler_pc_sampling_unit_t pc_sampling_unit_value = ROCPROFILER_PC_SAMPLING_UNIT_NONE;
|
||||
|
||||
std::string stats_summary_unit = get_env("ROCPROF_STATS_SUMMARY_UNITS", "nsec");
|
||||
int mpi_size = get_mpi_size();
|
||||
int mpi_rank = get_mpi_rank();
|
||||
uint64_t att_param_shader_engine_mask =
|
||||
get_env<uint64_t>("ROCPROF_ATT_PARAM_SHADER_ENGINE_MASK", 0x1);
|
||||
uint64_t att_param_buffer_size = get_env<uint64_t>("ROCPROF_ATT_PARAM_BUFFER_SIZE", 0x6000000);
|
||||
uint64_t att_param_simd_select = get_env<uint64_t>("ROCPROF_ATT_PARAM_SIMD_SELECT", 0xF);
|
||||
uint64_t att_param_target_cu = get_env<uint64_t>("ROCPROF_ATT_PARAM_TARGET_CU", 1);
|
||||
|
||||
std::string kernel_filter_include = get_env("ROCPROF_KERNEL_FILTER_INCLUDE_REGEX", ".*");
|
||||
std::string kernel_filter_exclude = get_env("ROCPROF_KERNEL_FILTER_EXCLUDE_REGEX", "");
|
||||
@@ -115,8 +129,10 @@ struct config : output_config
|
||||
std::string pc_sampling_unit = get_env("ROCPROF_PC_SAMPLING_UNIT", "none");
|
||||
std::string extra_counters_contents = get_env("ROCPROF_EXTRA_COUNTERS_CONTENTS", "");
|
||||
|
||||
std::unordered_set<uint32_t> kernel_filter_range = {};
|
||||
std::set<std::string> counters = {};
|
||||
std::unordered_set<uint32_t> kernel_filter_range = {};
|
||||
std::set<std::string> counters = {};
|
||||
std::string att_capability = get_env("ROCPROF_ATT_CAPABILITY", "");
|
||||
std::vector<att_perfcounter> att_param_perfcounters = {};
|
||||
|
||||
std::queue<CollectionPeriod> collection_periods = {};
|
||||
|
||||
|
||||
@@ -45,8 +45,11 @@
|
||||
#include "lib/output/statistics.hpp"
|
||||
#include "lib/output/tmp_file.hpp"
|
||||
#include "lib/output/tmp_file_buffer.hpp"
|
||||
#include "lib/rocprofiler-sdk-att/att_lib_wrapper.hpp"
|
||||
|
||||
#include <rocprofiler-sdk/agent.h>
|
||||
#include <rocprofiler-sdk/amd_detail/thread_trace_core.h>
|
||||
#include <rocprofiler-sdk/amd_detail/thread_trace_dispatch.h>
|
||||
#include <rocprofiler-sdk/buffer_tracing.h>
|
||||
#include <rocprofiler-sdk/callback_tracing.h>
|
||||
#include <rocprofiler-sdk/experimental/counters.h>
|
||||
@@ -193,9 +196,10 @@ using kernel_iteration_t = std::unordered_map<rocprofiler_kernel_id_t, uint32
|
||||
using kernel_rename_map_t = std::unordered_map<uint64_t, uint64_t>;
|
||||
using kernel_rename_stack_t = std::stack<uint64_t>;
|
||||
|
||||
auto* tool_metadata = as_pointer<tool::metadata>(tool::metadata::inprocess{});
|
||||
auto target_kernels = common::Synchronized<targeted_kernels_map_t>{};
|
||||
auto kernel_iteration = common::Synchronized<kernel_iteration_t, true>{};
|
||||
auto* tool_metadata = as_pointer<tool::metadata>(tool::metadata::inprocess{});
|
||||
auto target_kernels = common::Synchronized<targeted_kernels_map_t>{};
|
||||
auto kernel_iteration = common::Synchronized<kernel_iteration_t, true>{};
|
||||
std::mutex att_shader_data;
|
||||
|
||||
thread_local auto thread_dispatch_rename = as_pointer<kernel_rename_stack_t>();
|
||||
thread_local auto thread_dispatch_rename_dtor = common::scope_destructor{[]() {
|
||||
@@ -238,7 +242,14 @@ is_targeted_kernel(uint64_t _kern_id)
|
||||
|
||||
// If the iteration range is not given then all iterations of the kernel is profiled
|
||||
if(_range.empty())
|
||||
return true;
|
||||
{
|
||||
if(!tool::get_config().advanced_thread_trace)
|
||||
return true;
|
||||
else
|
||||
{
|
||||
if(itr == 1) return true;
|
||||
}
|
||||
}
|
||||
else if(_range.find(itr) != _range.end())
|
||||
return true;
|
||||
return false;
|
||||
@@ -607,6 +618,75 @@ code_object_tracing_callback(rocprofiler_callback_tracing_record_t record,
|
||||
{
|
||||
CHECK_NOTNULL(tool_metadata)->add_decoder(obj_data);
|
||||
}
|
||||
|
||||
if(obj_data->storage_type == ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_MEMORY &&
|
||||
tool::get_config().advanced_thread_trace)
|
||||
{
|
||||
const char* gpu_name = tool_metadata->agents_map.at(obj_data->rocp_agent).name;
|
||||
auto filename = fmt::format("{}_code_object_id_{}",
|
||||
std::string(gpu_name),
|
||||
std::to_string(obj_data->code_object_id));
|
||||
auto output_stream = get_output_stream(tool::get_config(), filename, ".out");
|
||||
std::string output_filename =
|
||||
get_output_filename(tool::get_config(), filename, ".out");
|
||||
|
||||
// NOLINTNEXTLINE(performance-no-int-to-ptr)
|
||||
output_stream.stream->write(reinterpret_cast<char*>(obj_data->memory_base),
|
||||
obj_data->memory_size);
|
||||
tool_metadata->code_object_load.wlock(
|
||||
[](auto& data_vec,
|
||||
std::string file_name,
|
||||
tool::rocprofiler_code_object_info_t* obj_data_v) {
|
||||
data_vec.push_back({file_name,
|
||||
obj_data_v->code_object_id,
|
||||
obj_data_v->load_base,
|
||||
obj_data_v->load_size});
|
||||
},
|
||||
output_filename,
|
||||
obj_data);
|
||||
}
|
||||
else if(obj_data->storage_type == ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_FILE &&
|
||||
tool::get_config().advanced_thread_trace)
|
||||
{
|
||||
const char* gpu_name = tool_metadata->agents_map.at(obj_data->rocp_agent).name;
|
||||
auto filename = fmt::format("{}_code_object_id_{}",
|
||||
std::string(gpu_name),
|
||||
std::to_string(obj_data->code_object_id));
|
||||
auto output_stream = get_output_stream(tool::get_config(), filename, ".out");
|
||||
std::string output_filename =
|
||||
get_output_filename(tool::get_config(), filename, ".out");
|
||||
|
||||
uint8_t* binary = nullptr;
|
||||
size_t buffer_size = 0;
|
||||
std::ifstream code_object_file(obj_data->uri, std::ios::binary | std::ios::ate);
|
||||
if(code_object_file.good())
|
||||
{
|
||||
buffer_size = code_object_file.tellg();
|
||||
code_object_file.seekg(0, std::ios::beg);
|
||||
binary = new(std::nothrow) uint8_t[buffer_size];
|
||||
if(binary &&
|
||||
!code_object_file.read(reinterpret_cast<char*>(binary), buffer_size))
|
||||
{
|
||||
delete[] binary;
|
||||
binary = nullptr;
|
||||
}
|
||||
}
|
||||
// NOLINTBEGIN(performance-no-int-to-ptr)
|
||||
output_stream.stream->write(reinterpret_cast<char*>(obj_data->memory_base),
|
||||
obj_data->memory_size);
|
||||
// NOLINTEND(performance-no-int-to-ptr)
|
||||
tool_metadata->code_object_load.wlock(
|
||||
[](auto& data_vec,
|
||||
std::string file_name,
|
||||
tool::rocprofiler_code_object_info_t* obj_data_v) {
|
||||
data_vec.push_back({file_name,
|
||||
obj_data_v->code_object_id,
|
||||
obj_data_v->load_base,
|
||||
obj_data_v->load_size});
|
||||
},
|
||||
output_filename,
|
||||
obj_data);
|
||||
}
|
||||
}
|
||||
else if(record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD)
|
||||
{
|
||||
@@ -879,6 +959,38 @@ get_instruction_index(rocprofiler_pc_t pc)
|
||||
|
||||
} // namespace
|
||||
|
||||
std::vector<rocprofiler_att_parameter_t>
|
||||
get_att_perfcounter_params(std::vector<rocprofiler::tool::att_perfcounter>& att_perf_counters)
|
||||
{
|
||||
std::vector<rocprofiler_att_parameter_t> _data;
|
||||
if(att_perf_counters.empty()) return _data;
|
||||
|
||||
static const auto gpu_agents = get_gpu_agents();
|
||||
static const auto gpu_agents_counter_info = get_agent_counter_info();
|
||||
|
||||
for(const auto& [agent_, tool_counter_info_] : gpu_agents_counter_info)
|
||||
{
|
||||
for(const auto& counter_info_ : tool_counter_info_)
|
||||
{
|
||||
if(std::string_view(counter_info_.block) != "SQ") continue;
|
||||
|
||||
for(const auto& att_perf_counter : att_perf_counters)
|
||||
{
|
||||
if(std::string_view(counter_info_.name) == att_perf_counter.counter_name)
|
||||
{
|
||||
auto param = rocprofiler_att_parameter_t{};
|
||||
param.type = ROCPROFILER_ATT_PARAMETER_PERFCOUNTER,
|
||||
param.counter_id = counter_info_.id,
|
||||
param.simd_mask = att_perf_counter.simd_mask;
|
||||
_data.emplace_back(param);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return _data;
|
||||
}
|
||||
|
||||
void
|
||||
rocprofiler_pc_sampling_callback(rocprofiler_context_id_t /* context_id*/,
|
||||
rocprofiler_buffer_id_t /* buffer_id*/,
|
||||
@@ -920,6 +1032,55 @@ rocprofiler_pc_sampling_callback(rocprofiler_context_id_t /* context_id*/,
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
att_shader_data_callback(rocprofiler_agent_id_t agent,
|
||||
int64_t se_id,
|
||||
void* se_data,
|
||||
size_t data_size,
|
||||
rocprofiler_user_data_t userdata)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(att_shader_data);
|
||||
std::stringstream filename;
|
||||
filename << fmt::format("{}_shader_engine_{}_{}", agent.handle, se_id, userdata.value);
|
||||
|
||||
auto dispatch_id = static_cast<rocprofiler_dispatch_id_t>(userdata.value);
|
||||
auto output_stream = get_output_stream(tool::get_config(), filename.str(), ".att");
|
||||
std::string output_filename = get_output_filename(tool::get_config(), filename.str(), ".att");
|
||||
|
||||
output_stream.stream->write(reinterpret_cast<char*>(se_data), data_size);
|
||||
tool_metadata->att_filenames[dispatch_id].first = agent;
|
||||
tool_metadata->att_filenames[dispatch_id].second.emplace_back(output_filename);
|
||||
}
|
||||
|
||||
rocprofiler_att_control_flags_t
|
||||
att_dispatch_callback(rocprofiler_agent_id_t /* agent_id */,
|
||||
rocprofiler_queue_id_t /* queue_id */,
|
||||
rocprofiler_correlation_id_t /* correlation_id */,
|
||||
rocprofiler_kernel_id_t kernel_id,
|
||||
rocprofiler_dispatch_id_t dispatch_id,
|
||||
void* /*userdata_config*/,
|
||||
rocprofiler_user_data_t* userdata_shader)
|
||||
{
|
||||
userdata_shader->value = dispatch_id;
|
||||
kernel_iteration.wlock(
|
||||
[](auto& _kernel_iter, rocprofiler_kernel_id_t _kernel_id) {
|
||||
auto itr = _kernel_iter.find(_kernel_id);
|
||||
if(itr == _kernel_iter.end())
|
||||
_kernel_iter.emplace(_kernel_id, 1);
|
||||
else
|
||||
{
|
||||
itr->second++;
|
||||
}
|
||||
},
|
||||
kernel_id);
|
||||
|
||||
if(is_targeted_kernel(kernel_id))
|
||||
{
|
||||
return ROCPROFILER_ATT_CONTROL_START_AND_STOP;
|
||||
}
|
||||
return ROCPROFILER_ATT_CONTROL_NONE;
|
||||
}
|
||||
|
||||
void
|
||||
dispatch_callback(rocprofiler_dispatch_counting_service_data_t dispatch_data,
|
||||
rocprofiler_profile_config_id_t* config,
|
||||
@@ -1214,6 +1375,37 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
|
||||
}
|
||||
}
|
||||
|
||||
if(tool::get_config().advanced_thread_trace)
|
||||
{
|
||||
auto parameters = std::vector<rocprofiler_att_parameter_t>{};
|
||||
uint64_t target_cu = tool::get_config().att_param_target_cu;
|
||||
uint64_t simd_select = tool::get_config().att_param_simd_select;
|
||||
uint64_t buffer_sz = tool::get_config().att_param_buffer_size;
|
||||
uint64_t shader_mask = tool::get_config().att_param_shader_engine_mask;
|
||||
auto& att_perf = tool::get_config().att_param_perfcounters;
|
||||
bool att_serialize_all = tool::get_config().att_serialize_all;
|
||||
auto att_perf_params = get_att_perfcounter_params(att_perf);
|
||||
parameters.insert(parameters.end(), att_perf_params.begin(), att_perf_params.end());
|
||||
|
||||
// TODO: att params could be different for different devices. How to support?
|
||||
// Input file schema might also need to change to support multiple ATT params
|
||||
|
||||
parameters.push_back({ROCPROFILER_ATT_PARAMETER_TARGET_CU, {target_cu}});
|
||||
parameters.push_back({ROCPROFILER_ATT_PARAMETER_SIMD_SELECT, {simd_select}});
|
||||
parameters.push_back({ROCPROFILER_ATT_PARAMETER_BUFFER_SIZE, {buffer_sz}});
|
||||
parameters.push_back({ROCPROFILER_ATT_PARAMETER_SHADER_ENGINE_MASK, {shader_mask}});
|
||||
parameters.push_back({ROCPROFILER_ATT_PARAMETER_SERIALIZE_ALL, {att_serialize_all}});
|
||||
|
||||
ROCPROFILER_CALL(
|
||||
rocprofiler_configure_dispatch_thread_trace_service(get_client_ctx(),
|
||||
parameters.data(),
|
||||
parameters.size(),
|
||||
att_dispatch_callback,
|
||||
att_shader_data_callback,
|
||||
tool_data),
|
||||
"thread trace service configure");
|
||||
}
|
||||
|
||||
if(tool::get_config().hip_runtime_api_trace || tool::get_config().hip_compiler_api_trace)
|
||||
{
|
||||
ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(),
|
||||
@@ -1504,6 +1696,40 @@ tool_fini(void* /*tool_data*/)
|
||||
tool::generate_csv(tool::get_config(), *tool_metadata, contributions);
|
||||
}
|
||||
|
||||
if(tool::get_config().advanced_thread_trace)
|
||||
{
|
||||
std::unordered_map<std::string_view, rocprofiler::att_wrapper::tool_att_capability_t>
|
||||
tool_att_capability_map = {
|
||||
{"testing", rocprofiler::att_wrapper::ATT_CAPABILITIES_TESTING},
|
||||
{"summary", rocprofiler::att_wrapper::ATT_CAPABILITIES_SUMMARY},
|
||||
{"trace", rocprofiler::att_wrapper::ATT_CAPABILITIES_TRACE},
|
||||
{"debug", rocprofiler::att_wrapper::ATT_CAPABILITIES_DEBUG}};
|
||||
|
||||
ROCP_FATAL_IF(tool::get_config().att_capability.empty())
|
||||
<< "Provide the decoder parser method as input";
|
||||
|
||||
auto att_capability_value = tool_att_capability_map.at(tool::get_config().att_capability);
|
||||
auto decoder = rocprofiler::att_wrapper::ATTDecoder(att_capability_value);
|
||||
ROCP_FATAL_IF(!decoder.valid()) << "Decoder library not found at ROCPORF_ATT_LIBRARY_PATH";
|
||||
auto codeobj = tool_metadata->get_code_object_load_info();
|
||||
auto output_path = tool::format_path(tool::get_config().output_path);
|
||||
for(auto& [dispatch_id, att_filename_data] : tool_metadata->att_filenames)
|
||||
{
|
||||
std::string formats = "json,csv";
|
||||
// if(tool::get_config().json_output) formats += "json,";
|
||||
// if(tool::get_config().csv_output) formats += "csv,";
|
||||
|
||||
std::stringstream ui_name;
|
||||
ui_name << fmt::format("ui_output_agent_{}_dispatch_{}",
|
||||
std::to_string(att_filename_data.first.handle),
|
||||
dispatch_id);
|
||||
auto out_path = fmt::format("{}/{}", output_path, ui_name.str());
|
||||
std::string in_path = ".";
|
||||
|
||||
decoder.parse(in_path, out_path, att_filename_data.second, codeobj, formats);
|
||||
}
|
||||
}
|
||||
|
||||
if(tool::get_config().json_output)
|
||||
{
|
||||
auto json_ar = tool::open_json(tool::get_config());
|
||||
|
||||
@@ -32,3 +32,4 @@ add_subdirectory(pc-sampling)
|
||||
if(ROCPROFILER_BUILD_ROCDECODE_TESTS)
|
||||
add_subdirectory(rocdecode)
|
||||
endif()
|
||||
add_subdirectory(hsa-code-object)
|
||||
|
||||
@@ -0,0 +1,90 @@
|
||||
#
|
||||
#
|
||||
# HSA multi-queue dependency test
|
||||
|
||||
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
|
||||
|
||||
project(rocprofiler-tests-bin-hsa-code-object LANGUAGES CXX)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
set(CMAKE_CXX_EXTENSIONS OFF)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
|
||||
find_program(
|
||||
amdclangpp_EXECUTABLE REQUIRED
|
||||
NAMES amdclang++
|
||||
HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
|
||||
PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
|
||||
PATH_SUFFIXES bin llvm/bin NO_CACHE)
|
||||
|
||||
function(generate_hsaco TARGET_ID INPUT_FILE OUTPUT_FILE)
|
||||
separate_arguments(
|
||||
CLANG_ARG_LIST
|
||||
UNIX_COMMAND
|
||||
"-O2 -x cl -Xclang -finclude-default-header -cl-denorms-are-zero -cl-std=CL2.0 -Wl,--build-id=sha1
|
||||
-target amdgcn-amd-amdhsa -mcpu=${TARGET_ID} -o ${OUTPUT_FILE} ${INPUT_FILE}")
|
||||
add_custom_command(
|
||||
OUTPUT ${PROJECT_BINARY_DIR}/${OUTPUT_FILE}
|
||||
COMMAND ${amdclangpp_EXECUTABLE} ${CLANG_ARG_LIST}
|
||||
COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/${OUTPUT_FILE}
|
||||
${CMAKE_BINARY_DIR}/tests/rocprofv3/advanced-thread-trace/${OUTPUT_FILE}
|
||||
OUTPUT ${CMAKE_BINARY_DIR}/tests/rocprofv3/advanced-thread-trace/${OUTPUT_FILE}
|
||||
COMMAND
|
||||
${CMAKE_COMMAND} -E copy
|
||||
${CMAKE_BINARY_DIR}/tests/rocprofv3/advanced-thread-trace/${OUTPUT_FILE}
|
||||
${CMAKE_BINARY_DIR}/rocprofv3/advanced-thread-trace/${OUTPUT_FILE}
|
||||
COMMENT "Building ${OUTPUT_FILE}...")
|
||||
set(HSACO_TARGET_LIST
|
||||
${HSACO_TARGET_LIST} ${PROJECT_BINARY_DIR}/${OUTPUT_FILE}
|
||||
PARENT_SCOPE)
|
||||
endfunction(generate_hsaco)
|
||||
|
||||
foreach(target_id ${GPU_TARGETS})
|
||||
# generate kernel bitcodes
|
||||
generate_hsaco(${target_id} ${CMAKE_CURRENT_SOURCE_DIR}/copy.cl
|
||||
${target_id}_copy.hsaco)
|
||||
generate_hsaco(${target_id} ${CMAKE_CURRENT_SOURCE_DIR}/copy_memory.cl
|
||||
${target_id}_copy_memory.hsaco)
|
||||
endforeach()
|
||||
|
||||
add_custom_target(generate_hsaco_targets_code_object DEPENDS ${HSACO_TARGET_LIST})
|
||||
|
||||
add_executable(hsa_code_object_testapp)
|
||||
target_sources(hsa_code_object_testapp PRIVATE hsa_code_object_app.cpp)
|
||||
target_compile_options(hsa_code_object_testapp PRIVATE -W -Wall -Wextra -Wshadow -Werror)
|
||||
|
||||
find_package(Threads REQUIRED)
|
||||
target_link_libraries(hsa_code_object_testapp PRIVATE stdc++fs Threads::Threads)
|
||||
|
||||
find_package(
|
||||
amd_comgr
|
||||
REQUIRED
|
||||
CONFIG
|
||||
HINTS
|
||||
${CMAKE_INSTALL_PREFIX}
|
||||
PATHS
|
||||
${ROCM_PATH}
|
||||
PATH_SUFFIXES
|
||||
lib/cmake/amd_comgr)
|
||||
|
||||
target_link_libraries(hsa_code_object_testapp PRIVATE amd_comgr)
|
||||
|
||||
find_package(rocprofiler-sdk REQUIRED)
|
||||
target_link_libraries(
|
||||
hsa_code_object_testapp PRIVATE rocprofiler-sdk::rocprofiler-sdk
|
||||
rocprofiler-sdk::tests-common-library)
|
||||
|
||||
find_package(
|
||||
hsa-runtime64
|
||||
REQUIRED
|
||||
CONFIG
|
||||
HINTS
|
||||
${rocm_version_DIR}
|
||||
${ROCM_PATH}
|
||||
PATHS
|
||||
${rocm_version_DIR}
|
||||
${ROCM_PATH})
|
||||
|
||||
target_link_libraries(hsa_code_object_testapp PRIVATE hsa-runtime64::hsa-runtime64)
|
||||
|
||||
add_dependencies(hsa_code_object_testapp generate_hsaco_targets_code_object)
|
||||
@@ -0,0 +1,32 @@
|
||||
/* Copyright (c) 2022 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE. */
|
||||
|
||||
__kernel void copyA(__global unsigned int* a, __global unsigned int* b) {
|
||||
uint tid = get_global_id(0);
|
||||
a[tid] = b[tid];
|
||||
}
|
||||
__kernel void copyB(__global unsigned int* a, __global unsigned int* b) {
|
||||
uint tid = get_global_id(0);
|
||||
a[tid] = b[tid];
|
||||
}
|
||||
__kernel void copyC(__global unsigned int* a, __global unsigned int* b) {
|
||||
uint tid = get_global_id(0);
|
||||
a[tid] = b[tid];
|
||||
}
|
||||
@@ -0,0 +1,32 @@
|
||||
/* Copyright (c) 2022 Advanced Micro Devices, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE. */
|
||||
|
||||
__kernel void copyD(__global unsigned int* a, __global unsigned int* b) {
|
||||
uint tid = get_global_id(0);
|
||||
a[tid] = b[tid];
|
||||
}
|
||||
__kernel void copyE(__global unsigned int* a, __global unsigned int* b) {
|
||||
uint tid = get_global_id(0);
|
||||
a[tid] = b[tid];
|
||||
}
|
||||
__kernel void copyF(__global unsigned int* a, __global unsigned int* b) {
|
||||
uint tid = get_global_id(0);
|
||||
a[tid] = b[tid];
|
||||
}
|
||||
@@ -0,0 +1,492 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
// THE SOFTWARE.
|
||||
|
||||
/** ROC Profiler Multi Queue Dependency Test
|
||||
*
|
||||
* The goal of this test is to ensure ROC profiler does not go to deadlock
|
||||
* when multiple queue are created and they are dependent on each other
|
||||
*
|
||||
*/
|
||||
|
||||
#include "hsa_code_object_app.h"
|
||||
|
||||
enum class storage_type
|
||||
{
|
||||
CODE_OBJECT_STORAGE_FILE,
|
||||
CODE_OBJECT_STORAGE_MEMORY
|
||||
};
|
||||
|
||||
void
|
||||
code_object_load(MQDependencyTest& obj,
|
||||
storage_type type,
|
||||
MQDependencyTest::CodeObject& code_object)
|
||||
{
|
||||
hsa_status_t status;
|
||||
obj.device_discovery();
|
||||
char agent_name[64];
|
||||
status = hsa_agent_get_info(obj.gpu[0].agent, HSA_AGENT_INFO_NAME, agent_name);
|
||||
RET_IF_HSA_ERR(status)
|
||||
|
||||
if(type == storage_type::CODE_OBJECT_STORAGE_FILE)
|
||||
{
|
||||
std::string hasco_file_path = std::string(agent_name) + std::string("_copy.hsaco");
|
||||
obj.search_hasco(fs::current_path(), hasco_file_path);
|
||||
if(!obj.load_code_object(hasco_file_path, obj.gpu[0].agent, code_object))
|
||||
{
|
||||
printf("Kernel file not found or not usable with given agent.\n");
|
||||
abort();
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::string hasco_file_path = std::string(agent_name) + std::string("_copy_memory.hsaco");
|
||||
obj.search_hasco(fs::current_path(), hasco_file_path);
|
||||
if(!obj.load_code_object_memory(hasco_file_path, obj.gpu[0].agent, code_object))
|
||||
{
|
||||
abort();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
MQDependencyTest::Kernel
|
||||
get_kernel(MQDependencyTest::CodeObject& code_object,
|
||||
std::string kernel_name,
|
||||
MQDependencyTest& obj)
|
||||
{
|
||||
MQDependencyTest::Kernel copy;
|
||||
if(!obj.get_kernel(code_object, kernel_name, obj.gpu[0].agent, copy))
|
||||
{
|
||||
printf("Test %s not found.\n", kernel_name.c_str());
|
||||
abort();
|
||||
}
|
||||
return copy;
|
||||
}
|
||||
|
||||
int
|
||||
main()
|
||||
{
|
||||
hsa_status_t status;
|
||||
MQDependencyTest obj;
|
||||
MQDependencyTest obj_memory = {};
|
||||
MQDependencyTest::CodeObject code_object = {}, code_object_memory = {};
|
||||
|
||||
code_object_load(obj, storage_type::CODE_OBJECT_STORAGE_FILE, code_object);
|
||||
code_object_load(obj_memory, storage_type::CODE_OBJECT_STORAGE_MEMORY, code_object_memory);
|
||||
|
||||
MQDependencyTest::Kernel copyA = get_kernel(code_object, "copyA", obj);
|
||||
MQDependencyTest::Kernel copyB = get_kernel(code_object, "copyB", obj);
|
||||
MQDependencyTest::Kernel copyC = get_kernel(code_object, "copyC", obj);
|
||||
|
||||
MQDependencyTest::Kernel copyD = get_kernel(code_object_memory, "copyD", obj_memory);
|
||||
MQDependencyTest::Kernel copyE = get_kernel(code_object_memory, "copyE", obj_memory);
|
||||
MQDependencyTest::Kernel copyF = get_kernel(code_object_memory, "copyF", obj_memory);
|
||||
|
||||
struct args_t
|
||||
{
|
||||
uint32_t* a = nullptr;
|
||||
uint32_t* b = nullptr;
|
||||
MQDependencyTest::OCLHiddenArgs hidden = {};
|
||||
};
|
||||
|
||||
args_t* args = static_cast<args_t*>(obj.hsa_malloc(sizeof(args_t), obj.kernarg));
|
||||
*args = {};
|
||||
|
||||
uint32_t* a = static_cast<uint32_t*>(obj.hsa_malloc(64 * sizeof(uint32_t), obj.kernarg));
|
||||
uint32_t* b = static_cast<uint32_t*>(obj.hsa_malloc(64 * sizeof(uint32_t), obj.kernarg));
|
||||
|
||||
memset(a, 0, 64 * sizeof(uint32_t));
|
||||
memset(b, 1, 64 * sizeof(uint32_t));
|
||||
|
||||
args_t* args_memory =
|
||||
static_cast<args_t*>(obj_memory.hsa_malloc(sizeof(args_t), obj_memory.kernarg));
|
||||
*args_memory = {};
|
||||
|
||||
uint32_t* c =
|
||||
static_cast<uint32_t*>(obj_memory.hsa_malloc(64 * sizeof(uint32_t), obj_memory.kernarg));
|
||||
uint32_t* d =
|
||||
static_cast<uint32_t*>(obj_memory.hsa_malloc(64 * sizeof(uint32_t), obj_memory.kernarg));
|
||||
|
||||
memset(c, 0, 64 * sizeof(uint32_t));
|
||||
memset(d, 1, 64 * sizeof(uint32_t));
|
||||
|
||||
// Create queue in gpu agent and prepare a kernel dispatch packet
|
||||
hsa_queue_t* queue1 = nullptr;
|
||||
status = hsa_queue_create(obj.gpu[0].agent,
|
||||
1024,
|
||||
HSA_QUEUE_TYPE_SINGLE,
|
||||
nullptr,
|
||||
nullptr,
|
||||
UINT32_MAX,
|
||||
UINT32_MAX,
|
||||
&queue1);
|
||||
RET_IF_HSA_ERR(status)
|
||||
|
||||
// Create a signal with a value of 1 and attach it to the first kernel
|
||||
// dispatch packet
|
||||
hsa_signal_t completion_signal_1 = {};
|
||||
status = hsa_signal_create(1, 0, nullptr, &completion_signal_1);
|
||||
RET_IF_HSA_ERR(status)
|
||||
|
||||
// First dispath packet on queue 1, Kernel A
|
||||
{
|
||||
MQDependencyTest::Aql packet{};
|
||||
packet.header.type = HSA_PACKET_TYPE_KERNEL_DISPATCH;
|
||||
packet.header.barrier = 1;
|
||||
packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM;
|
||||
packet.header.release = HSA_FENCE_SCOPE_SYSTEM;
|
||||
|
||||
packet.dispatch.setup = 1;
|
||||
packet.dispatch.workgroup_size_x = 64;
|
||||
packet.dispatch.workgroup_size_y = 1;
|
||||
packet.dispatch.workgroup_size_z = 1;
|
||||
packet.dispatch.grid_size_x = 64;
|
||||
packet.dispatch.grid_size_y = 1;
|
||||
packet.dispatch.grid_size_z = 1;
|
||||
|
||||
packet.dispatch.group_segment_size = copyA.group;
|
||||
packet.dispatch.private_segment_size = copyA.scratch;
|
||||
packet.dispatch.kernel_object = copyA.handle;
|
||||
|
||||
packet.dispatch.kernarg_address = args;
|
||||
packet.dispatch.completion_signal = completion_signal_1;
|
||||
|
||||
args->a = a;
|
||||
args->b = b;
|
||||
// Tell packet processor of A to launch the first kernel dispatch packet
|
||||
obj.submit_packet(queue1, packet);
|
||||
}
|
||||
|
||||
// Create a signal with a value of 1 and attach it to the second kernel
|
||||
// dispatch packet
|
||||
hsa_signal_t completion_signal_2 = {};
|
||||
status = hsa_signal_create(1, 0, nullptr, &completion_signal_2);
|
||||
RET_IF_HSA_ERR(status)
|
||||
|
||||
hsa_signal_t completion_signal_3 = {};
|
||||
status = hsa_signal_create(1, 0, nullptr, &completion_signal_3);
|
||||
RET_IF_HSA_ERR(status)
|
||||
|
||||
// Create barrier-AND packet that is enqueued in queue 1
|
||||
{
|
||||
MQDependencyTest::Aql packet{};
|
||||
packet.header.type = HSA_PACKET_TYPE_BARRIER_AND;
|
||||
packet.header.barrier = 1;
|
||||
packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM;
|
||||
packet.header.release = HSA_FENCE_SCOPE_SYSTEM;
|
||||
|
||||
packet.barrier_and.dep_signal[0] = completion_signal_2;
|
||||
obj.submit_packet(queue1, packet);
|
||||
}
|
||||
|
||||
// Second dispath packet on queue 1, Kernel C
|
||||
{
|
||||
MQDependencyTest::Aql packet{};
|
||||
packet.header.type = HSA_PACKET_TYPE_KERNEL_DISPATCH;
|
||||
packet.header.barrier = 1;
|
||||
packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM;
|
||||
packet.header.release = HSA_FENCE_SCOPE_SYSTEM;
|
||||
|
||||
packet.dispatch.setup = 1;
|
||||
packet.dispatch.workgroup_size_x = 64;
|
||||
packet.dispatch.workgroup_size_y = 1;
|
||||
packet.dispatch.workgroup_size_z = 1;
|
||||
packet.dispatch.grid_size_x = 64;
|
||||
packet.dispatch.grid_size_y = 1;
|
||||
packet.dispatch.grid_size_z = 1;
|
||||
|
||||
packet.dispatch.group_segment_size = copyC.group;
|
||||
packet.dispatch.private_segment_size = copyC.scratch;
|
||||
packet.dispatch.kernel_object = copyC.handle;
|
||||
packet.dispatch.completion_signal = completion_signal_3;
|
||||
packet.dispatch.kernarg_address = args;
|
||||
|
||||
args->a = a;
|
||||
args->b = b;
|
||||
// Tell packet processor to launch the second kernel dispatch packet
|
||||
obj.submit_packet(queue1, packet);
|
||||
}
|
||||
|
||||
// Create queue 2
|
||||
hsa_queue_t* queue2 = nullptr;
|
||||
status = hsa_queue_create(obj.gpu[0].agent,
|
||||
1024,
|
||||
HSA_QUEUE_TYPE_SINGLE,
|
||||
nullptr,
|
||||
nullptr,
|
||||
UINT32_MAX,
|
||||
UINT32_MAX,
|
||||
&queue2);
|
||||
RET_IF_HSA_ERR(status)
|
||||
|
||||
// Create barrier-AND packet that is enqueued in queue 2
|
||||
{
|
||||
MQDependencyTest::Aql packet{};
|
||||
packet.header.type = HSA_PACKET_TYPE_BARRIER_AND;
|
||||
packet.header.barrier = 1;
|
||||
packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM;
|
||||
packet.header.release = HSA_FENCE_SCOPE_SYSTEM;
|
||||
|
||||
packet.barrier_and.dep_signal[0] = completion_signal_1;
|
||||
obj.submit_packet(queue2, packet);
|
||||
}
|
||||
|
||||
// Third dispath packet on queue 2, Kernel B
|
||||
{
|
||||
MQDependencyTest::Aql packet{};
|
||||
packet.header.type = HSA_PACKET_TYPE_KERNEL_DISPATCH;
|
||||
packet.header.barrier = 1;
|
||||
packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM;
|
||||
packet.header.release = HSA_FENCE_SCOPE_SYSTEM;
|
||||
|
||||
packet.dispatch.setup = 1;
|
||||
packet.dispatch.workgroup_size_x = 64;
|
||||
packet.dispatch.workgroup_size_y = 1;
|
||||
packet.dispatch.workgroup_size_z = 1;
|
||||
packet.dispatch.grid_size_x = 64;
|
||||
packet.dispatch.grid_size_y = 1;
|
||||
packet.dispatch.grid_size_z = 1;
|
||||
|
||||
packet.dispatch.group_segment_size = copyB.group;
|
||||
packet.dispatch.private_segment_size = copyB.scratch;
|
||||
packet.dispatch.kernel_object = copyB.handle;
|
||||
|
||||
packet.dispatch.kernarg_address = args;
|
||||
packet.dispatch.completion_signal = completion_signal_2;
|
||||
|
||||
args->a = a;
|
||||
args->b = b;
|
||||
// Tell packet processor to launch the third kernel dispatch packet
|
||||
obj.submit_packet(queue2, packet);
|
||||
}
|
||||
// Create a signal with a value of 1 and attach it to the first kernel
|
||||
// dispatch packet
|
||||
hsa_signal_t completion_signal_4 = {};
|
||||
status = hsa_signal_create(1, 0, nullptr, &completion_signal_4);
|
||||
RET_IF_HSA_ERR(status)
|
||||
// First dispath packet on queue 1, Kernel D
|
||||
{
|
||||
[[maybe_unused]] MQDependencyTest::Aql packet{};
|
||||
packet.header.type = HSA_PACKET_TYPE_KERNEL_DISPATCH;
|
||||
packet.header.barrier = 1;
|
||||
packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM;
|
||||
packet.header.release = HSA_FENCE_SCOPE_SYSTEM;
|
||||
|
||||
packet.dispatch.setup = 1;
|
||||
packet.dispatch.workgroup_size_x = 64;
|
||||
packet.dispatch.workgroup_size_y = 1;
|
||||
packet.dispatch.workgroup_size_z = 1;
|
||||
packet.dispatch.grid_size_x = 64;
|
||||
packet.dispatch.grid_size_y = 1;
|
||||
packet.dispatch.grid_size_z = 1;
|
||||
|
||||
packet.dispatch.group_segment_size = copyD.group;
|
||||
packet.dispatch.private_segment_size = copyD.scratch;
|
||||
packet.dispatch.kernel_object = copyD.handle;
|
||||
|
||||
packet.dispatch.kernarg_address = args_memory;
|
||||
packet.dispatch.completion_signal = completion_signal_4;
|
||||
|
||||
args_memory->a = c;
|
||||
args_memory->b = d;
|
||||
// Tell packet processor of A to launch the first kernel dispatch packet
|
||||
obj_memory.submit_packet(queue1, packet);
|
||||
}
|
||||
|
||||
// Create a signal with a value of 1 and attach it to the second kernel
|
||||
// dispatch packet
|
||||
hsa_signal_t completion_signal_5 = {};
|
||||
status = hsa_signal_create(1, 0, nullptr, &completion_signal_5);
|
||||
RET_IF_HSA_ERR(status)
|
||||
hsa_signal_t completion_signal_6 = {};
|
||||
status = hsa_signal_create(1, 0, nullptr, &completion_signal_6);
|
||||
RET_IF_HSA_ERR(status)
|
||||
|
||||
// Create barrier-AND packet that is enqueued in queue 1
|
||||
{
|
||||
MQDependencyTest::Aql packet{};
|
||||
packet.header.type = HSA_PACKET_TYPE_BARRIER_AND;
|
||||
packet.header.barrier = 1;
|
||||
packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM;
|
||||
packet.header.release = HSA_FENCE_SCOPE_SYSTEM;
|
||||
|
||||
packet.barrier_and.dep_signal[0] = completion_signal_5;
|
||||
obj_memory.submit_packet(queue1, packet);
|
||||
}
|
||||
// Second dispath packet on queue 1, Kernel F
|
||||
{
|
||||
MQDependencyTest::Aql packet{};
|
||||
packet.header.type = HSA_PACKET_TYPE_KERNEL_DISPATCH;
|
||||
packet.header.barrier = 1;
|
||||
packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM;
|
||||
packet.header.release = HSA_FENCE_SCOPE_SYSTEM;
|
||||
|
||||
packet.dispatch.setup = 1;
|
||||
packet.dispatch.workgroup_size_x = 64;
|
||||
packet.dispatch.workgroup_size_y = 1;
|
||||
packet.dispatch.workgroup_size_z = 1;
|
||||
packet.dispatch.grid_size_x = 64;
|
||||
packet.dispatch.grid_size_y = 1;
|
||||
packet.dispatch.grid_size_z = 1;
|
||||
|
||||
packet.dispatch.group_segment_size = copyF.group;
|
||||
packet.dispatch.private_segment_size = copyF.scratch;
|
||||
packet.dispatch.kernel_object = copyF.handle;
|
||||
packet.dispatch.completion_signal = completion_signal_6;
|
||||
packet.dispatch.kernarg_address = args_memory;
|
||||
|
||||
args_memory->a = c;
|
||||
args_memory->b = d;
|
||||
// Tell packet processor to launch the second kernel dispatch packet
|
||||
obj_memory.submit_packet(queue1, packet);
|
||||
}
|
||||
// Create barrier-AND packet that is enqueued in queue 2
|
||||
{
|
||||
MQDependencyTest::Aql packet{};
|
||||
packet.header.type = HSA_PACKET_TYPE_BARRIER_AND;
|
||||
packet.header.barrier = 1;
|
||||
packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM;
|
||||
packet.header.release = HSA_FENCE_SCOPE_SYSTEM;
|
||||
|
||||
packet.barrier_and.dep_signal[0] = completion_signal_4;
|
||||
obj_memory.submit_packet(queue2, packet);
|
||||
}
|
||||
// Third dispath packet on queue 2, Kernel
|
||||
{
|
||||
MQDependencyTest::Aql packet{};
|
||||
packet.header.type = HSA_PACKET_TYPE_KERNEL_DISPATCH;
|
||||
packet.header.barrier = 1;
|
||||
packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM;
|
||||
packet.header.release = HSA_FENCE_SCOPE_SYSTEM;
|
||||
|
||||
packet.dispatch.setup = 1;
|
||||
packet.dispatch.workgroup_size_x = 64;
|
||||
packet.dispatch.workgroup_size_y = 1;
|
||||
packet.dispatch.workgroup_size_z = 1;
|
||||
packet.dispatch.grid_size_x = 64;
|
||||
packet.dispatch.grid_size_y = 1;
|
||||
packet.dispatch.grid_size_z = 1;
|
||||
|
||||
packet.dispatch.group_segment_size = copyE.group;
|
||||
packet.dispatch.private_segment_size = copyE.scratch;
|
||||
packet.dispatch.kernel_object = copyE.handle;
|
||||
|
||||
packet.dispatch.kernarg_address = args_memory;
|
||||
packet.dispatch.completion_signal = completion_signal_5;
|
||||
|
||||
args_memory->a = c;
|
||||
args_memory->b = d;
|
||||
// Tell packet processor to launch the third kernel dispatch packet
|
||||
obj_memory.submit_packet(queue2, packet);
|
||||
}
|
||||
// Wait on the completion signal
|
||||
hsa_signal_wait_relaxed(
|
||||
completion_signal_1, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
|
||||
|
||||
// Wait on the completion signal
|
||||
hsa_signal_wait_relaxed(
|
||||
completion_signal_2, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
|
||||
|
||||
// Wait on the completion signal
|
||||
hsa_signal_wait_relaxed(
|
||||
completion_signal_3, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
|
||||
|
||||
// Wait on the completion signal
|
||||
hsa_signal_wait_relaxed(
|
||||
completion_signal_4, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
|
||||
|
||||
// Wait on the completion signal
|
||||
hsa_signal_wait_relaxed(
|
||||
completion_signal_5, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
|
||||
|
||||
// Wait on the completion signal
|
||||
hsa_signal_wait_relaxed(
|
||||
completion_signal_6, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
|
||||
|
||||
for(int i = 0; i < 64; i++)
|
||||
{
|
||||
if(a[i] != b[i])
|
||||
{
|
||||
printf("error at %d: expected %d, got %d\n", i, b[i], a[i]);
|
||||
abort();
|
||||
}
|
||||
}
|
||||
|
||||
// Clearing data structures and memory
|
||||
status = hsa_signal_destroy(completion_signal_1);
|
||||
RET_IF_HSA_ERR(status)
|
||||
|
||||
status = hsa_signal_destroy(completion_signal_2);
|
||||
RET_IF_HSA_ERR(status)
|
||||
|
||||
status = hsa_signal_destroy(completion_signal_3);
|
||||
RET_IF_HSA_ERR(status)
|
||||
|
||||
// Clearing data structures and memory
|
||||
status = hsa_signal_destroy(completion_signal_4);
|
||||
RET_IF_HSA_ERR(status)
|
||||
|
||||
status = hsa_signal_destroy(completion_signal_5);
|
||||
RET_IF_HSA_ERR(status)
|
||||
|
||||
status = hsa_signal_destroy(completion_signal_6);
|
||||
RET_IF_HSA_ERR(status)
|
||||
|
||||
if(queue1 != nullptr)
|
||||
{
|
||||
status = hsa_queue_destroy(queue1);
|
||||
RET_IF_HSA_ERR(status)
|
||||
}
|
||||
|
||||
if(queue2 != nullptr)
|
||||
{
|
||||
status = hsa_queue_destroy(queue2);
|
||||
RET_IF_HSA_ERR(status)
|
||||
}
|
||||
|
||||
status = hsa_memory_free(a);
|
||||
RET_IF_HSA_ERR(status)
|
||||
|
||||
status = hsa_memory_free(b);
|
||||
RET_IF_HSA_ERR(status)
|
||||
|
||||
status = hsa_memory_free(c);
|
||||
RET_IF_HSA_ERR(status)
|
||||
|
||||
status = hsa_memory_free(d);
|
||||
RET_IF_HSA_ERR(status)
|
||||
|
||||
status = hsa_executable_destroy(code_object.executable);
|
||||
RET_IF_HSA_ERR(status)
|
||||
|
||||
status = hsa_code_object_reader_destroy(code_object.code_obj_rdr);
|
||||
RET_IF_HSA_ERR(status)
|
||||
|
||||
status = hsa_executable_destroy(code_object_memory.executable);
|
||||
RET_IF_HSA_ERR(status)
|
||||
|
||||
status = hsa_code_object_reader_destroy(code_object_memory.code_obj_rdr);
|
||||
RET_IF_HSA_ERR(status)
|
||||
|
||||
close(code_object.file);
|
||||
|
||||
close(code_object_memory.file);
|
||||
}
|
||||
@@ -0,0 +1,415 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
// THE SOFTWARE.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "common/filesystem.hpp"
|
||||
|
||||
#include <hsa/hsa.h>
|
||||
#include <hsa/hsa_ext_amd.h>
|
||||
|
||||
#include <dlfcn.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <cassert>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
namespace fs = common::fs;
|
||||
|
||||
#define RET_IF_HSA_ERR(err) \
|
||||
{ \
|
||||
if((err) != HSA_STATUS_SUCCESS) \
|
||||
{ \
|
||||
char err_val[12]; \
|
||||
char* err_str = nullptr; \
|
||||
if(hsa_status_string(err, (const char**) &err_str) != HSA_STATUS_SUCCESS) \
|
||||
{ \
|
||||
sprintf(&(err_val[0]), "%#x", (uint32_t) err); \
|
||||
err_str = &(err_val[0]); \
|
||||
} \
|
||||
printf("hsa api call failure at: %s:%d\n", __FILE__, __LINE__); \
|
||||
printf("Call returned %s\n", err_str); \
|
||||
abort(); \
|
||||
} \
|
||||
}
|
||||
|
||||
struct Device
|
||||
{
|
||||
struct Memory
|
||||
{
|
||||
hsa_amd_memory_pool_t pool;
|
||||
bool fine;
|
||||
bool kernarg;
|
||||
size_t size;
|
||||
size_t granule;
|
||||
};
|
||||
|
||||
hsa_agent_t agent;
|
||||
char name[64];
|
||||
std::vector<Memory> pools;
|
||||
uint32_t fine;
|
||||
uint32_t coarse;
|
||||
static std::vector<hsa_agent_t> all_devices;
|
||||
};
|
||||
|
||||
class MQDependencyTest
|
||||
{
|
||||
public:
|
||||
MQDependencyTest() { hsa_init(); }
|
||||
~MQDependencyTest() { hsa_shut_down(); }
|
||||
|
||||
std::vector<Device> cpu;
|
||||
std::vector<Device> gpu;
|
||||
Device::Memory kernarg;
|
||||
std::vector<hsa_agent_t> all_devices = {};
|
||||
|
||||
struct CodeObject
|
||||
{
|
||||
hsa_file_t file = 0;
|
||||
hsa_code_object_reader_t code_obj_rdr = {};
|
||||
hsa_executable_t executable = {};
|
||||
};
|
||||
|
||||
struct Kernel
|
||||
{
|
||||
uint64_t handle = 0;
|
||||
uint32_t scratch = 0;
|
||||
uint32_t group = 0;
|
||||
uint32_t kernarg_size = 0;
|
||||
uint32_t kernarg_align = 0;
|
||||
};
|
||||
|
||||
union AqlHeader
|
||||
{
|
||||
struct
|
||||
{
|
||||
uint16_t type : 8;
|
||||
uint16_t barrier : 1;
|
||||
uint16_t acquire : 2;
|
||||
uint16_t release : 2;
|
||||
uint16_t reserved : 3;
|
||||
};
|
||||
uint16_t raw = 0;
|
||||
};
|
||||
|
||||
struct BarrierValue
|
||||
{
|
||||
AqlHeader header = {};
|
||||
uint8_t AmdFormat = 0;
|
||||
uint8_t reserved = 0;
|
||||
uint32_t reserved1 = 0;
|
||||
hsa_signal_t signal = {};
|
||||
hsa_signal_value_t value = 0;
|
||||
hsa_signal_value_t mask = 0;
|
||||
uint32_t cond = 0;
|
||||
uint32_t reserved2 = 0;
|
||||
uint64_t reserved3 = 0;
|
||||
uint64_t reserved4 = 0;
|
||||
hsa_signal_t completion_signal = {};
|
||||
};
|
||||
|
||||
union Aql
|
||||
{
|
||||
AqlHeader header;
|
||||
hsa_kernel_dispatch_packet_t dispatch;
|
||||
hsa_barrier_and_packet_t barrier_and;
|
||||
hsa_barrier_or_packet_t barrier_or;
|
||||
BarrierValue barrier_value = {};
|
||||
};
|
||||
|
||||
struct OCLHiddenArgs
|
||||
{
|
||||
uint64_t offset_x = 0;
|
||||
uint64_t offset_y = 0;
|
||||
uint64_t offset_z = 0;
|
||||
void* printf_buffer = nullptr;
|
||||
void* enqueue = nullptr;
|
||||
void* enqueue2 = nullptr;
|
||||
void* multi_grid = nullptr;
|
||||
};
|
||||
|
||||
bool load_code_object(const std::string& filename, hsa_agent_t agent, CodeObject& code_object)
|
||||
{
|
||||
hsa_status_t err;
|
||||
code_object.file = open(filename.c_str(), O_RDONLY);
|
||||
if(code_object.file == -1)
|
||||
{
|
||||
fprintf(stderr, "%s:%s\n", "Could not load code object", filename.c_str());
|
||||
abort();
|
||||
return false;
|
||||
}
|
||||
|
||||
err = hsa_code_object_reader_create_from_file(code_object.file, &code_object.code_obj_rdr);
|
||||
RET_IF_HSA_ERR(err);
|
||||
|
||||
err = hsa_executable_create_alt(HSA_PROFILE_FULL,
|
||||
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
|
||||
nullptr,
|
||||
&code_object.executable);
|
||||
RET_IF_HSA_ERR(err);
|
||||
|
||||
err = hsa_executable_load_agent_code_object(
|
||||
code_object.executable, agent, code_object.code_obj_rdr, nullptr, nullptr);
|
||||
if(err != HSA_STATUS_SUCCESS) return false;
|
||||
|
||||
err = hsa_executable_freeze(code_object.executable, nullptr);
|
||||
RET_IF_HSA_ERR(err);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool load_code_object_memory(const std::string& filename,
|
||||
hsa_agent_t agent,
|
||||
CodeObject& code_object)
|
||||
{
|
||||
hsa_status_t err;
|
||||
size_t buffer_size = 0;
|
||||
std::ifstream code_object_file(filename.c_str(), std::ios::binary | std::ios::ate);
|
||||
if(!code_object_file.good()) return false;
|
||||
buffer_size = code_object_file.tellg();
|
||||
code_object_file.seekg(0, std::ios::beg);
|
||||
uint8_t* binary = new(std::nothrow) uint8_t[buffer_size];
|
||||
if(binary && !code_object_file.read(reinterpret_cast<char*>(binary), buffer_size))
|
||||
{
|
||||
delete[] binary;
|
||||
binary = nullptr;
|
||||
return false;
|
||||
}
|
||||
|
||||
err = hsa_code_object_reader_create_from_memory(
|
||||
binary, buffer_size, &code_object.code_obj_rdr);
|
||||
RET_IF_HSA_ERR(err);
|
||||
|
||||
err = hsa_executable_create_alt(HSA_PROFILE_FULL,
|
||||
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
|
||||
nullptr,
|
||||
&code_object.executable);
|
||||
RET_IF_HSA_ERR(err);
|
||||
|
||||
err = hsa_executable_load_agent_code_object(
|
||||
code_object.executable, agent, code_object.code_obj_rdr, nullptr, nullptr);
|
||||
if(err != HSA_STATUS_SUCCESS) return false;
|
||||
|
||||
err = hsa_executable_freeze(code_object.executable, nullptr);
|
||||
RET_IF_HSA_ERR(err);
|
||||
delete[] binary;
|
||||
binary = nullptr;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool get_kernel(const CodeObject& code_object,
|
||||
const std::string& kernel,
|
||||
hsa_agent_t agent,
|
||||
Kernel& kern)
|
||||
{
|
||||
hsa_executable_symbol_t symbol;
|
||||
hsa_status_t err = hsa_executable_get_symbol_by_name(
|
||||
code_object.executable, kernel.c_str(), &agent, &symbol);
|
||||
if(err != HSA_STATUS_SUCCESS)
|
||||
{
|
||||
err = hsa_executable_get_symbol_by_name(
|
||||
code_object.executable, (kernel + ".kd").c_str(), &agent, &symbol);
|
||||
if(err != HSA_STATUS_SUCCESS)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
printf("\nkernel-name: %s\n", kernel.c_str());
|
||||
err = hsa_executable_symbol_get_info(
|
||||
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kern.handle);
|
||||
RET_IF_HSA_ERR(err);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Not for parallel insertion.
|
||||
bool submit_packet(hsa_queue_t* queue, Aql& pkt)
|
||||
{
|
||||
size_t mask = queue->size - 1;
|
||||
Aql* ring = static_cast<Aql*>(queue->base_address);
|
||||
|
||||
uint64_t write = hsa_queue_load_write_index_relaxed(queue);
|
||||
uint64_t read = hsa_queue_load_read_index_relaxed(queue);
|
||||
if(write - read + 1 > queue->size) return false;
|
||||
|
||||
Aql& dst = ring[write & mask];
|
||||
|
||||
uint16_t header = pkt.header.raw;
|
||||
pkt.header.raw = dst.header.raw;
|
||||
dst = pkt;
|
||||
__atomic_store_n(&dst.header.raw, header, __ATOMIC_RELEASE);
|
||||
pkt.header.raw = header;
|
||||
|
||||
hsa_queue_store_write_index_release(queue, write + 1);
|
||||
hsa_signal_store_screlease(queue->doorbell_signal, write);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void* hsa_malloc(size_t size, const Device::Memory& mem)
|
||||
{
|
||||
void* ret;
|
||||
hsa_status_t err = hsa_amd_memory_pool_allocate(mem.pool, size, 0, &ret);
|
||||
RET_IF_HSA_ERR(err);
|
||||
|
||||
err = hsa_amd_agents_allow_access(all_devices.size(), all_devices.data(), nullptr, ret);
|
||||
RET_IF_HSA_ERR(err);
|
||||
return ret;
|
||||
}
|
||||
|
||||
void* hsa_malloc(size_t size, const Device& dev, bool fine)
|
||||
{
|
||||
uint32_t index = fine ? dev.fine : dev.coarse;
|
||||
assert(index != -1u && "Memory type unavailable.");
|
||||
return hsa_malloc(size, dev.pools[index]);
|
||||
}
|
||||
|
||||
bool device_discovery()
|
||||
{
|
||||
hsa_status_t err;
|
||||
|
||||
err = hsa_iterate_agents(
|
||||
[](hsa_agent_t agent, void* obj) {
|
||||
hsa_status_t error;
|
||||
|
||||
Device dev;
|
||||
dev.agent = agent;
|
||||
|
||||
dev.fine = -1u;
|
||||
dev.coarse = -1u;
|
||||
MQDependencyTest* _obj = reinterpret_cast<MQDependencyTest*>(obj);
|
||||
error = hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, dev.name);
|
||||
RET_IF_HSA_ERR(error)
|
||||
|
||||
hsa_device_type_t type;
|
||||
error = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type);
|
||||
RET_IF_HSA_ERR(error)
|
||||
|
||||
error = hsa_amd_agent_iterate_memory_pools(
|
||||
agent,
|
||||
[](hsa_amd_memory_pool_t pool, void* data) {
|
||||
auto& pools = *reinterpret_cast<std::vector<Device::Memory>*>(data);
|
||||
hsa_status_t status;
|
||||
|
||||
bool allowed = false;
|
||||
status = hsa_amd_memory_pool_get_info(
|
||||
pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &allowed);
|
||||
|
||||
if(!allowed) return HSA_STATUS_SUCCESS;
|
||||
|
||||
hsa_amd_segment_t segment;
|
||||
status = hsa_amd_memory_pool_get_info(
|
||||
pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment);
|
||||
RET_IF_HSA_ERR(status)
|
||||
|
||||
if(segment != HSA_AMD_SEGMENT_GLOBAL) return HSA_STATUS_SUCCESS;
|
||||
|
||||
uint32_t flags;
|
||||
status = hsa_amd_memory_pool_get_info(
|
||||
pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags);
|
||||
RET_IF_HSA_ERR(status)
|
||||
|
||||
Device::Memory mem;
|
||||
mem.pool = pool;
|
||||
mem.fine = ((flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) != 0u);
|
||||
mem.kernarg =
|
||||
((flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) != 0u);
|
||||
|
||||
status = hsa_amd_memory_pool_get_info(
|
||||
pool, HSA_AMD_MEMORY_POOL_INFO_SIZE, &mem.size);
|
||||
RET_IF_HSA_ERR(status)
|
||||
|
||||
status = hsa_amd_memory_pool_get_info(
|
||||
pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &mem.granule);
|
||||
RET_IF_HSA_ERR(status)
|
||||
|
||||
pools.push_back(mem);
|
||||
return HSA_STATUS_SUCCESS;
|
||||
},
|
||||
static_cast<void*>(&dev.pools));
|
||||
|
||||
if(!dev.pools.empty())
|
||||
{
|
||||
for(size_t i = 0; i < dev.pools.size(); i++)
|
||||
{
|
||||
if(dev.pools[i].fine && dev.pools[i].kernarg && dev.fine == -1u)
|
||||
dev.fine = i;
|
||||
if(dev.pools[i].fine && !dev.pools[i].kernarg) dev.fine = i;
|
||||
if(!dev.pools[i].fine) dev.coarse = i;
|
||||
}
|
||||
|
||||
if(type == HSA_DEVICE_TYPE_CPU)
|
||||
_obj->cpu.push_back(dev);
|
||||
else
|
||||
_obj->gpu.push_back(dev);
|
||||
|
||||
_obj->all_devices.push_back(dev.agent);
|
||||
}
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
},
|
||||
this);
|
||||
|
||||
bool is_break = false;
|
||||
for(auto& dev : cpu)
|
||||
{
|
||||
for(auto& mem : dev.pools)
|
||||
{
|
||||
if(mem.fine && mem.kernarg)
|
||||
{
|
||||
kernarg = mem;
|
||||
is_break = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if(is_break) break;
|
||||
}
|
||||
|
||||
RET_IF_HSA_ERR(err);
|
||||
|
||||
if(cpu.empty() || gpu.empty() || kernarg.pool.handle == 0) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
void search_hasco(const fs::path& directory, std::string& filename)
|
||||
{
|
||||
for(const auto& entry : fs::directory_iterator(directory))
|
||||
{
|
||||
if(fs::is_regular_file(entry))
|
||||
{
|
||||
if(entry.path().filename() == filename)
|
||||
{
|
||||
filename = entry.path();
|
||||
}
|
||||
}
|
||||
else if(fs::is_directory(entry))
|
||||
{
|
||||
search_hasco(entry, filename); // Recursive call for subdirectories
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -39,3 +39,6 @@ add_subdirectory(collection-period)
|
||||
if(ROCPROFILER_BUILD_ROCDECODE_TESTS)
|
||||
add_subdirectory(rocdecode-trace)
|
||||
endif()
|
||||
if(TARGET att_decoder_testing)
|
||||
add_subdirectory(advanced-thread-trace)
|
||||
endif()
|
||||
|
||||
@@ -0,0 +1,100 @@
|
||||
#
|
||||
# rocprofv3 tool test
|
||||
#
|
||||
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
|
||||
|
||||
project(
|
||||
rocprofiler-tests-rocprofv3-att
|
||||
LANGUAGES CXX
|
||||
VERSION 0.0.0)
|
||||
|
||||
string(REPLACE "LD_PRELOAD=" "--preload;" PRELOAD_ARGS
|
||||
"${ROCPROFILER_MEMCHECK_PRELOAD_ENV}")
|
||||
|
||||
rocprofiler_configure_pytest_files(CONFIG pytest.ini COPY validate.py conftest.py
|
||||
att_input.json)
|
||||
|
||||
find_package(rocprofiler-sdk REQUIRED)
|
||||
|
||||
# hsa multiqueue dependency test
|
||||
|
||||
add_test(
|
||||
NAME rocprofv3-test-hsa-multiqueue-att-cmd-env-ld-lib-path-execute
|
||||
COMMAND
|
||||
$<TARGET_FILE:rocprofiler-sdk::rocprofv3> --advanced-thread-trace 1
|
||||
--att-target-cu 1 --att-shader-engine-mask 0x11 --kernel-include-regex copyD
|
||||
--att-buffer-size 0x6000000 --att-simd-select 0x3 --att-parse testing
|
||||
--att-serialize-all 1 -d ${CMAKE_CURRENT_BINARY_DIR}/%argt%-trace/cmd_input -o
|
||||
out --output-format json ${PRELOAD_ARGS} --
|
||||
$<TARGET_FILE:hsa_code_object_testapp>)
|
||||
|
||||
set_tests_properties(
|
||||
rocprofv3-test-hsa-multiqueue-att-cmd-env-ld-lib-path-execute
|
||||
PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT
|
||||
LD_LIBRARY_PATH=${CMAKE_BINARY_DIR}/lib:$ENV{LD_LIBRARY_PATH}
|
||||
FAIL_REGULAR_EXPRESSION "HSA_API|HIP_API")
|
||||
# hsa multiqueue dependency test
|
||||
add_test(
|
||||
NAME rocprofv3-test-hsa-multiqueue-att-cmd-env-att-lib-path-execute
|
||||
COMMAND
|
||||
$<TARGET_FILE:rocprofiler-sdk::rocprofv3> --advanced-thread-trace 1
|
||||
--att-target-cu 1 --att-shader-engine-mask 0x11 --kernel-include-regex copyD
|
||||
--att-buffer-size 0x6000000 --att-simd-select 0x3 --att-parse testing
|
||||
--att-serialize-all 1 -d ${CMAKE_CURRENT_BINARY_DIR}/%argt%-trace/cmd_input -o
|
||||
out --output-format json ${PRELOAD_ARGS} --
|
||||
$<TARGET_FILE:hsa_code_object_testapp>)
|
||||
|
||||
set_tests_properties(
|
||||
rocprofv3-test-hsa-multiqueue-att-cmd-env-att-lib-path-execute
|
||||
PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT
|
||||
ATT_LIBRARY_PATH=${CMAKE_BINARY_DIR}/lib FAIL_REGULAR_EXPRESSION
|
||||
"HSA_API|HIP_API")
|
||||
|
||||
# hsa multiqueue dependency test
|
||||
add_test(
|
||||
NAME rocprofv3-test-hsa-multiqueue-att-json-execute
|
||||
COMMAND
|
||||
$<TARGET_FILE:rocprofiler-sdk::rocprofv3> --att-library-path
|
||||
${CMAKE_BINARY_DIR}/lib -d ${CMAKE_CURRENT_BINARY_DIR}/%argt%-trace/json_input -i
|
||||
${CMAKE_CURRENT_BINARY_DIR}/att_input.json ${PRELOAD_ARGS} --
|
||||
$<TARGET_FILE:hsa_code_object_testapp>)
|
||||
|
||||
set_tests_properties(
|
||||
rocprofv3-test-hsa-multiqueue-att-json-execute
|
||||
PROPERTIES TIMEOUT 45 LABELS "integration-tests" FAIL_REGULAR_EXPRESSION
|
||||
"HSA_API|HIP_API")
|
||||
|
||||
add_test(
|
||||
NAME rocprofv3-test-hsa-multiqueue-att-cmd-validate
|
||||
COMMAND
|
||||
${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --input
|
||||
${CMAKE_CURRENT_BINARY_DIR}/hsa_code_object_testapp-trace/cmd_input/out_results.json
|
||||
--code-object-input ${CMAKE_CURRENT_BINARY_DIR} --output-path
|
||||
${CMAKE_CURRENT_BINARY_DIR}/hsa_code_object_testapp-trace/cmd_input)
|
||||
|
||||
add_test(
|
||||
NAME rocprofv3-test-hsa-multiqueue-att-json-validate
|
||||
COMMAND
|
||||
${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --input
|
||||
${CMAKE_CURRENT_BINARY_DIR}/hsa_code_object_testapp-trace/json_input/out_results.json
|
||||
--code-object-input ${CMAKE_CURRENT_BINARY_DIR} --output-path
|
||||
${CMAKE_CURRENT_BINARY_DIR}/hsa_code_object_testapp-trace/json_input)
|
||||
|
||||
set(MULTIQUEUE_CMD_VALIDATION_FILES
|
||||
${CMAKE_CURRENT_BINARY_DIR}/hsa_code_object_testapp-trace/cmd_input/out_results.json)
|
||||
|
||||
set(MULTIQUEUE_JSON_VALIDATION_FILES
|
||||
${CMAKE_CURRENT_BINARY_DIR}/hsa_code_object_testapp-trace/json_input/pass_1/out_results.json
|
||||
)
|
||||
|
||||
set_tests_properties(
|
||||
rocprofv3-test-hsa-multiqueue-att-cmd-validate
|
||||
PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS
|
||||
"rocprofv3-test-hsa-multiqueue-att-cmd-ld-lib-path-execute"
|
||||
FAIL_REGULAR_EXPRESSION "AssertionError")
|
||||
|
||||
set_tests_properties(
|
||||
rocprofv3-test-hsa-multiqueue-att-json-validate
|
||||
PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS
|
||||
"rocprofv3-test-hsa-multiqueue-att-json-execute" FAIL_REGULAR_EXPRESSION
|
||||
"AssertionError")
|
||||
@@ -0,0 +1,20 @@
|
||||
{
|
||||
"jobs": [
|
||||
{
|
||||
"kernel_include_regex": "copyD",
|
||||
"kernel_exclude_regex": "",
|
||||
"output_file": "out",
|
||||
"output_format": [
|
||||
"json"
|
||||
],
|
||||
"truncate_kernels": true,
|
||||
"advanced_thread_trace": true,
|
||||
"att_parse" : "testing",
|
||||
"att_target_cu" : 1,
|
||||
"att_shader_engine_mask" : "0x11",
|
||||
"att_simd_select": "0x3",
|
||||
"att_buffer_size": "0x6000000",
|
||||
"att_perfcounters": "SQ_WAVES:0x1 SQ_INSTS_VALU:0x3 SQ_INSTS_SALU:0xF"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,59 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import csv
|
||||
import pytest
|
||||
import json
|
||||
|
||||
from rocprofiler_sdk.pytest_utils.dotdict import dotdict
|
||||
from rocprofiler_sdk.pytest_utils import collapse_dict_list
|
||||
from rocprofiler_sdk.pytest_utils.perfetto_reader import PerfettoReader
|
||||
|
||||
import re
|
||||
import os
|
||||
|
||||
|
||||
def pytest_addoption(parser):
|
||||
|
||||
parser.addoption(
|
||||
"--input",
|
||||
action="store",
|
||||
help="Path to JSON file.",
|
||||
)
|
||||
parser.addoption(
|
||||
"--code-object-input",
|
||||
action="store",
|
||||
help="Path to code object file.",
|
||||
)
|
||||
parser.addoption(
|
||||
"--output-path",
|
||||
action="store",
|
||||
help="Output Path.",
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def json_data(request):
|
||||
filename = request.config.getoption("--input")
|
||||
with open(filename, "r") as inp:
|
||||
return dotdict(collapse_dict_list(json.load(inp)))
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def output_path(request):
|
||||
return request.config.getoption("--output-path")
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def code_object_file_path(request):
|
||||
file_path = request.config.getoption("--code-object-input")
|
||||
# hsa_file_load = re.compile(".*copy.hsaco$")
|
||||
code_object_files = {}
|
||||
code_object_memory = []
|
||||
hsa_memory_load_pattern = "gfx[a-z0-9]+_copy_memory.hsaco"
|
||||
for root, dirs, files in os.walk(file_path, topdown=True):
|
||||
for file in files:
|
||||
filename = os.path.join(root, file)
|
||||
if re.search(hsa_memory_load_pattern, filename):
|
||||
code_object_memory.append(filename)
|
||||
code_object_files["hsa_memory_load"] = code_object_memory
|
||||
return code_object_files
|
||||
@@ -0,0 +1,5 @@
|
||||
|
||||
[pytest]
|
||||
addopts = --durations=20 -rA -s -vv
|
||||
testpaths = validate.py
|
||||
pythonpath = @ROCPROFILER_SDK_TESTS_BINARY_DIR@/pytest-packages
|
||||
@@ -0,0 +1,47 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import sys
|
||||
import pytest
|
||||
import re
|
||||
import os
|
||||
|
||||
|
||||
def test_json_data(json_data):
|
||||
data = json_data["rocprofiler-sdk-tool"]
|
||||
strings = data["strings"]
|
||||
assert "att_filenames" in strings.keys()
|
||||
att_files = data["strings"]["att_filenames"]
|
||||
assert len(att_files) > 0
|
||||
|
||||
|
||||
def test_code_object_memory(code_object_file_path, json_data, output_path):
|
||||
|
||||
data = json_data["rocprofiler-sdk-tool"]
|
||||
tool_memory_load = data["strings"]["code_object_snapshot_filenames"]
|
||||
gfx_pattern = "gfx[a-z0-9]+"
|
||||
match = re.search(gfx_pattern, tool_memory_load[0])
|
||||
assert match != None
|
||||
gpu_name = match.group(0)
|
||||
tool_memory_load_1 = open(os.path.join(output_path, tool_memory_load[0]), "rb")
|
||||
tool_memory_load_2 = open(os.path.join(output_path, tool_memory_load[1]), "rb")
|
||||
found = False
|
||||
for hsa_file in code_object_file_path["hsa_memory_load"]:
|
||||
|
||||
m = re.search(gfx_pattern, hsa_file)
|
||||
assert m != None
|
||||
gpu = m.group(0)
|
||||
|
||||
if gpu == gpu_name:
|
||||
found = True
|
||||
hsa_memory_load = open(hsa_file, "rb")
|
||||
hsa_memory_fs = hsa_memory_load.read()
|
||||
tool_memory_fs_1 = tool_memory_load_1.read()
|
||||
tool_memory_fs_2 = tool_memory_load_2.read()
|
||||
assert hsa_memory_fs == tool_memory_fs_2 or hsa_memory_fs == tool_memory_fs_1
|
||||
break
|
||||
assert found == True
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
exit_code = pytest.main(["-x", __file__] + sys.argv[1:])
|
||||
sys.exit(exit_code)
|
||||
Reference in New Issue
Block a user