Adding att v3 support (#84)

* Adding att v3 support

* misc fix

* bug fix

* Python linting workflow and rules

* fix regex

* Adding temporary args

* fix temporary args

* fix format

* remove att_perfcounters from test input

* Review comments (#163)

Co-authored-by: Giovanni Baraldi <gbaraldi@amd.com>

* Revert "Review comments (#163)"

This reverts commit 9ef0f8e5a4489d5581255e1b70ced2aef5c1c1d0.

* Address review comments 2

* review changes

* review comments

* review

* cmake alias

* review

* review

* review

* review

* Enabling percounter in v3 script

* review

* formatting

* formatting

---------

Co-authored-by: Jonathan R. Madsen <jonathanrmadsen@gmail.com>
Co-authored-by: Baraldi, Giovanni <Giovanni.Baraldi@amd.com>
Co-authored-by: Giovanni Baraldi <gbaraldi@amd.com>
This commit is contained in:
Nagaraj, Sriraksha
2025-02-04 04:05:38 -06:00
committed by GitHub
parent 72a27feb04
commit d4a51e4102
32 changed files with 2068 additions and 43 deletions
+16
View File
@@ -0,0 +1,16 @@
[flake8]
max-line-length =
90
ignore =
E203,
E501,
W503,
W605,
exclude =
.git,
__pycache__,
scripts,
external,
tests,
build,
build-*
+39
View File
@@ -0,0 +1,39 @@
name: Python Linting
on:
workflow_dispatch:
push:
branches: [ amd-staging, amd-mainline ]
paths:
- '**.py'
pull_request:
paths:
- '**.py'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
linting:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.8', '3.10', '3.12']
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 source --count --show-source --statistics --select=E9,F63,F7,F82
# flake8 options are defined in .flake8
flake8 source --count --show-source --statistics
+1
View File
@@ -4,6 +4,7 @@ clang-tidy>=15.0.0,<19.0.0
cmake>=3.21.0
cmake-format
dataclasses
flake8
numpy
otf2
pandas
+272 -2
View File
@@ -26,6 +26,7 @@ import os
import sys
import argparse
import subprocess
import re
class dotdict(dict):
@@ -87,6 +88,95 @@ def strtobool(val):
raise ValueError(f"invalid truth value {val} (type={val_type})")
def search_path(path_list):
supported_option = []
lib_att_pattern = r"libatt_decoder_(trace|summary|debug|testing)\.so"
file_list = []
for path in path_list:
for root, dirs, files in os.walk(path, topdown=True):
file_list.extend(files)
break
for itr in file_list:
_match = re.match(lib_att_pattern, itr)
if _match:
lst = re.findall("trace|debug|summary|testing", itr)
supported_option.extend(lst)
return set(supported_option)
def check_att_capability(args):
path = []
ROCPROFV3_DIR = os.path.dirname(os.path.realpath(__file__))
ROCM_DIR = os.path.dirname(ROCPROFV3_DIR)
support_input = {}
tmp_parser = argparse.ArgumentParser(add_help=False)
tmp_parser.add_argument(
"--att-library-path",
default=os.environ.get(
"ATT_LIBRARY_PATH", os.environ.get("LD_LIBRARY_PATH", None)
),
type=str,
required=False,
)
tmp_parser.add_argument(
"-i",
"--input",
default=None,
type=str,
required=False,
)
tmp_data = {}
att_args, unparsed_args = tmp_parser.parse_known_args(args)
tmp_keys = list(att_args.__dict__.keys())
for itr in tmp_keys:
if has_set_attr(att_args, itr):
tmp_data[itr] = getattr(att_args, itr)
data = dotdict(tmp_data)
if data.input:
# If index of a pass in input file is a key in the support_input dict, then that pass has att-library-path arg
args_list = parse_input(data.input)
for index, itr in enumerate(args_list):
if itr.att_library_path:
library_path = []
if ":" in itr.att_library_path:
library_path.extend(itr.att_library_path.split(":"))
else:
library_path.append(itr.att_library_path)
support = search_path(library_path)
# If the att-library-path in the input file for a pass is valid, then the value of index key in the dict, support_input, is updated to that valid path
if support:
support_input[index] = set(support)
else:
# If the att-library-path in the input file for a pass is invalid, then the value of index key in the dict, support_input, is empty
support_input[index] = []
if data.att_library_path:
if ":" in data.att_library_path:
path.extend(data.att_library_path.split(":"))
else:
path.append(data.att_library_path)
else:
path.append(f"{ROCM_DIR}/lib")
path.append(f"{ROCM_DIR}/lib64")
support = search_path(set(path))
if support:
if len(path) == 1:
os.environ["ATT_LIBRARY_PATH"] = path[0]
os.environ["ROCPROF_ATT_LIBRARY_PATH"] = path[0]
else:
os.environ["ATT_LIBRARY_PATH"] = ":".join(path)
os.environ["ROCPROF_ATT_LIBRARY_PATH"] = ":".join(path)
return support, support_input
return None, support_input
class booleanArgAction(argparse.Action):
def __call__(self, parser, args, value, option_string=None):
setattr(args, self.dest, strtobool(value))
@@ -473,6 +563,14 @@ For MPI applications (or other job launchers such as SLURM), place rocprofv3 ins
default=os.environ.get("ROCPROF_PRELOAD", "").split(":"),
nargs="*",
)
advanced_options.add_argument(
"--att-library-path",
default=os.environ.get(
"ATT_LIBRARY_PATH", os.environ.get("LD_LIBRARY_PATH", None)
),
help="ATT library path to find decoder library",
)
# below is available for CI because LD_PRELOADing a library linked to a sanitizer library
# causes issues in apps where HIP is part of shared library.
add_parser_bool_argument(
@@ -494,7 +592,66 @@ For MPI applications (or other job launchers such as SLURM), place rocprofv3 ins
app_args = args[(idx + 1) :]
break
return (parser.parse_args(rocp_args), app_args)
supported_list, is_support_input = check_att_capability(rocp_args)
if supported_list or len(is_support_input) != 0:
choice_list = []
for keys, values in is_support_input.items():
choice_list.extend(values)
if supported_list:
choice_list.extend(list(supported_list))
att_options = parser.add_argument_group("Advanced Thread Trace")
add_parser_bool_argument(
att_options,
"--advanced-thread-trace",
help="Enable ATT",
)
att_options.add_argument(
"--att-target-cu",
help="ATT target compute unit",
default=None,
type=int,
)
att_options.add_argument(
"--att-simd-select",
help="Select ATT SIMD",
default=None,
type=str,
)
att_options.add_argument(
"--att-buffer-size",
help="Buffer Size",
default=None,
type=str,
)
att_options.add_argument(
"--att-shader-engine-mask",
help="att shader engine mask",
default=None,
type=str,
)
att_options.add_argument(
"--att-parse",
type=str.lower,
default=None,
help="Select ATT Parse method from the choices",
choices=set(choice_list),
)
add_parser_bool_argument(
att_options,
"--att-serialize-all",
default=False,
help="Serialize all kernels",
)
return (parser.parse_args(rocp_args), app_args, supported_list, is_support_input)
def parse_yaml(yaml_file):
@@ -1048,6 +1205,77 @@ def run(app_args, args, **kwargs):
update_env("ROCPROF_PC_SAMPLING_METHOD", args.pc_sampling_method)
update_env("ROCPROF_PC_SAMPLING_INTERVAL", args.pc_sampling_interval)
if args.advanced_thread_trace:
def int_auto(num_str):
if "0x" in num_str:
return int(num_str, 16)
else:
return int(num_str, 10)
if args.pmc or (
args.pc_sampling_beta_enabled
or args.pc_sampling_unit
or args.pc_sampling_method
or args.pc_sampling_interval
):
fatal_error(
"Advanced thread trace cannot be enabled with counter collection or pc sampling"
)
if not args.att_parse:
fatal_error("provide the parser choice")
update_env("ROCPROF_ADVANCED_THREAD_TRACE", True, overwrite=True)
update_env("ROCPROF_ATT_CAPABILITY", args.att_parse, overwrite=True)
if args.att_target_cu:
update_env("ROCPROF_ATT_PARAM_TARGET_CU", args.att_target_cu, overwrite=True)
if args.att_shader_engine_mask:
update_env(
"ROCPROF_ATT_PARAM_SHADER_ENGINE_MASK",
int_auto(args.att_shader_engine_mask),
overwrite=True,
)
if args.att_buffer_size:
update_env(
"ROCPROF_ATT_PARAM_BUFFER_SIZE",
int_auto(args.att_buffer_size),
overwrite=True,
)
if args.att_simd_select:
update_env(
"ROCPROF_ATT_PARAM_SIMD_SELECT",
int_auto(args.att_simd_select),
overwrite=True,
)
if args.att_serialize_all:
update_env(
"ROCPROF_ATT_PARAM_SERIALIZE_ALL",
args.att_serialize_all,
overwrite=True,
)
if args.att_library_path:
update_env(
"ROCPROF_ATT_LIBRARY_PATH",
args.att_library_path,
overwrite=True,
)
update_env(
"ATT_LIBRARY_PATH",
args.att_library_path,
overwrite=True,
)
if args.att_percounters:
update_env(
"ROCPROF_ATT_PARAM_PERFCOUNTERS",
" ".join(args.att_perfcounters),
overwrite=True,
)
if use_execv:
# does not return
os.execvpe(app_args[0], app_args, env=app_env)
@@ -1061,9 +1289,45 @@ def run(app_args, args, **kwargs):
return exit_code
def check_att_path_parse_method(args, index, support_att_input, att_parse_supported):
if not att_parse_supported:
if index not in support_att_input.keys():
fatal_error(
f"Advanced_thread_trace enabled but no decoder library found in cmdline/env paths and att_library_path not set for pass-{index + 1}"
)
elif not support_att_input[index]:
fatal_error(
f"Advanced_thread_trace enabled but no decoder library found in att_library_path for pass-{index + 1}"
)
else:
if args.att_parse and args.att_parse not in support_att_input[index]:
fatal_error(
f"Advanced_thread_trace enabled but decoder library for requested parse method not found in att_library_path for pass-{index + 1}"
)
else:
if index in support_att_input.keys() and not support_att_input[index]:
fatal_error(
f"Advanced_thread_trace enabled but no decoder library found in att_library_path for pass-{index + 1}"
)
elif index not in support_att_input.keys():
if args.att_parse and args.att_parse not in att_parse_supported:
fatal_error(
"Advanced_thread_trace enabled but decoder library for requested parse method not found"
)
else:
if args.att_parse and args.att_parse not in support_att_input[index]:
fatal_error(
f"Advanced_thread_trace enabled but decoder library for requested parse method not found for pass-{index + 1}"
)
def main(argv=None):
cmd_args, app_args = parse_arguments(argv)
# att_parse_supported is valid path for decoder in env or commandline arg
# support_att_input is a dict, where key is a pass index with value being a valid decoder path
cmd_args, app_args, att_parse_supported, support_att_input = parse_arguments(argv)
inp_args = (
parse_input(cmd_args.input) if getattr(cmd_args, "input") else [dotdict({})]
)
@@ -1073,10 +1337,16 @@ def main(argv=None):
pass_idx = None
if has_set_attr(args, "pmc") and len(args.pmc) > 0:
pass_idx = 1
if args.advanced_thread_trace:
check_att_path_parse_method(args, 0, support_att_input, att_parse_supported)
run(app_args, args, pass_id=pass_idx)
else:
for idx, itr in enumerate(inp_args):
args = get_args(cmd_args, itr)
if args.advanced_thread_trace:
check_att_path_parse_method(
args, idx, support_att_input, att_parse_supported
)
run(
app_args,
args,
-1
View File
@@ -23,7 +23,6 @@
# THE SOFTWARE.
import ctypes
import pathlib
import os
import io
import csv
+5
View File
@@ -805,6 +805,11 @@ Properties
- **``pc_sampling_interval``** *(integer)*: pc sampling interval.
- **``pc-sampling-beta-enabled``** *(boolean)*: enable pc
sampling support; beta version.
- **``att_filenames``** *(object)*
- **``key``** *(integer)*: Dispatch id.
- **``value``** *(array)*: An array of ATT filenames.
- **``code_object_snapshot_filenames``** *(array)*: Code
object snapshot filename.
.. code-block:: shell
+18
View File
@@ -567,6 +567,24 @@
"pc_sample_comments": {
"type": "array",
"description": "Comments matching assembly instructions from pc_sample_instructions array. If debug symbols are available, comments provide instructions to source-line mapping. Otherwise, a comment is an empty string."
},
"att_filenames" : {
"type": "object",
"properties": {
"key": {
"type": "integer",
"description": "Dispatch id."
},
"value": {
"type": "array",
"description": "An array of ATT filenames."
}
}
},
"code_object_snapshot_filenames": {
"type": "array",
"description": "Code object snapshot filename"
}
}
},
+16 -6
View File
@@ -25,6 +25,7 @@
#include "statistics.hpp"
#include "timestamps.hpp"
#include "lib/common/filesystem.hpp"
#include "lib/common/string_entry.hpp"
#include "lib/common/utility.hpp"
@@ -37,6 +38,7 @@ namespace rocprofiler
{
namespace tool
{
namespace fs = common::filesystem;
json_output::json_output(const output_config& cfg,
std::string_view filename,
JSONOutputArchive::Options _opts)
@@ -115,11 +117,18 @@ write_json(json_output& json_ar,
json_ar(cereal::make_nvp("counters", tool_metadata.get_counter_info()));
{
auto callback_name_info = tool_metadata.callback_names;
auto buffer_name_info = tool_metadata.buffer_names;
auto counter_dims = tool_metadata.get_counter_dimension_info();
auto marker_msg_data = tool_metadata.marker_messages.get();
auto callback_name_info = tool_metadata.callback_names;
auto buffer_name_info = tool_metadata.buffer_names;
auto counter_dims = tool_metadata.get_counter_dimension_info();
auto marker_msg_data = tool_metadata.marker_messages.get();
auto code_object_load_info = tool_metadata.get_code_object_load_info();
auto att_filenames = tool_metadata.get_att_filenames();
auto code_object_snapshot_filenames = std::vector<std::string>{};
code_object_snapshot_filenames.reserve(code_object_load_info.size());
for(auto info : code_object_load_info)
{
code_object_snapshot_filenames.emplace_back(fs::path(info.name).filename());
}
json_ar.setNextName("strings");
json_ar.startNode();
json_ar(cereal::make_nvp("callback_records", callback_name_info));
@@ -131,7 +140,8 @@ write_json(json_output& json_ar,
json_ar(
cereal::make_nvp("pc_sample_instructions", tool_metadata.get_pc_sample_instructions()));
json_ar(cereal::make_nvp("pc_sample_comments", tool_metadata.get_pc_sample_comments()));
json_ar(cereal::make_nvp("att_filenames", att_filenames));
json_ar(cereal::make_nvp("code_object_snapshot_filenames", code_object_snapshot_filenames));
{
auto _extern_corr_id_strings = std::map<size_t, std::string>{};
if(cfg.kernel_rename)
+32
View File
@@ -22,12 +22,15 @@
#include "metadata.hpp"
#include "lib/common/filesystem.hpp"
#include "lib/common/string_entry.hpp"
#include "lib/output/agent_info.hpp"
#include "lib/output/host_symbol_info.hpp"
#include "lib/output/kernel_symbol_info.hpp"
#include "lib/rocprofiler-sdk-att/att_lib_wrapper.hpp"
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/cxx/details/tokenize.hpp>
#include <memory>
#include <vector>
@@ -36,6 +39,7 @@ namespace rocprofiler
{
namespace tool
{
namespace fs = common::filesystem;
namespace
{
rocprofiler_status_t
@@ -218,6 +222,34 @@ metadata::get_code_object(uint64_t code_obj_id) const
});
}
code_object_load_info_vec_t
metadata::get_code_object_load_info() const
{
auto _data = code_object_load.rlock([](const auto& _data_v) {
auto _info = std::vector<rocprofiler::att_wrapper::CodeobjLoadInfo>{};
_info.reserve(_data_v.size());
for(const auto& itr : _data_v)
_info.emplace_back(itr);
return _info;
});
return _data;
}
std::vector<std::string>
metadata::get_att_filenames() const
{
auto data = std::vector<std::string>{};
for(auto filenames : att_filenames)
{
for(auto file : filenames.second.second)
{
data.emplace_back(fs::path(file).filename());
}
}
return data;
}
const kernel_symbol_info*
metadata::get_kernel_symbol(uint64_t kernel_id) const
{
+18 -10
View File
@@ -32,6 +32,7 @@
#include "lib/common/demangle.hpp"
#include "lib/common/logging.hpp"
#include "lib/common/synchronized.hpp"
#include "lib/rocprofiler-sdk-att/att_lib_wrapper.hpp"
#include <rocprofiler-sdk/agent.h>
#include <rocprofiler-sdk/buffer_tracing.h>
@@ -74,8 +75,11 @@ using marker_message_ordered_map_t = std::map<uint64_t, std::string>;
using string_entry_map_t = std::unordered_map<size_t, std::unique_ptr<std::string>>;
using counter_dimension_vec_t = std::vector<rocprofiler_record_dimension_info_t>;
using external_corr_id_set_t = std::unordered_set<uint64_t>;
using code_obj_decoder_t = rocprofiler::sdk::codeobj::disassembly::CodeobjAddressTranslate;
using instruction_t = rocprofiler::sdk::codeobj::disassembly::Instruction;
using code_obj_decoder_t = rocprofiler::sdk::codeobj::disassembly::CodeobjAddressTranslate;
using instruction_t = rocprofiler::sdk::codeobj::disassembly::Instruction;
using att_agent_filenames_t = std::pair<rocprofiler_agent_id_t, std::vector<std::string>>;
using att_filenames_map_t = std::unordered_map<rocprofiler_dispatch_id_t, att_agent_filenames_t>;
using code_object_load_info_vec_t = std::vector<rocprofiler::att_wrapper::CodeobjLoadInfo>;
template <typename Tp>
using synced_map = common::Synchronized<Tp, true>;
@@ -94,14 +98,16 @@ struct metadata
agent_counter_info_map_t agent_counter_info = {};
agent_pc_sample_config_info_map_t agent_pc_sample_config_info = {};
sdk::buffer_name_info buffer_names = {};
sdk::callback_name_info callback_names = {};
synced_map<code_object_data_map_t> code_objects = {};
synced_map<kernel_symbol_data_map_t> kernel_symbols = {};
synced_map<marker_message_map_t> marker_messages = {};
synced_map<string_entry_map_t> string_entries = {};
synced_map<external_corr_id_set_t> external_corr_ids = {};
synced_map<host_function_info_map_t> host_functions = {};
sdk::buffer_name_info buffer_names = {};
sdk::callback_name_info callback_names = {};
synced_map<code_object_data_map_t> code_objects = {};
synced_map<kernel_symbol_data_map_t> kernel_symbols = {};
synced_map<marker_message_map_t> marker_messages = {};
synced_map<string_entry_map_t> string_entries = {};
synced_map<external_corr_id_set_t> external_corr_ids = {};
synced_map<host_function_info_map_t> host_functions = {};
synced_map<code_object_load_info_vec_t> code_object_load = {};
att_filenames_map_t att_filenames = {};
metadata() = default;
metadata(inprocess);
@@ -122,6 +128,7 @@ struct metadata
const tool_counter_info* get_counter_info(rocprofiler_counter_id_t id) const;
const counter_dimension_info_vec_t* get_counter_dimension_info(uint64_t instance_id) const;
std::vector<std::string> get_att_filenames() const;
code_object_data_vec_t get_code_objects() const;
kernel_symbol_data_vec_t get_kernel_symbols() const;
host_function_data_vec_t get_host_symbols() const;
@@ -135,6 +142,7 @@ struct metadata
std::string_view get_comment(int64_t index) const { return instruction_comment.at(index); }
int64_t get_instruction_index(rocprofiler_pc_t record);
void add_decoder(rocprofiler_code_object_info_t* obj_data_v);
code_object_load_info_vec_t get_code_object_load_info() const;
template <typename Tp>
Tp get_marker_messages(Tp&&);
@@ -29,6 +29,7 @@ find_package(
lib/cmake/amd_comgr)
add_library(rocprofiler-sdk-att-parser STATIC)
add_library(rocprofiler-sdk::rocprofiler-sdk-att-parser ALIAS rocprofiler-sdk-att-parser)
target_sources(rocprofiler-sdk-att-parser PRIVATE ${ATT_TOOL_SOURCE_FILES})
target_link_libraries(
@@ -86,8 +86,8 @@ get_shader_id(const std::string& name)
auto run_pos = name.rfind('_');
if(run_pos == std::string::npos) throw std::runtime_error("Invalid name");
std::string_view stripped = name.substr(0, run_pos);
auto se_number_pos = stripped.rfind('_');
std::string stripped = name.substr(0, run_pos);
auto se_number_pos = stripped.rfind('_');
if(se_number_pos == std::string::npos || se_number_pos + 1 >= stripped.size())
throw std::runtime_error("Invalid name");
@@ -97,15 +97,11 @@ get_shader_id(const std::string& name)
std::vector<tool_att_capability_t>
query_att_decode_capability()
{
std::vector<tool_att_capability_t> ret;
auto ret = std::vector<tool_att_capability_t>{};
for(auto& [cap, libname] : get_lib_names())
{
if(auto handle = dlopen(libname, RTLD_NOW | RTLD_LOCAL))
{
dlclose(handle);
ret.push_back(cap);
}
if(DL(libname).handle != 0) ret.push_back(cap);
}
return ret;
+27 -3
View File
@@ -20,12 +20,19 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "dl.hpp"
#include "lib/rocprofiler-sdk-att/dl.hpp"
#include "lib/common/environment.hpp"
#include "lib/common/filesystem.hpp"
#include "lib/common/logging.hpp"
#include <rocprofiler-sdk/cxx/details/tokenize.hpp>
#include <dlfcn.h>
#include <atomic>
#include <cassert>
#include <cstdlib>
#include <mutex>
#include <set>
#include <stdexcept>
#include <string>
@@ -33,9 +40,26 @@ namespace rocprofiler
{
namespace att_wrapper
{
DL::DL(const char* dlname)
namespace fs = ::rocprofiler::common::filesystem;
fs::path
get_search_path(std::string path_name)
{
handle = dlopen(dlname, RTLD_NOW | RTLD_LOCAL);
if(fs::exists(path_name)) return fs::path(path_name);
return "";
}
DL::DL(const char* libname)
{
auto paths = rocprofiler::common::get_env("ROCPROF_ATT_LIBRARY_PATH", "");
if(paths.empty()) return;
auto path_set = rocprofiler::sdk::parse::tokenize(paths, ":");
for(auto&& name : path_set)
{
handle = dlopen((get_search_path(name) / libname).string().c_str(), RTLD_LAZY | RTLD_LOCAL);
if(handle) break;
}
if(!handle) return;
att_parse_data_fn =
+1 -1
View File
@@ -35,7 +35,7 @@ class DL
using StatusFn = decltype(rocprofiler_att_decoder_get_status_string);
public:
DL(const char* dlname);
DL(const char* libname);
~DL();
ParseFn* att_parse_data_fn = nullptr;
@@ -6,14 +6,15 @@ project(rocprofiler-att-parser-tests LANGUAGES CXX)
add_executable(att-parser-tool-v3)
target_link_libraries(
att-parser-tool-v3
PRIVATE rocprofiler-sdk-att-parser rocprofiler-sdk::rocprofiler-sdk-json
PRIVATE rocprofiler-sdk::rocprofiler-sdk-att-parser
rocprofiler-sdk::rocprofiler-sdk-json
rocprofiler-sdk::rocprofiler-sdk-common-library)
target_sources(att-parser-tool-v3 PRIVATE standalone_tool_main.cpp)
add_executable(att-decoder-test)
target_link_libraries(
att-decoder-test
PRIVATE rocprofiler-sdk-att-parser
PRIVATE rocprofiler-sdk::rocprofiler-sdk-att-parser
rocprofiler-sdk::rocprofiler-sdk-json
rocprofiler-sdk::rocprofiler-sdk-common-library
rocprofiler-sdk::rocprofiler-sdk-glog
@@ -22,6 +23,7 @@ target_link_libraries(
GTest::gtest_main)
target_sources(att-decoder-test PRIVATE att_decoder_test.cpp)
set(env-att-lib "ROCPROF_ATT_LIBRARY_PATH=${CMAKE_CURRENT_BINARY_DIR}/../lib")
add_library(att_decoder_testing SHARED)
target_sources(att_decoder_testing PRIVATE dummy_decoder.cpp)
@@ -31,6 +33,7 @@ if(NOT ROCPROFILER_MEMCHECK)
SOURCES att_decoder_test.cpp
TEST_LIST att-decoder-test_TESTS
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
set_tests_properties(${att-decoder-test_TESTS} PROPERTIES TIMEOUT 10 LABELS
"unittests")
set_tests_properties(
${att-decoder-test_TESTS} PROPERTIES ENVIRONMENT "${env-att-lib}" TIMEOUT 10
LABELS "unittests")
endif()
@@ -6,7 +6,7 @@ project(rocprofiler-att-parser-waitcnt-tests LANGUAGES CXX)
add_executable(att-decoder-waitcnt-test)
target_link_libraries(
att-decoder-waitcnt-test
PRIVATE rocprofiler-sdk-att-parser
PRIVATE rocprofiler-sdk::rocprofiler-sdk-att-parser
rocprofiler-sdk::rocprofiler-sdk-json
rocprofiler-sdk::rocprofiler-sdk-common-library
rocprofiler-sdk::rocprofiler-sdk-glog
@@ -23,7 +23,8 @@ target_link_libraries(
rocprofiler-sdk::rocprofiler-sdk-perfetto
rocprofiler-sdk::rocprofiler-sdk-otf2
rocprofiler-sdk::rocprofiler-sdk-dw
rocprofiler-sdk::rocprofiler-sdk-amd-comgr)
rocprofiler-sdk::rocprofiler-sdk-amd-comgr
rocprofiler-sdk::rocprofiler-sdk-att-parser)
set_target_properties(
rocprofiler-sdk-tool
@@ -144,6 +144,68 @@ get_kernel_filter_range(const std::string& kernel_filter)
return range_set;
}
std::vector<att_perfcounter>
parse_att_counters(std::string line)
{
auto counters = std::vector<att_perfcounter>{};
if(line.empty()) return counters;
// strip the comment
if(auto pos = line.find('#'); pos != std::string::npos) line = line.substr(0, pos);
// trim line for any white spaces after comment strip
trim(line);
// check to see if comment stripping + trim resulted in empty line
if(line.empty()) return counters;
handle_special_chars(line);
auto extract_counter_name_and_simd_mask = [](std::string& input) {
std::string counter_name = "";
auto ret = att_perfcounter{};
size_t pos = input.find(':');
if(pos != std::string::npos)
{
ret.counter_name = input.substr(0, pos);
ret.simd_mask = std::stoi(input.substr(pos + 1), nullptr, 16);
}
else
counter_name = input;
return ret;
};
// regex to check if string is of the form "counter_name:simd_mask"
std::regex pattern(R"([a-zA-Z0-9_]+(:0x[0-9a-fA-F]+)?)");
std::set<std::string> unique_counters;
auto input_ss = std::stringstream{line};
while(true)
{
auto counter = std::string{};
input_ss >> counter;
if(counter.empty()) break;
// check if the counter string matches the pattern
if(!std::regex_match(counter, pattern))
{
ROCP_FATAL << "Invalid counter format for ATT: " << counter
<< ". Expected format : Counter_name:optional_simd_mask(hexadecimal)";
}
// Consider only those counters where combination of counter name and simd mask is unique
if(unique_counters.insert(counter).second == false) continue;
auto res = extract_counter_name_and_simd_mask(counter);
counters.emplace_back(res);
}
return counters;
}
std::set<std::string>
parse_counters(std::string line)
{
@@ -193,6 +255,8 @@ config::config()
, kernel_filter_range{get_kernel_filter_range(
get_env("ROCPROF_KERNEL_FILTER_RANGE", std::string{}))}
, counters{parse_counters(get_env("ROCPROF_COUNTERS", std::string{}))}
, att_param_perfcounters{
parse_att_counters(get_env("ROCPROF_ATT_PARAM_PERFCOUNTERS", std::string{}))}
{
if(kernel_filter_include.empty()) kernel_filter_include = std::string{".*"};
+18 -2
View File
@@ -29,6 +29,7 @@
#include "lib/common/units.hpp"
#include "lib/output/format_path.hpp"
#include "lib/output/output_config.hpp"
#include "lib/rocprofiler-sdk-att/att_lib_wrapper.hpp"
#include <rocprofiler-sdk/cxx/serialization.hpp>
@@ -63,6 +64,12 @@ get_config();
std::string
format_name(std::string_view _name, const config& = get_config<>());
struct att_perfcounter
{
std::string counter_name = {};
uint32_t simd_mask = 0xf;
};
struct config : output_config
{
using base_type = output_config;
@@ -101,13 +108,20 @@ struct config : output_config
bool list_metrics = get_env("ROCPROF_LIST_METRICS", false);
bool list_metrics_output_file = get_env("ROCPROF_OUTPUT_LIST_METRICS_FILE", false);
bool pc_sampling_host_trap = false;
bool advanced_thread_trace = get_env("ROCPROF_ADVANCED_THREAD_TRACE", false);
size_t pc_sampling_interval = get_env("ROCPROF_PC_SAMPLING_INTERVAL", 1);
bool att_serialize_all = get_env("ROCPROF_ATT_PARAM_SERIALIZE_ALL", false);
rocprofiler_pc_sampling_method_t pc_sampling_method_value = ROCPROFILER_PC_SAMPLING_METHOD_NONE;
rocprofiler_pc_sampling_unit_t pc_sampling_unit_value = ROCPROFILER_PC_SAMPLING_UNIT_NONE;
std::string stats_summary_unit = get_env("ROCPROF_STATS_SUMMARY_UNITS", "nsec");
int mpi_size = get_mpi_size();
int mpi_rank = get_mpi_rank();
uint64_t att_param_shader_engine_mask =
get_env<uint64_t>("ROCPROF_ATT_PARAM_SHADER_ENGINE_MASK", 0x1);
uint64_t att_param_buffer_size = get_env<uint64_t>("ROCPROF_ATT_PARAM_BUFFER_SIZE", 0x6000000);
uint64_t att_param_simd_select = get_env<uint64_t>("ROCPROF_ATT_PARAM_SIMD_SELECT", 0xF);
uint64_t att_param_target_cu = get_env<uint64_t>("ROCPROF_ATT_PARAM_TARGET_CU", 1);
std::string kernel_filter_include = get_env("ROCPROF_KERNEL_FILTER_INCLUDE_REGEX", ".*");
std::string kernel_filter_exclude = get_env("ROCPROF_KERNEL_FILTER_EXCLUDE_REGEX", "");
@@ -115,8 +129,10 @@ struct config : output_config
std::string pc_sampling_unit = get_env("ROCPROF_PC_SAMPLING_UNIT", "none");
std::string extra_counters_contents = get_env("ROCPROF_EXTRA_COUNTERS_CONTENTS", "");
std::unordered_set<uint32_t> kernel_filter_range = {};
std::set<std::string> counters = {};
std::unordered_set<uint32_t> kernel_filter_range = {};
std::set<std::string> counters = {};
std::string att_capability = get_env("ROCPROF_ATT_CAPABILITY", "");
std::vector<att_perfcounter> att_param_perfcounters = {};
std::queue<CollectionPeriod> collection_periods = {};
+230 -4
View File
@@ -45,8 +45,11 @@
#include "lib/output/statistics.hpp"
#include "lib/output/tmp_file.hpp"
#include "lib/output/tmp_file_buffer.hpp"
#include "lib/rocprofiler-sdk-att/att_lib_wrapper.hpp"
#include <rocprofiler-sdk/agent.h>
#include <rocprofiler-sdk/amd_detail/thread_trace_core.h>
#include <rocprofiler-sdk/amd_detail/thread_trace_dispatch.h>
#include <rocprofiler-sdk/buffer_tracing.h>
#include <rocprofiler-sdk/callback_tracing.h>
#include <rocprofiler-sdk/experimental/counters.h>
@@ -193,9 +196,10 @@ using kernel_iteration_t = std::unordered_map<rocprofiler_kernel_id_t, uint32
using kernel_rename_map_t = std::unordered_map<uint64_t, uint64_t>;
using kernel_rename_stack_t = std::stack<uint64_t>;
auto* tool_metadata = as_pointer<tool::metadata>(tool::metadata::inprocess{});
auto target_kernels = common::Synchronized<targeted_kernels_map_t>{};
auto kernel_iteration = common::Synchronized<kernel_iteration_t, true>{};
auto* tool_metadata = as_pointer<tool::metadata>(tool::metadata::inprocess{});
auto target_kernels = common::Synchronized<targeted_kernels_map_t>{};
auto kernel_iteration = common::Synchronized<kernel_iteration_t, true>{};
std::mutex att_shader_data;
thread_local auto thread_dispatch_rename = as_pointer<kernel_rename_stack_t>();
thread_local auto thread_dispatch_rename_dtor = common::scope_destructor{[]() {
@@ -238,7 +242,14 @@ is_targeted_kernel(uint64_t _kern_id)
// If the iteration range is not given then all iterations of the kernel is profiled
if(_range.empty())
return true;
{
if(!tool::get_config().advanced_thread_trace)
return true;
else
{
if(itr == 1) return true;
}
}
else if(_range.find(itr) != _range.end())
return true;
return false;
@@ -607,6 +618,75 @@ code_object_tracing_callback(rocprofiler_callback_tracing_record_t record,
{
CHECK_NOTNULL(tool_metadata)->add_decoder(obj_data);
}
if(obj_data->storage_type == ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_MEMORY &&
tool::get_config().advanced_thread_trace)
{
const char* gpu_name = tool_metadata->agents_map.at(obj_data->rocp_agent).name;
auto filename = fmt::format("{}_code_object_id_{}",
std::string(gpu_name),
std::to_string(obj_data->code_object_id));
auto output_stream = get_output_stream(tool::get_config(), filename, ".out");
std::string output_filename =
get_output_filename(tool::get_config(), filename, ".out");
// NOLINTNEXTLINE(performance-no-int-to-ptr)
output_stream.stream->write(reinterpret_cast<char*>(obj_data->memory_base),
obj_data->memory_size);
tool_metadata->code_object_load.wlock(
[](auto& data_vec,
std::string file_name,
tool::rocprofiler_code_object_info_t* obj_data_v) {
data_vec.push_back({file_name,
obj_data_v->code_object_id,
obj_data_v->load_base,
obj_data_v->load_size});
},
output_filename,
obj_data);
}
else if(obj_data->storage_type == ROCPROFILER_CODE_OBJECT_STORAGE_TYPE_FILE &&
tool::get_config().advanced_thread_trace)
{
const char* gpu_name = tool_metadata->agents_map.at(obj_data->rocp_agent).name;
auto filename = fmt::format("{}_code_object_id_{}",
std::string(gpu_name),
std::to_string(obj_data->code_object_id));
auto output_stream = get_output_stream(tool::get_config(), filename, ".out");
std::string output_filename =
get_output_filename(tool::get_config(), filename, ".out");
uint8_t* binary = nullptr;
size_t buffer_size = 0;
std::ifstream code_object_file(obj_data->uri, std::ios::binary | std::ios::ate);
if(code_object_file.good())
{
buffer_size = code_object_file.tellg();
code_object_file.seekg(0, std::ios::beg);
binary = new(std::nothrow) uint8_t[buffer_size];
if(binary &&
!code_object_file.read(reinterpret_cast<char*>(binary), buffer_size))
{
delete[] binary;
binary = nullptr;
}
}
// NOLINTBEGIN(performance-no-int-to-ptr)
output_stream.stream->write(reinterpret_cast<char*>(obj_data->memory_base),
obj_data->memory_size);
// NOLINTEND(performance-no-int-to-ptr)
tool_metadata->code_object_load.wlock(
[](auto& data_vec,
std::string file_name,
tool::rocprofiler_code_object_info_t* obj_data_v) {
data_vec.push_back({file_name,
obj_data_v->code_object_id,
obj_data_v->load_base,
obj_data_v->load_size});
},
output_filename,
obj_data);
}
}
else if(record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD)
{
@@ -879,6 +959,38 @@ get_instruction_index(rocprofiler_pc_t pc)
} // namespace
std::vector<rocprofiler_att_parameter_t>
get_att_perfcounter_params(std::vector<rocprofiler::tool::att_perfcounter>& att_perf_counters)
{
std::vector<rocprofiler_att_parameter_t> _data;
if(att_perf_counters.empty()) return _data;
static const auto gpu_agents = get_gpu_agents();
static const auto gpu_agents_counter_info = get_agent_counter_info();
for(const auto& [agent_, tool_counter_info_] : gpu_agents_counter_info)
{
for(const auto& counter_info_ : tool_counter_info_)
{
if(std::string_view(counter_info_.block) != "SQ") continue;
for(const auto& att_perf_counter : att_perf_counters)
{
if(std::string_view(counter_info_.name) == att_perf_counter.counter_name)
{
auto param = rocprofiler_att_parameter_t{};
param.type = ROCPROFILER_ATT_PARAMETER_PERFCOUNTER,
param.counter_id = counter_info_.id,
param.simd_mask = att_perf_counter.simd_mask;
_data.emplace_back(param);
}
}
}
}
return _data;
}
void
rocprofiler_pc_sampling_callback(rocprofiler_context_id_t /* context_id*/,
rocprofiler_buffer_id_t /* buffer_id*/,
@@ -920,6 +1032,55 @@ rocprofiler_pc_sampling_callback(rocprofiler_context_id_t /* context_id*/,
}
}
void
att_shader_data_callback(rocprofiler_agent_id_t agent,
int64_t se_id,
void* se_data,
size_t data_size,
rocprofiler_user_data_t userdata)
{
std::lock_guard<std::mutex> lock(att_shader_data);
std::stringstream filename;
filename << fmt::format("{}_shader_engine_{}_{}", agent.handle, se_id, userdata.value);
auto dispatch_id = static_cast<rocprofiler_dispatch_id_t>(userdata.value);
auto output_stream = get_output_stream(tool::get_config(), filename.str(), ".att");
std::string output_filename = get_output_filename(tool::get_config(), filename.str(), ".att");
output_stream.stream->write(reinterpret_cast<char*>(se_data), data_size);
tool_metadata->att_filenames[dispatch_id].first = agent;
tool_metadata->att_filenames[dispatch_id].second.emplace_back(output_filename);
}
rocprofiler_att_control_flags_t
att_dispatch_callback(rocprofiler_agent_id_t /* agent_id */,
rocprofiler_queue_id_t /* queue_id */,
rocprofiler_correlation_id_t /* correlation_id */,
rocprofiler_kernel_id_t kernel_id,
rocprofiler_dispatch_id_t dispatch_id,
void* /*userdata_config*/,
rocprofiler_user_data_t* userdata_shader)
{
userdata_shader->value = dispatch_id;
kernel_iteration.wlock(
[](auto& _kernel_iter, rocprofiler_kernel_id_t _kernel_id) {
auto itr = _kernel_iter.find(_kernel_id);
if(itr == _kernel_iter.end())
_kernel_iter.emplace(_kernel_id, 1);
else
{
itr->second++;
}
},
kernel_id);
if(is_targeted_kernel(kernel_id))
{
return ROCPROFILER_ATT_CONTROL_START_AND_STOP;
}
return ROCPROFILER_ATT_CONTROL_NONE;
}
void
dispatch_callback(rocprofiler_dispatch_counting_service_data_t dispatch_data,
rocprofiler_profile_config_id_t* config,
@@ -1214,6 +1375,37 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
}
}
if(tool::get_config().advanced_thread_trace)
{
auto parameters = std::vector<rocprofiler_att_parameter_t>{};
uint64_t target_cu = tool::get_config().att_param_target_cu;
uint64_t simd_select = tool::get_config().att_param_simd_select;
uint64_t buffer_sz = tool::get_config().att_param_buffer_size;
uint64_t shader_mask = tool::get_config().att_param_shader_engine_mask;
auto& att_perf = tool::get_config().att_param_perfcounters;
bool att_serialize_all = tool::get_config().att_serialize_all;
auto att_perf_params = get_att_perfcounter_params(att_perf);
parameters.insert(parameters.end(), att_perf_params.begin(), att_perf_params.end());
// TODO: att params could be different for different devices. How to support?
// Input file schema might also need to change to support multiple ATT params
parameters.push_back({ROCPROFILER_ATT_PARAMETER_TARGET_CU, {target_cu}});
parameters.push_back({ROCPROFILER_ATT_PARAMETER_SIMD_SELECT, {simd_select}});
parameters.push_back({ROCPROFILER_ATT_PARAMETER_BUFFER_SIZE, {buffer_sz}});
parameters.push_back({ROCPROFILER_ATT_PARAMETER_SHADER_ENGINE_MASK, {shader_mask}});
parameters.push_back({ROCPROFILER_ATT_PARAMETER_SERIALIZE_ALL, {att_serialize_all}});
ROCPROFILER_CALL(
rocprofiler_configure_dispatch_thread_trace_service(get_client_ctx(),
parameters.data(),
parameters.size(),
att_dispatch_callback,
att_shader_data_callback,
tool_data),
"thread trace service configure");
}
if(tool::get_config().hip_runtime_api_trace || tool::get_config().hip_compiler_api_trace)
{
ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(),
@@ -1504,6 +1696,40 @@ tool_fini(void* /*tool_data*/)
tool::generate_csv(tool::get_config(), *tool_metadata, contributions);
}
if(tool::get_config().advanced_thread_trace)
{
std::unordered_map<std::string_view, rocprofiler::att_wrapper::tool_att_capability_t>
tool_att_capability_map = {
{"testing", rocprofiler::att_wrapper::ATT_CAPABILITIES_TESTING},
{"summary", rocprofiler::att_wrapper::ATT_CAPABILITIES_SUMMARY},
{"trace", rocprofiler::att_wrapper::ATT_CAPABILITIES_TRACE},
{"debug", rocprofiler::att_wrapper::ATT_CAPABILITIES_DEBUG}};
ROCP_FATAL_IF(tool::get_config().att_capability.empty())
<< "Provide the decoder parser method as input";
auto att_capability_value = tool_att_capability_map.at(tool::get_config().att_capability);
auto decoder = rocprofiler::att_wrapper::ATTDecoder(att_capability_value);
ROCP_FATAL_IF(!decoder.valid()) << "Decoder library not found at ROCPORF_ATT_LIBRARY_PATH";
auto codeobj = tool_metadata->get_code_object_load_info();
auto output_path = tool::format_path(tool::get_config().output_path);
for(auto& [dispatch_id, att_filename_data] : tool_metadata->att_filenames)
{
std::string formats = "json,csv";
// if(tool::get_config().json_output) formats += "json,";
// if(tool::get_config().csv_output) formats += "csv,";
std::stringstream ui_name;
ui_name << fmt::format("ui_output_agent_{}_dispatch_{}",
std::to_string(att_filename_data.first.handle),
dispatch_id);
auto out_path = fmt::format("{}/{}", output_path, ui_name.str());
std::string in_path = ".";
decoder.parse(in_path, out_path, att_filename_data.second, codeobj, formats);
}
}
if(tool::get_config().json_output)
{
auto json_ar = tool::open_json(tool::get_config());
+1
View File
@@ -32,3 +32,4 @@ add_subdirectory(pc-sampling)
if(ROCPROFILER_BUILD_ROCDECODE_TESTS)
add_subdirectory(rocdecode)
endif()
add_subdirectory(hsa-code-object)
+90
View File
@@ -0,0 +1,90 @@
#
#
# HSA multi-queue dependency test
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
project(rocprofiler-tests-bin-hsa-code-object LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
find_program(
amdclangpp_EXECUTABLE REQUIRED
NAMES amdclang++
HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
PATH_SUFFIXES bin llvm/bin NO_CACHE)
function(generate_hsaco TARGET_ID INPUT_FILE OUTPUT_FILE)
separate_arguments(
CLANG_ARG_LIST
UNIX_COMMAND
"-O2 -x cl -Xclang -finclude-default-header -cl-denorms-are-zero -cl-std=CL2.0 -Wl,--build-id=sha1
-target amdgcn-amd-amdhsa -mcpu=${TARGET_ID} -o ${OUTPUT_FILE} ${INPUT_FILE}")
add_custom_command(
OUTPUT ${PROJECT_BINARY_DIR}/${OUTPUT_FILE}
COMMAND ${amdclangpp_EXECUTABLE} ${CLANG_ARG_LIST}
COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/${OUTPUT_FILE}
${CMAKE_BINARY_DIR}/tests/rocprofv3/advanced-thread-trace/${OUTPUT_FILE}
OUTPUT ${CMAKE_BINARY_DIR}/tests/rocprofv3/advanced-thread-trace/${OUTPUT_FILE}
COMMAND
${CMAKE_COMMAND} -E copy
${CMAKE_BINARY_DIR}/tests/rocprofv3/advanced-thread-trace/${OUTPUT_FILE}
${CMAKE_BINARY_DIR}/rocprofv3/advanced-thread-trace/${OUTPUT_FILE}
COMMENT "Building ${OUTPUT_FILE}...")
set(HSACO_TARGET_LIST
${HSACO_TARGET_LIST} ${PROJECT_BINARY_DIR}/${OUTPUT_FILE}
PARENT_SCOPE)
endfunction(generate_hsaco)
foreach(target_id ${GPU_TARGETS})
# generate kernel bitcodes
generate_hsaco(${target_id} ${CMAKE_CURRENT_SOURCE_DIR}/copy.cl
${target_id}_copy.hsaco)
generate_hsaco(${target_id} ${CMAKE_CURRENT_SOURCE_DIR}/copy_memory.cl
${target_id}_copy_memory.hsaco)
endforeach()
add_custom_target(generate_hsaco_targets_code_object DEPENDS ${HSACO_TARGET_LIST})
add_executable(hsa_code_object_testapp)
target_sources(hsa_code_object_testapp PRIVATE hsa_code_object_app.cpp)
target_compile_options(hsa_code_object_testapp PRIVATE -W -Wall -Wextra -Wshadow -Werror)
find_package(Threads REQUIRED)
target_link_libraries(hsa_code_object_testapp PRIVATE stdc++fs Threads::Threads)
find_package(
amd_comgr
REQUIRED
CONFIG
HINTS
${CMAKE_INSTALL_PREFIX}
PATHS
${ROCM_PATH}
PATH_SUFFIXES
lib/cmake/amd_comgr)
target_link_libraries(hsa_code_object_testapp PRIVATE amd_comgr)
find_package(rocprofiler-sdk REQUIRED)
target_link_libraries(
hsa_code_object_testapp PRIVATE rocprofiler-sdk::rocprofiler-sdk
rocprofiler-sdk::tests-common-library)
find_package(
hsa-runtime64
REQUIRED
CONFIG
HINTS
${rocm_version_DIR}
${ROCM_PATH}
PATHS
${rocm_version_DIR}
${ROCM_PATH})
target_link_libraries(hsa_code_object_testapp PRIVATE hsa-runtime64::hsa-runtime64)
add_dependencies(hsa_code_object_testapp generate_hsaco_targets_code_object)
+32
View File
@@ -0,0 +1,32 @@
/* Copyright (c) 2022 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
__kernel void copyA(__global unsigned int* a, __global unsigned int* b) {
uint tid = get_global_id(0);
a[tid] = b[tid];
}
__kernel void copyB(__global unsigned int* a, __global unsigned int* b) {
uint tid = get_global_id(0);
a[tid] = b[tid];
}
__kernel void copyC(__global unsigned int* a, __global unsigned int* b) {
uint tid = get_global_id(0);
a[tid] = b[tid];
}
+32
View File
@@ -0,0 +1,32 @@
/* Copyright (c) 2022 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
__kernel void copyD(__global unsigned int* a, __global unsigned int* b) {
uint tid = get_global_id(0);
a[tid] = b[tid];
}
__kernel void copyE(__global unsigned int* a, __global unsigned int* b) {
uint tid = get_global_id(0);
a[tid] = b[tid];
}
__kernel void copyF(__global unsigned int* a, __global unsigned int* b) {
uint tid = get_global_id(0);
a[tid] = b[tid];
}
@@ -0,0 +1,492 @@
// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
/** ROC Profiler Multi Queue Dependency Test
*
* The goal of this test is to ensure ROC profiler does not go to deadlock
* when multiple queue are created and they are dependent on each other
*
*/
#include "hsa_code_object_app.h"
enum class storage_type
{
CODE_OBJECT_STORAGE_FILE,
CODE_OBJECT_STORAGE_MEMORY
};
void
code_object_load(MQDependencyTest& obj,
storage_type type,
MQDependencyTest::CodeObject& code_object)
{
hsa_status_t status;
obj.device_discovery();
char agent_name[64];
status = hsa_agent_get_info(obj.gpu[0].agent, HSA_AGENT_INFO_NAME, agent_name);
RET_IF_HSA_ERR(status)
if(type == storage_type::CODE_OBJECT_STORAGE_FILE)
{
std::string hasco_file_path = std::string(agent_name) + std::string("_copy.hsaco");
obj.search_hasco(fs::current_path(), hasco_file_path);
if(!obj.load_code_object(hasco_file_path, obj.gpu[0].agent, code_object))
{
printf("Kernel file not found or not usable with given agent.\n");
abort();
}
}
else
{
std::string hasco_file_path = std::string(agent_name) + std::string("_copy_memory.hsaco");
obj.search_hasco(fs::current_path(), hasco_file_path);
if(!obj.load_code_object_memory(hasco_file_path, obj.gpu[0].agent, code_object))
{
abort();
}
}
}
MQDependencyTest::Kernel
get_kernel(MQDependencyTest::CodeObject& code_object,
std::string kernel_name,
MQDependencyTest& obj)
{
MQDependencyTest::Kernel copy;
if(!obj.get_kernel(code_object, kernel_name, obj.gpu[0].agent, copy))
{
printf("Test %s not found.\n", kernel_name.c_str());
abort();
}
return copy;
}
int
main()
{
hsa_status_t status;
MQDependencyTest obj;
MQDependencyTest obj_memory = {};
MQDependencyTest::CodeObject code_object = {}, code_object_memory = {};
code_object_load(obj, storage_type::CODE_OBJECT_STORAGE_FILE, code_object);
code_object_load(obj_memory, storage_type::CODE_OBJECT_STORAGE_MEMORY, code_object_memory);
MQDependencyTest::Kernel copyA = get_kernel(code_object, "copyA", obj);
MQDependencyTest::Kernel copyB = get_kernel(code_object, "copyB", obj);
MQDependencyTest::Kernel copyC = get_kernel(code_object, "copyC", obj);
MQDependencyTest::Kernel copyD = get_kernel(code_object_memory, "copyD", obj_memory);
MQDependencyTest::Kernel copyE = get_kernel(code_object_memory, "copyE", obj_memory);
MQDependencyTest::Kernel copyF = get_kernel(code_object_memory, "copyF", obj_memory);
struct args_t
{
uint32_t* a = nullptr;
uint32_t* b = nullptr;
MQDependencyTest::OCLHiddenArgs hidden = {};
};
args_t* args = static_cast<args_t*>(obj.hsa_malloc(sizeof(args_t), obj.kernarg));
*args = {};
uint32_t* a = static_cast<uint32_t*>(obj.hsa_malloc(64 * sizeof(uint32_t), obj.kernarg));
uint32_t* b = static_cast<uint32_t*>(obj.hsa_malloc(64 * sizeof(uint32_t), obj.kernarg));
memset(a, 0, 64 * sizeof(uint32_t));
memset(b, 1, 64 * sizeof(uint32_t));
args_t* args_memory =
static_cast<args_t*>(obj_memory.hsa_malloc(sizeof(args_t), obj_memory.kernarg));
*args_memory = {};
uint32_t* c =
static_cast<uint32_t*>(obj_memory.hsa_malloc(64 * sizeof(uint32_t), obj_memory.kernarg));
uint32_t* d =
static_cast<uint32_t*>(obj_memory.hsa_malloc(64 * sizeof(uint32_t), obj_memory.kernarg));
memset(c, 0, 64 * sizeof(uint32_t));
memset(d, 1, 64 * sizeof(uint32_t));
// Create queue in gpu agent and prepare a kernel dispatch packet
hsa_queue_t* queue1 = nullptr;
status = hsa_queue_create(obj.gpu[0].agent,
1024,
HSA_QUEUE_TYPE_SINGLE,
nullptr,
nullptr,
UINT32_MAX,
UINT32_MAX,
&queue1);
RET_IF_HSA_ERR(status)
// Create a signal with a value of 1 and attach it to the first kernel
// dispatch packet
hsa_signal_t completion_signal_1 = {};
status = hsa_signal_create(1, 0, nullptr, &completion_signal_1);
RET_IF_HSA_ERR(status)
// First dispath packet on queue 1, Kernel A
{
MQDependencyTest::Aql packet{};
packet.header.type = HSA_PACKET_TYPE_KERNEL_DISPATCH;
packet.header.barrier = 1;
packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM;
packet.header.release = HSA_FENCE_SCOPE_SYSTEM;
packet.dispatch.setup = 1;
packet.dispatch.workgroup_size_x = 64;
packet.dispatch.workgroup_size_y = 1;
packet.dispatch.workgroup_size_z = 1;
packet.dispatch.grid_size_x = 64;
packet.dispatch.grid_size_y = 1;
packet.dispatch.grid_size_z = 1;
packet.dispatch.group_segment_size = copyA.group;
packet.dispatch.private_segment_size = copyA.scratch;
packet.dispatch.kernel_object = copyA.handle;
packet.dispatch.kernarg_address = args;
packet.dispatch.completion_signal = completion_signal_1;
args->a = a;
args->b = b;
// Tell packet processor of A to launch the first kernel dispatch packet
obj.submit_packet(queue1, packet);
}
// Create a signal with a value of 1 and attach it to the second kernel
// dispatch packet
hsa_signal_t completion_signal_2 = {};
status = hsa_signal_create(1, 0, nullptr, &completion_signal_2);
RET_IF_HSA_ERR(status)
hsa_signal_t completion_signal_3 = {};
status = hsa_signal_create(1, 0, nullptr, &completion_signal_3);
RET_IF_HSA_ERR(status)
// Create barrier-AND packet that is enqueued in queue 1
{
MQDependencyTest::Aql packet{};
packet.header.type = HSA_PACKET_TYPE_BARRIER_AND;
packet.header.barrier = 1;
packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM;
packet.header.release = HSA_FENCE_SCOPE_SYSTEM;
packet.barrier_and.dep_signal[0] = completion_signal_2;
obj.submit_packet(queue1, packet);
}
// Second dispath packet on queue 1, Kernel C
{
MQDependencyTest::Aql packet{};
packet.header.type = HSA_PACKET_TYPE_KERNEL_DISPATCH;
packet.header.barrier = 1;
packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM;
packet.header.release = HSA_FENCE_SCOPE_SYSTEM;
packet.dispatch.setup = 1;
packet.dispatch.workgroup_size_x = 64;
packet.dispatch.workgroup_size_y = 1;
packet.dispatch.workgroup_size_z = 1;
packet.dispatch.grid_size_x = 64;
packet.dispatch.grid_size_y = 1;
packet.dispatch.grid_size_z = 1;
packet.dispatch.group_segment_size = copyC.group;
packet.dispatch.private_segment_size = copyC.scratch;
packet.dispatch.kernel_object = copyC.handle;
packet.dispatch.completion_signal = completion_signal_3;
packet.dispatch.kernarg_address = args;
args->a = a;
args->b = b;
// Tell packet processor to launch the second kernel dispatch packet
obj.submit_packet(queue1, packet);
}
// Create queue 2
hsa_queue_t* queue2 = nullptr;
status = hsa_queue_create(obj.gpu[0].agent,
1024,
HSA_QUEUE_TYPE_SINGLE,
nullptr,
nullptr,
UINT32_MAX,
UINT32_MAX,
&queue2);
RET_IF_HSA_ERR(status)
// Create barrier-AND packet that is enqueued in queue 2
{
MQDependencyTest::Aql packet{};
packet.header.type = HSA_PACKET_TYPE_BARRIER_AND;
packet.header.barrier = 1;
packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM;
packet.header.release = HSA_FENCE_SCOPE_SYSTEM;
packet.barrier_and.dep_signal[0] = completion_signal_1;
obj.submit_packet(queue2, packet);
}
// Third dispath packet on queue 2, Kernel B
{
MQDependencyTest::Aql packet{};
packet.header.type = HSA_PACKET_TYPE_KERNEL_DISPATCH;
packet.header.barrier = 1;
packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM;
packet.header.release = HSA_FENCE_SCOPE_SYSTEM;
packet.dispatch.setup = 1;
packet.dispatch.workgroup_size_x = 64;
packet.dispatch.workgroup_size_y = 1;
packet.dispatch.workgroup_size_z = 1;
packet.dispatch.grid_size_x = 64;
packet.dispatch.grid_size_y = 1;
packet.dispatch.grid_size_z = 1;
packet.dispatch.group_segment_size = copyB.group;
packet.dispatch.private_segment_size = copyB.scratch;
packet.dispatch.kernel_object = copyB.handle;
packet.dispatch.kernarg_address = args;
packet.dispatch.completion_signal = completion_signal_2;
args->a = a;
args->b = b;
// Tell packet processor to launch the third kernel dispatch packet
obj.submit_packet(queue2, packet);
}
// Create a signal with a value of 1 and attach it to the first kernel
// dispatch packet
hsa_signal_t completion_signal_4 = {};
status = hsa_signal_create(1, 0, nullptr, &completion_signal_4);
RET_IF_HSA_ERR(status)
// First dispath packet on queue 1, Kernel D
{
[[maybe_unused]] MQDependencyTest::Aql packet{};
packet.header.type = HSA_PACKET_TYPE_KERNEL_DISPATCH;
packet.header.barrier = 1;
packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM;
packet.header.release = HSA_FENCE_SCOPE_SYSTEM;
packet.dispatch.setup = 1;
packet.dispatch.workgroup_size_x = 64;
packet.dispatch.workgroup_size_y = 1;
packet.dispatch.workgroup_size_z = 1;
packet.dispatch.grid_size_x = 64;
packet.dispatch.grid_size_y = 1;
packet.dispatch.grid_size_z = 1;
packet.dispatch.group_segment_size = copyD.group;
packet.dispatch.private_segment_size = copyD.scratch;
packet.dispatch.kernel_object = copyD.handle;
packet.dispatch.kernarg_address = args_memory;
packet.dispatch.completion_signal = completion_signal_4;
args_memory->a = c;
args_memory->b = d;
// Tell packet processor of A to launch the first kernel dispatch packet
obj_memory.submit_packet(queue1, packet);
}
// Create a signal with a value of 1 and attach it to the second kernel
// dispatch packet
hsa_signal_t completion_signal_5 = {};
status = hsa_signal_create(1, 0, nullptr, &completion_signal_5);
RET_IF_HSA_ERR(status)
hsa_signal_t completion_signal_6 = {};
status = hsa_signal_create(1, 0, nullptr, &completion_signal_6);
RET_IF_HSA_ERR(status)
// Create barrier-AND packet that is enqueued in queue 1
{
MQDependencyTest::Aql packet{};
packet.header.type = HSA_PACKET_TYPE_BARRIER_AND;
packet.header.barrier = 1;
packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM;
packet.header.release = HSA_FENCE_SCOPE_SYSTEM;
packet.barrier_and.dep_signal[0] = completion_signal_5;
obj_memory.submit_packet(queue1, packet);
}
// Second dispath packet on queue 1, Kernel F
{
MQDependencyTest::Aql packet{};
packet.header.type = HSA_PACKET_TYPE_KERNEL_DISPATCH;
packet.header.barrier = 1;
packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM;
packet.header.release = HSA_FENCE_SCOPE_SYSTEM;
packet.dispatch.setup = 1;
packet.dispatch.workgroup_size_x = 64;
packet.dispatch.workgroup_size_y = 1;
packet.dispatch.workgroup_size_z = 1;
packet.dispatch.grid_size_x = 64;
packet.dispatch.grid_size_y = 1;
packet.dispatch.grid_size_z = 1;
packet.dispatch.group_segment_size = copyF.group;
packet.dispatch.private_segment_size = copyF.scratch;
packet.dispatch.kernel_object = copyF.handle;
packet.dispatch.completion_signal = completion_signal_6;
packet.dispatch.kernarg_address = args_memory;
args_memory->a = c;
args_memory->b = d;
// Tell packet processor to launch the second kernel dispatch packet
obj_memory.submit_packet(queue1, packet);
}
// Create barrier-AND packet that is enqueued in queue 2
{
MQDependencyTest::Aql packet{};
packet.header.type = HSA_PACKET_TYPE_BARRIER_AND;
packet.header.barrier = 1;
packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM;
packet.header.release = HSA_FENCE_SCOPE_SYSTEM;
packet.barrier_and.dep_signal[0] = completion_signal_4;
obj_memory.submit_packet(queue2, packet);
}
// Third dispath packet on queue 2, Kernel
{
MQDependencyTest::Aql packet{};
packet.header.type = HSA_PACKET_TYPE_KERNEL_DISPATCH;
packet.header.barrier = 1;
packet.header.acquire = HSA_FENCE_SCOPE_SYSTEM;
packet.header.release = HSA_FENCE_SCOPE_SYSTEM;
packet.dispatch.setup = 1;
packet.dispatch.workgroup_size_x = 64;
packet.dispatch.workgroup_size_y = 1;
packet.dispatch.workgroup_size_z = 1;
packet.dispatch.grid_size_x = 64;
packet.dispatch.grid_size_y = 1;
packet.dispatch.grid_size_z = 1;
packet.dispatch.group_segment_size = copyE.group;
packet.dispatch.private_segment_size = copyE.scratch;
packet.dispatch.kernel_object = copyE.handle;
packet.dispatch.kernarg_address = args_memory;
packet.dispatch.completion_signal = completion_signal_5;
args_memory->a = c;
args_memory->b = d;
// Tell packet processor to launch the third kernel dispatch packet
obj_memory.submit_packet(queue2, packet);
}
// Wait on the completion signal
hsa_signal_wait_relaxed(
completion_signal_1, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
// Wait on the completion signal
hsa_signal_wait_relaxed(
completion_signal_2, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
// Wait on the completion signal
hsa_signal_wait_relaxed(
completion_signal_3, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
// Wait on the completion signal
hsa_signal_wait_relaxed(
completion_signal_4, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
// Wait on the completion signal
hsa_signal_wait_relaxed(
completion_signal_5, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
// Wait on the completion signal
hsa_signal_wait_relaxed(
completion_signal_6, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
for(int i = 0; i < 64; i++)
{
if(a[i] != b[i])
{
printf("error at %d: expected %d, got %d\n", i, b[i], a[i]);
abort();
}
}
// Clearing data structures and memory
status = hsa_signal_destroy(completion_signal_1);
RET_IF_HSA_ERR(status)
status = hsa_signal_destroy(completion_signal_2);
RET_IF_HSA_ERR(status)
status = hsa_signal_destroy(completion_signal_3);
RET_IF_HSA_ERR(status)
// Clearing data structures and memory
status = hsa_signal_destroy(completion_signal_4);
RET_IF_HSA_ERR(status)
status = hsa_signal_destroy(completion_signal_5);
RET_IF_HSA_ERR(status)
status = hsa_signal_destroy(completion_signal_6);
RET_IF_HSA_ERR(status)
if(queue1 != nullptr)
{
status = hsa_queue_destroy(queue1);
RET_IF_HSA_ERR(status)
}
if(queue2 != nullptr)
{
status = hsa_queue_destroy(queue2);
RET_IF_HSA_ERR(status)
}
status = hsa_memory_free(a);
RET_IF_HSA_ERR(status)
status = hsa_memory_free(b);
RET_IF_HSA_ERR(status)
status = hsa_memory_free(c);
RET_IF_HSA_ERR(status)
status = hsa_memory_free(d);
RET_IF_HSA_ERR(status)
status = hsa_executable_destroy(code_object.executable);
RET_IF_HSA_ERR(status)
status = hsa_code_object_reader_destroy(code_object.code_obj_rdr);
RET_IF_HSA_ERR(status)
status = hsa_executable_destroy(code_object_memory.executable);
RET_IF_HSA_ERR(status)
status = hsa_code_object_reader_destroy(code_object_memory.code_obj_rdr);
RET_IF_HSA_ERR(status)
close(code_object.file);
close(code_object_memory.file);
}
@@ -0,0 +1,415 @@
// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#pragma once
#include "common/filesystem.hpp"
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>
#include <dlfcn.h>
#include <fcntl.h>
#include <unistd.h>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
namespace fs = common::fs;
#define RET_IF_HSA_ERR(err) \
{ \
if((err) != HSA_STATUS_SUCCESS) \
{ \
char err_val[12]; \
char* err_str = nullptr; \
if(hsa_status_string(err, (const char**) &err_str) != HSA_STATUS_SUCCESS) \
{ \
sprintf(&(err_val[0]), "%#x", (uint32_t) err); \
err_str = &(err_val[0]); \
} \
printf("hsa api call failure at: %s:%d\n", __FILE__, __LINE__); \
printf("Call returned %s\n", err_str); \
abort(); \
} \
}
struct Device
{
struct Memory
{
hsa_amd_memory_pool_t pool;
bool fine;
bool kernarg;
size_t size;
size_t granule;
};
hsa_agent_t agent;
char name[64];
std::vector<Memory> pools;
uint32_t fine;
uint32_t coarse;
static std::vector<hsa_agent_t> all_devices;
};
class MQDependencyTest
{
public:
MQDependencyTest() { hsa_init(); }
~MQDependencyTest() { hsa_shut_down(); }
std::vector<Device> cpu;
std::vector<Device> gpu;
Device::Memory kernarg;
std::vector<hsa_agent_t> all_devices = {};
struct CodeObject
{
hsa_file_t file = 0;
hsa_code_object_reader_t code_obj_rdr = {};
hsa_executable_t executable = {};
};
struct Kernel
{
uint64_t handle = 0;
uint32_t scratch = 0;
uint32_t group = 0;
uint32_t kernarg_size = 0;
uint32_t kernarg_align = 0;
};
union AqlHeader
{
struct
{
uint16_t type : 8;
uint16_t barrier : 1;
uint16_t acquire : 2;
uint16_t release : 2;
uint16_t reserved : 3;
};
uint16_t raw = 0;
};
struct BarrierValue
{
AqlHeader header = {};
uint8_t AmdFormat = 0;
uint8_t reserved = 0;
uint32_t reserved1 = 0;
hsa_signal_t signal = {};
hsa_signal_value_t value = 0;
hsa_signal_value_t mask = 0;
uint32_t cond = 0;
uint32_t reserved2 = 0;
uint64_t reserved3 = 0;
uint64_t reserved4 = 0;
hsa_signal_t completion_signal = {};
};
union Aql
{
AqlHeader header;
hsa_kernel_dispatch_packet_t dispatch;
hsa_barrier_and_packet_t barrier_and;
hsa_barrier_or_packet_t barrier_or;
BarrierValue barrier_value = {};
};
struct OCLHiddenArgs
{
uint64_t offset_x = 0;
uint64_t offset_y = 0;
uint64_t offset_z = 0;
void* printf_buffer = nullptr;
void* enqueue = nullptr;
void* enqueue2 = nullptr;
void* multi_grid = nullptr;
};
bool load_code_object(const std::string& filename, hsa_agent_t agent, CodeObject& code_object)
{
hsa_status_t err;
code_object.file = open(filename.c_str(), O_RDONLY);
if(code_object.file == -1)
{
fprintf(stderr, "%s:%s\n", "Could not load code object", filename.c_str());
abort();
return false;
}
err = hsa_code_object_reader_create_from_file(code_object.file, &code_object.code_obj_rdr);
RET_IF_HSA_ERR(err);
err = hsa_executable_create_alt(HSA_PROFILE_FULL,
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
nullptr,
&code_object.executable);
RET_IF_HSA_ERR(err);
err = hsa_executable_load_agent_code_object(
code_object.executable, agent, code_object.code_obj_rdr, nullptr, nullptr);
if(err != HSA_STATUS_SUCCESS) return false;
err = hsa_executable_freeze(code_object.executable, nullptr);
RET_IF_HSA_ERR(err);
return true;
}
bool load_code_object_memory(const std::string& filename,
hsa_agent_t agent,
CodeObject& code_object)
{
hsa_status_t err;
size_t buffer_size = 0;
std::ifstream code_object_file(filename.c_str(), std::ios::binary | std::ios::ate);
if(!code_object_file.good()) return false;
buffer_size = code_object_file.tellg();
code_object_file.seekg(0, std::ios::beg);
uint8_t* binary = new(std::nothrow) uint8_t[buffer_size];
if(binary && !code_object_file.read(reinterpret_cast<char*>(binary), buffer_size))
{
delete[] binary;
binary = nullptr;
return false;
}
err = hsa_code_object_reader_create_from_memory(
binary, buffer_size, &code_object.code_obj_rdr);
RET_IF_HSA_ERR(err);
err = hsa_executable_create_alt(HSA_PROFILE_FULL,
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
nullptr,
&code_object.executable);
RET_IF_HSA_ERR(err);
err = hsa_executable_load_agent_code_object(
code_object.executable, agent, code_object.code_obj_rdr, nullptr, nullptr);
if(err != HSA_STATUS_SUCCESS) return false;
err = hsa_executable_freeze(code_object.executable, nullptr);
RET_IF_HSA_ERR(err);
delete[] binary;
binary = nullptr;
return true;
}
bool get_kernel(const CodeObject& code_object,
const std::string& kernel,
hsa_agent_t agent,
Kernel& kern)
{
hsa_executable_symbol_t symbol;
hsa_status_t err = hsa_executable_get_symbol_by_name(
code_object.executable, kernel.c_str(), &agent, &symbol);
if(err != HSA_STATUS_SUCCESS)
{
err = hsa_executable_get_symbol_by_name(
code_object.executable, (kernel + ".kd").c_str(), &agent, &symbol);
if(err != HSA_STATUS_SUCCESS)
{
return false;
}
}
printf("\nkernel-name: %s\n", kernel.c_str());
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kern.handle);
RET_IF_HSA_ERR(err);
return true;
}
// Not for parallel insertion.
bool submit_packet(hsa_queue_t* queue, Aql& pkt)
{
size_t mask = queue->size - 1;
Aql* ring = static_cast<Aql*>(queue->base_address);
uint64_t write = hsa_queue_load_write_index_relaxed(queue);
uint64_t read = hsa_queue_load_read_index_relaxed(queue);
if(write - read + 1 > queue->size) return false;
Aql& dst = ring[write & mask];
uint16_t header = pkt.header.raw;
pkt.header.raw = dst.header.raw;
dst = pkt;
__atomic_store_n(&dst.header.raw, header, __ATOMIC_RELEASE);
pkt.header.raw = header;
hsa_queue_store_write_index_release(queue, write + 1);
hsa_signal_store_screlease(queue->doorbell_signal, write);
return true;
}
void* hsa_malloc(size_t size, const Device::Memory& mem)
{
void* ret;
hsa_status_t err = hsa_amd_memory_pool_allocate(mem.pool, size, 0, &ret);
RET_IF_HSA_ERR(err);
err = hsa_amd_agents_allow_access(all_devices.size(), all_devices.data(), nullptr, ret);
RET_IF_HSA_ERR(err);
return ret;
}
void* hsa_malloc(size_t size, const Device& dev, bool fine)
{
uint32_t index = fine ? dev.fine : dev.coarse;
assert(index != -1u && "Memory type unavailable.");
return hsa_malloc(size, dev.pools[index]);
}
bool device_discovery()
{
hsa_status_t err;
err = hsa_iterate_agents(
[](hsa_agent_t agent, void* obj) {
hsa_status_t error;
Device dev;
dev.agent = agent;
dev.fine = -1u;
dev.coarse = -1u;
MQDependencyTest* _obj = reinterpret_cast<MQDependencyTest*>(obj);
error = hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, dev.name);
RET_IF_HSA_ERR(error)
hsa_device_type_t type;
error = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type);
RET_IF_HSA_ERR(error)
error = hsa_amd_agent_iterate_memory_pools(
agent,
[](hsa_amd_memory_pool_t pool, void* data) {
auto& pools = *reinterpret_cast<std::vector<Device::Memory>*>(data);
hsa_status_t status;
bool allowed = false;
status = hsa_amd_memory_pool_get_info(
pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &allowed);
if(!allowed) return HSA_STATUS_SUCCESS;
hsa_amd_segment_t segment;
status = hsa_amd_memory_pool_get_info(
pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment);
RET_IF_HSA_ERR(status)
if(segment != HSA_AMD_SEGMENT_GLOBAL) return HSA_STATUS_SUCCESS;
uint32_t flags;
status = hsa_amd_memory_pool_get_info(
pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags);
RET_IF_HSA_ERR(status)
Device::Memory mem;
mem.pool = pool;
mem.fine = ((flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) != 0u);
mem.kernarg =
((flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) != 0u);
status = hsa_amd_memory_pool_get_info(
pool, HSA_AMD_MEMORY_POOL_INFO_SIZE, &mem.size);
RET_IF_HSA_ERR(status)
status = hsa_amd_memory_pool_get_info(
pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &mem.granule);
RET_IF_HSA_ERR(status)
pools.push_back(mem);
return HSA_STATUS_SUCCESS;
},
static_cast<void*>(&dev.pools));
if(!dev.pools.empty())
{
for(size_t i = 0; i < dev.pools.size(); i++)
{
if(dev.pools[i].fine && dev.pools[i].kernarg && dev.fine == -1u)
dev.fine = i;
if(dev.pools[i].fine && !dev.pools[i].kernarg) dev.fine = i;
if(!dev.pools[i].fine) dev.coarse = i;
}
if(type == HSA_DEVICE_TYPE_CPU)
_obj->cpu.push_back(dev);
else
_obj->gpu.push_back(dev);
_obj->all_devices.push_back(dev.agent);
}
return HSA_STATUS_SUCCESS;
},
this);
bool is_break = false;
for(auto& dev : cpu)
{
for(auto& mem : dev.pools)
{
if(mem.fine && mem.kernarg)
{
kernarg = mem;
is_break = true;
break;
}
}
if(is_break) break;
}
RET_IF_HSA_ERR(err);
if(cpu.empty() || gpu.empty() || kernarg.pool.handle == 0) return false;
return true;
}
void search_hasco(const fs::path& directory, std::string& filename)
{
for(const auto& entry : fs::directory_iterator(directory))
{
if(fs::is_regular_file(entry))
{
if(entry.path().filename() == filename)
{
filename = entry.path();
}
}
else if(fs::is_directory(entry))
{
search_hasco(entry, filename); // Recursive call for subdirectories
}
}
}
};
+3
View File
@@ -39,3 +39,6 @@ add_subdirectory(collection-period)
if(ROCPROFILER_BUILD_ROCDECODE_TESTS)
add_subdirectory(rocdecode-trace)
endif()
if(TARGET att_decoder_testing)
add_subdirectory(advanced-thread-trace)
endif()
@@ -0,0 +1,100 @@
#
# rocprofv3 tool test
#
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
project(
rocprofiler-tests-rocprofv3-att
LANGUAGES CXX
VERSION 0.0.0)
string(REPLACE "LD_PRELOAD=" "--preload;" PRELOAD_ARGS
"${ROCPROFILER_MEMCHECK_PRELOAD_ENV}")
rocprofiler_configure_pytest_files(CONFIG pytest.ini COPY validate.py conftest.py
att_input.json)
find_package(rocprofiler-sdk REQUIRED)
# hsa multiqueue dependency test
add_test(
NAME rocprofv3-test-hsa-multiqueue-att-cmd-env-ld-lib-path-execute
COMMAND
$<TARGET_FILE:rocprofiler-sdk::rocprofv3> --advanced-thread-trace 1
--att-target-cu 1 --att-shader-engine-mask 0x11 --kernel-include-regex copyD
--att-buffer-size 0x6000000 --att-simd-select 0x3 --att-parse testing
--att-serialize-all 1 -d ${CMAKE_CURRENT_BINARY_DIR}/%argt%-trace/cmd_input -o
out --output-format json ${PRELOAD_ARGS} --
$<TARGET_FILE:hsa_code_object_testapp>)
set_tests_properties(
rocprofv3-test-hsa-multiqueue-att-cmd-env-ld-lib-path-execute
PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT
LD_LIBRARY_PATH=${CMAKE_BINARY_DIR}/lib:$ENV{LD_LIBRARY_PATH}
FAIL_REGULAR_EXPRESSION "HSA_API|HIP_API")
# hsa multiqueue dependency test
add_test(
NAME rocprofv3-test-hsa-multiqueue-att-cmd-env-att-lib-path-execute
COMMAND
$<TARGET_FILE:rocprofiler-sdk::rocprofv3> --advanced-thread-trace 1
--att-target-cu 1 --att-shader-engine-mask 0x11 --kernel-include-regex copyD
--att-buffer-size 0x6000000 --att-simd-select 0x3 --att-parse testing
--att-serialize-all 1 -d ${CMAKE_CURRENT_BINARY_DIR}/%argt%-trace/cmd_input -o
out --output-format json ${PRELOAD_ARGS} --
$<TARGET_FILE:hsa_code_object_testapp>)
set_tests_properties(
rocprofv3-test-hsa-multiqueue-att-cmd-env-att-lib-path-execute
PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT
ATT_LIBRARY_PATH=${CMAKE_BINARY_DIR}/lib FAIL_REGULAR_EXPRESSION
"HSA_API|HIP_API")
# hsa multiqueue dependency test
add_test(
NAME rocprofv3-test-hsa-multiqueue-att-json-execute
COMMAND
$<TARGET_FILE:rocprofiler-sdk::rocprofv3> --att-library-path
${CMAKE_BINARY_DIR}/lib -d ${CMAKE_CURRENT_BINARY_DIR}/%argt%-trace/json_input -i
${CMAKE_CURRENT_BINARY_DIR}/att_input.json ${PRELOAD_ARGS} --
$<TARGET_FILE:hsa_code_object_testapp>)
set_tests_properties(
rocprofv3-test-hsa-multiqueue-att-json-execute
PROPERTIES TIMEOUT 45 LABELS "integration-tests" FAIL_REGULAR_EXPRESSION
"HSA_API|HIP_API")
add_test(
NAME rocprofv3-test-hsa-multiqueue-att-cmd-validate
COMMAND
${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --input
${CMAKE_CURRENT_BINARY_DIR}/hsa_code_object_testapp-trace/cmd_input/out_results.json
--code-object-input ${CMAKE_CURRENT_BINARY_DIR} --output-path
${CMAKE_CURRENT_BINARY_DIR}/hsa_code_object_testapp-trace/cmd_input)
add_test(
NAME rocprofv3-test-hsa-multiqueue-att-json-validate
COMMAND
${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --input
${CMAKE_CURRENT_BINARY_DIR}/hsa_code_object_testapp-trace/json_input/out_results.json
--code-object-input ${CMAKE_CURRENT_BINARY_DIR} --output-path
${CMAKE_CURRENT_BINARY_DIR}/hsa_code_object_testapp-trace/json_input)
set(MULTIQUEUE_CMD_VALIDATION_FILES
${CMAKE_CURRENT_BINARY_DIR}/hsa_code_object_testapp-trace/cmd_input/out_results.json)
set(MULTIQUEUE_JSON_VALIDATION_FILES
${CMAKE_CURRENT_BINARY_DIR}/hsa_code_object_testapp-trace/json_input/pass_1/out_results.json
)
set_tests_properties(
rocprofv3-test-hsa-multiqueue-att-cmd-validate
PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS
"rocprofv3-test-hsa-multiqueue-att-cmd-ld-lib-path-execute"
FAIL_REGULAR_EXPRESSION "AssertionError")
set_tests_properties(
rocprofv3-test-hsa-multiqueue-att-json-validate
PROPERTIES TIMEOUT 45 LABELS "integration-tests" DEPENDS
"rocprofv3-test-hsa-multiqueue-att-json-execute" FAIL_REGULAR_EXPRESSION
"AssertionError")
@@ -0,0 +1,20 @@
{
"jobs": [
{
"kernel_include_regex": "copyD",
"kernel_exclude_regex": "",
"output_file": "out",
"output_format": [
"json"
],
"truncate_kernels": true,
"advanced_thread_trace": true,
"att_parse" : "testing",
"att_target_cu" : 1,
"att_shader_engine_mask" : "0x11",
"att_simd_select": "0x3",
"att_buffer_size": "0x6000000",
"att_perfcounters": "SQ_WAVES:0x1 SQ_INSTS_VALU:0x3 SQ_INSTS_SALU:0xF"
}
]
}
@@ -0,0 +1,59 @@
#!/usr/bin/env python3
import csv
import pytest
import json
from rocprofiler_sdk.pytest_utils.dotdict import dotdict
from rocprofiler_sdk.pytest_utils import collapse_dict_list
from rocprofiler_sdk.pytest_utils.perfetto_reader import PerfettoReader
import re
import os
def pytest_addoption(parser):
parser.addoption(
"--input",
action="store",
help="Path to JSON file.",
)
parser.addoption(
"--code-object-input",
action="store",
help="Path to code object file.",
)
parser.addoption(
"--output-path",
action="store",
help="Output Path.",
)
@pytest.fixture
def json_data(request):
filename = request.config.getoption("--input")
with open(filename, "r") as inp:
return dotdict(collapse_dict_list(json.load(inp)))
@pytest.fixture
def output_path(request):
return request.config.getoption("--output-path")
@pytest.fixture
def code_object_file_path(request):
file_path = request.config.getoption("--code-object-input")
# hsa_file_load = re.compile(".*copy.hsaco$")
code_object_files = {}
code_object_memory = []
hsa_memory_load_pattern = "gfx[a-z0-9]+_copy_memory.hsaco"
for root, dirs, files in os.walk(file_path, topdown=True):
for file in files:
filename = os.path.join(root, file)
if re.search(hsa_memory_load_pattern, filename):
code_object_memory.append(filename)
code_object_files["hsa_memory_load"] = code_object_memory
return code_object_files
@@ -0,0 +1,5 @@
[pytest]
addopts = --durations=20 -rA -s -vv
testpaths = validate.py
pythonpath = @ROCPROFILER_SDK_TESTS_BINARY_DIR@/pytest-packages
@@ -0,0 +1,47 @@
#!/usr/bin/env python3
import sys
import pytest
import re
import os
def test_json_data(json_data):
data = json_data["rocprofiler-sdk-tool"]
strings = data["strings"]
assert "att_filenames" in strings.keys()
att_files = data["strings"]["att_filenames"]
assert len(att_files) > 0
def test_code_object_memory(code_object_file_path, json_data, output_path):
data = json_data["rocprofiler-sdk-tool"]
tool_memory_load = data["strings"]["code_object_snapshot_filenames"]
gfx_pattern = "gfx[a-z0-9]+"
match = re.search(gfx_pattern, tool_memory_load[0])
assert match != None
gpu_name = match.group(0)
tool_memory_load_1 = open(os.path.join(output_path, tool_memory_load[0]), "rb")
tool_memory_load_2 = open(os.path.join(output_path, tool_memory_load[1]), "rb")
found = False
for hsa_file in code_object_file_path["hsa_memory_load"]:
m = re.search(gfx_pattern, hsa_file)
assert m != None
gpu = m.group(0)
if gpu == gpu_name:
found = True
hsa_memory_load = open(hsa_file, "rb")
hsa_memory_fs = hsa_memory_load.read()
tool_memory_fs_1 = tool_memory_load_1.read()
tool_memory_fs_2 = tool_memory_load_2.read()
assert hsa_memory_fs == tool_memory_fs_2 or hsa_memory_fs == tool_memory_fs_1
break
assert found == True
if __name__ == "__main__":
exit_code = pytest.main(["-x", __file__] + sys.argv[1:])
sys.exit(exit_code)