rocprofv3 kernel renaming support + misc rocprofv3 updates (#992)

* Increase rocprofv3 tool buffer size

- 32 pages instead of 1 page

* Improve rocprofv3 perfetto track labels

* Preliminary kernel renaming support + misc rocprofv3 updates

- add rocprofv3 option --kernel-rename
- add rocprofv3 options for perfetto settings (buffer size, etc.)
- add CSV columns for kernel trace
  - Thread_Id
  - Dispatch_Id
- add CSV column for counter_collection
 - Kernel_Id
Αυτή η υποβολή περιλαμβάνεται σε:
Jonathan R. Madsen
2024-07-29 14:33:50 -05:00
υποβλήθηκε από GitHub
γονέας 41f024475a
υποβολή ebb021c59f
18 αρχεία άλλαξαν με 641 προσθήκες και 60 διαγραφές
@@ -168,6 +168,10 @@ For MPI applications (or other job launchers such as SLURM), place rocprofv3 ins
"--list-metrics",
help="List metrics for counter collection",
)
add_parser_bool_argument(
"--kernel-rename",
help="Use region names defined by roctxRangePush/roctxRangePop regions to rename the kernels",
)
parser.add_argument(
"-i",
"--input",
@@ -232,6 +236,35 @@ For MPI applications (or other job launchers such as SLURM), place rocprofv3 ins
default=os.environ.get("ROCPROF_PRELOAD", "").split(":"),
nargs="*",
)
parser.add_argument(
"--perfetto-backend",
help="Perfetto data collection backend. 'system' mode requires starting traced and perfetto daemons",
default=None,
type=str,
nargs=1,
choices=("inprocess", "system"),
)
parser.add_argument(
"--perfetto-buffer-size",
help="Size of buffer for perfetto output in KB. default: 1 GB",
default=None,
type=int,
metavar="KB",
)
parser.add_argument(
"--perfetto-buffer-fill-policy",
help="Policy for handling new records when perfetto has reached the buffer limit",
default=None,
type=str,
choices=("discard", "ring_buffer"),
)
parser.add_argument(
"--perfetto-shmem-size-hint",
help="Perfetto shared memory size hint in KB. default: 64 KB",
default=None,
type=int,
metavar="KB",
)
if args is None:
args = sys.argv[1:]
@@ -582,6 +615,29 @@ def run(app_args, args, **kwargs):
args.log_level,
)
for opt, env_val in dict(
[
["kernel_rename", "KERNEL_RENAME"],
]
).items():
val = getattr(args, f"{opt}")
if val is not None:
update_env(f"ROCPROF_{env_val}", val, overwrite_if_true=True)
for opt, env_val in dict(
[
["perfetto_buffer_size", "PERFETTO_BUFFER_SIZE_KB"],
["perfetto_shmem_size_hint", "PERFETTO_SHMEM_SIZE_HINT_KB"],
["perfetto_fill_policy", "PERFETTO_BUFFER_FILL_POLICY"],
["perfetto_backend", "PERFETTO_BACKEND"],
]
).items():
val = getattr(args, f"{opt}")
if val is not None:
if isinstance(val, (list, tuple, set)):
val = ", ".join(val)
update_env(f"ROCPROF_{env_val}", val, overwrite=True)
def log_config(_env):
existing_env = dict(os.environ)
init_message = "\n- rocprofv3 configuration{}:\n".format(
@@ -70,5 +70,33 @@ get_string_entry(std::string_view name)
->emplace(_hash_v, std::make_unique<std::string>(name))
.first->second.get();
}
const std::string*
get_string_entry(size_t _hash_v)
{
if(!get_string_array()) return nullptr;
auto _lk = std::shared_lock<std::shared_mutex>{get_sync()};
if(get_string_array()->count(_hash_v) > 0) return get_string_array()->at(_hash_v).get();
return nullptr;
}
size_t
add_string_entry(std::string_view name)
{
if(!get_string_array()) return 0;
auto _hash_v = std::hash<std::string_view>{}(name);
{
auto _lk = std::shared_lock<std::shared_mutex>{get_sync()};
if(get_string_array()->count(_hash_v) > 0) return _hash_v;
}
auto _lk = std::unique_lock<std::shared_mutex>{get_sync()};
get_string_array()->emplace(_hash_v, std::make_unique<std::string>(name));
return _hash_v;
}
} // namespace common
} // namespace rocprofiler
@@ -22,6 +22,7 @@
#pragma once
#include <cstdint>
#include <string>
#include <string_view>
@@ -31,5 +32,11 @@ namespace common
{
const std::string*
get_string_entry(std::string_view name);
const std::string*
get_string_entry(size_t hash);
size_t
add_string_entry(std::string_view name);
} // namespace common
} // namespace rocprofiler
@@ -284,6 +284,10 @@ config::config()
<< "Unsupported output format type: " << itr;
}
if(kernel_filter_include.empty()) kernel_filter_include = std::string(".*");
const auto supported_perfetto_backends = std::set<std::string_view>{"inprocess", "system"};
LOG_IF(FATAL, supported_perfetto_backends.count(perfetto_backend) == 0)
<< "Unsupported perfetto backend type: " << perfetto_backend;
}
std::vector<output_key>
@@ -77,8 +77,11 @@ struct config
bool csv_output = false;
bool json_output = false;
bool pftrace_output = false;
bool kernel_rename = get_env("ROCPROF_KERNEL_RENAME", false);
int mpi_size = get_mpi_size();
int mpi_rank = get_mpi_rank();
size_t perfetto_shmem_size_hint = get_env("ROCPROF_PERFETTO_SHMEM_SIZE_HINT_KB", 64);
size_t perfetto_buffer_size = get_env("ROCPROF_PERFETTO_BUFFER_SIZE_KB", 1024000);
std::string output_path = get_env("ROCPROF_OUTPUT_PATH", fs::current_path().string());
std::string output_file = get_env("ROCPROF_OUTPUT_FILE_NAME", std::to_string(getpid()));
std::string tmp_directory = get_env("ROCPROF_TMPDIR", output_path);
@@ -87,6 +90,9 @@ struct config
get_env("ROCPROF_KERNEL_FILTER_INCLUDE_REGEX", std::string{".*"});
std::string kernel_filter_exclude =
get_env("ROCPROF_KERNEL_FILTER_EXCLUDE_REGEX", std::string{});
std::string perfetto_buffer_fill_policy =
get_env("ROCPROF_PERFETTO_BUFFER_FILL_POLICY", std::string{"discard"});
std::string perfetto_backend = get_env("ROCPROF_PERFETTO_BACKEND", std::string{"inprocess"});
std::unordered_set<uint32_t> kernel_filter_range = {};
std::set<std::string> counters = {};
};
@@ -101,8 +101,8 @@ struct csv_encoder
using api_csv_encoder = csv_encoder<7>;
using agent_info_csv_encoder = csv_encoder<53>;
using kernel_trace_csv_encoder = csv_encoder<16>;
using counter_collection_csv_encoder = csv_encoder<15>;
using kernel_trace_csv_encoder = csv_encoder<18>;
using counter_collection_csv_encoder = csv_encoder<16>;
using memory_copy_csv_encoder = csv_encoder<7>;
using marker_csv_encoder = csv_encoder<7>;
using list_basic_metrics_csv_encoder = csv_encoder<5>;
@@ -287,6 +287,8 @@ generate_csv(tool_table*
{"Kind",
"Agent_Id",
"Queue_Id",
"Thread_Id",
"Dispatch_Id",
"Kernel_Id",
"Kernel_Name",
"Correlation_Id",
@@ -304,12 +306,15 @@ generate_csv(tool_table*
for(const auto& record : data)
{
auto row_ss = std::stringstream{};
auto kernel_name = tool_functions->tool_get_kernel_name_fn(record.dispatch_info.kernel_id);
auto kernel_name = tool_functions->tool_get_kernel_name_fn(
record.dispatch_info.kernel_id, record.correlation_id.external.value);
rocprofiler::tool::csv::kernel_trace_csv_encoder::write_row(
row_ss,
tool_functions->tool_get_domain_name_fn(record.kind),
tool_functions->tool_get_agent_node_id_fn(record.dispatch_info.agent_id),
record.dispatch_info.queue_id.handle,
record.thread_id,
record.dispatch_info.dispatch_id,
record.dispatch_info.kernel_id,
kernel_name,
record.correlation_id.internal,
@@ -542,6 +547,7 @@ generate_csv(tool_table* too
"Process_Id",
"Thread_Id",
"Grid_Size",
"Kernel_Id",
"Kernel_Name",
"Workgroup_Size",
"LDS_Block_Size",
@@ -583,7 +589,8 @@ generate_csv(tool_table* too
getpid(),
record.thread_id,
magnitude(record.dispatch_data.dispatch_info.grid_size),
tool_functions->tool_get_kernel_name_fn(kernel_id),
record.dispatch_data.dispatch_info.kernel_id,
tool_functions->tool_get_kernel_name_fn(kernel_id, correlation_id.external.value),
magnitude(record.dispatch_data.dispatch_info.workgroup_size),
record.lds_block_size_v,
record.dispatch_data.dispatch_info.private_segment_size,
@@ -25,6 +25,8 @@
#include "helper.hpp"
#include "output_file.hpp"
#include "lib/common/string_entry.hpp"
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/marker/api_id.h>
@@ -90,6 +92,27 @@ write_json(tool_table* tool
json_ar(cereal::make_nvp("buffer_records", buffer_name_info));
json_ar(cereal::make_nvp("marker_api", marker_msg_data));
{
auto _extern_corr_id_strings = std::map<size_t, std::string>{};
if(tool::get_config().kernel_rename)
{
for(auto itr : *kernel_dispatch_deque)
{
auto _value = itr.correlation_id.external.value;
if(_value > 0)
{
const auto* _str = common::get_string_entry(_value);
if(_str) _extern_corr_id_strings.emplace(_value, *_str);
}
}
}
json_ar.setNextName("correlation_id");
json_ar.startNode();
json_ar(cereal::make_nvp("external", _extern_corr_id_strings));
json_ar.finishNode();
}
{
json_ar.setNextName("counters");
json_ar.startNode();
@@ -25,6 +25,7 @@
#include "output_file.hpp"
#include "lib/common/utility.hpp"
#include "lib/rocprofiler-sdk-tool/config.hpp"
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/marker/api_id.h>
@@ -83,20 +84,37 @@ write_perfetto(
auto cfg = ::perfetto::TraceConfig{};
// environment settings
auto shmem_size_hint = size_t{64};
auto buffer_size_kb = size_t{1024000};
auto shmem_size_hint = get_config().perfetto_shmem_size_hint;
auto buffer_size_kb = get_config().perfetto_buffer_size;
auto* buffer_config = cfg.add_buffers();
buffer_config->set_size_kb(buffer_size_kb);
buffer_config->set_fill_policy(
::perfetto::protos::gen::TraceConfig_BufferConfig_FillPolicy_DISCARD);
if(get_config().perfetto_buffer_fill_policy == "discard" ||
get_config().perfetto_buffer_fill_policy.empty())
buffer_config->set_fill_policy(
::perfetto::protos::gen::TraceConfig_BufferConfig_FillPolicy_DISCARD);
else if(get_config().perfetto_buffer_fill_policy == "ring_buffer")
buffer_config->set_fill_policy(
::perfetto::protos::gen::TraceConfig_BufferConfig_FillPolicy_RING_BUFFER);
else
ROCP_FATAL << "Unsupport perfetto buffer fill policy: '"
<< get_config().perfetto_buffer_fill_policy
<< "'. Supported: discard, ring_buffer";
auto* ds_cfg = cfg.add_data_sources()->mutable_config();
ds_cfg->set_name("track_event"); // this MUST be track_event
ds_cfg->set_track_event_config_raw(track_event_cfg.SerializeAsString());
args.shmem_size_hint_kb = shmem_size_hint;
args.backends |= ::perfetto::kInProcessBackend;
if(get_config().perfetto_backend == "inprocess" || get_config().perfetto_backend.empty())
args.backends |= ::perfetto::kInProcessBackend;
else if(get_config().perfetto_backend == "system")
args.backends |= ::perfetto::kSystemBackend;
else
ROCP_FATAL << "Unsupport perfetto backend: '" << get_config().perfetto_backend
<< "'. Supported: inprocess, system";
::perfetto::Tracing::Initialize(args);
::perfetto::TrackEvent::Register();
@@ -175,13 +193,15 @@ write_perfetto(
for(auto titr : itr.second)
{
auto _namess = std::stringstream{};
_namess << "COPY to [" << _agent->logical_node_id << "] THREAD ["
<< thread_indexes.at(titr) << "]";
_namess << "COPY to AGENT [" << _agent->logical_node_id << "] THREAD ["
<< thread_indexes.at(titr) << "] ";
if(_agent->type == ROCPROFILER_AGENT_TYPE_CPU)
_namess << " CPU";
_namess << "(CPU)";
else if(_agent->type == ROCPROFILER_AGENT_TYPE_GPU)
_namess << " GPU";
_namess << "(GPU)";
else
_namess << "(UNK)";
auto _track = ::perfetto::Track{get_hash_id(_namess.str())};
auto _desc = _track.Serialize();
@@ -201,12 +221,15 @@ write_perfetto(
const auto* _agent = _get_agent(aitr.first);
auto _namess = std::stringstream{};
_namess << "COMPUTE [" << _agent->logical_node_id << "] QUEUE [" << nqueue++ << "] ";
_namess << "COMPUTE AGENT [" << _agent->logical_node_id << "] QUEUE [" << nqueue++
<< "] ";
if(_agent->type == ROCPROFILER_AGENT_TYPE_CPU)
_namess << "CPU";
_namess << "(CPU)";
else if(_agent->type == ROCPROFILER_AGENT_TYPE_GPU)
_namess << "GPU";
_namess << "(GPU)";
else
_namess << "(UNK)";
auto _track = ::perfetto::Track{get_hash_id(_namess.str())};
auto _desc = _track.Serialize();
@@ -451,9 +474,9 @@ write_perfetto(
const auto* _agent = _get_agent(mitr.first);
if(_agent->type == ROCPROFILER_AGENT_TYPE_CPU)
_track_name << "COPY BYTES to [" << _agent->logical_node_id << "] CPU";
_track_name << "COPY BYTES to AGENT [" << _agent->logical_node_id << "] (CPU)";
else if(_agent->type == ROCPROFILER_AGENT_TYPE_GPU)
_track_name << "COPY BYTES to [" << _agent->logical_node_id << "] GPU";
_track_name << "COPY BYTES to AGENT [" << _agent->logical_node_id << "] (GPU)";
constexpr auto _unit = ::perfetto::CounterTrack::Unit::UNIT_SIZE_BYTES;
auto& _name = mem_cpy_cnt_names.emplace_back(_track_name.str());
@@ -333,7 +333,7 @@ using scratch_memory_buffered_output_t =
using tool_get_agent_node_id_fn_t = uint64_t (*)(rocprofiler_agent_id_t);
using tool_get_app_timestamps_fn_t = timestamps_t* (*) ();
using tool_get_kernel_name_fn_t = std::string_view (*)(uint64_t);
using tool_get_kernel_name_fn_t = std::string_view (*)(uint64_t, uint64_t);
using tool_get_domain_name_fn_t = std::string_view (*)(rocprofiler_buffer_tracing_kind_t);
using tool_get_operation_name_fn_t = std::string_view (*)(rocprofiler_buffer_tracing_kind_t,
rocprofiler_tracing_operation_t);
@@ -34,7 +34,10 @@
#include "lib/common/environment.hpp"
#include "lib/common/filesystem.hpp"
#include "lib/common/logging.hpp"
#include "lib/common/scope_destructor.hpp"
#include "lib/common/string_entry.hpp"
#include "lib/common/synchronized.hpp"
#include "lib/common/units.hpp"
#include "lib/common/utility.hpp"
#include <rocprofiler-sdk/agent.h>
@@ -181,8 +184,10 @@ using targeted_kernels_map_t =
std::unordered_map<rocprofiler_kernel_id_t, std::unordered_set<uint32_t>>;
using counter_dimension_info_map_t =
std::unordered_map<uint64_t, std::vector<rocprofiler_record_dimension_info_t>>;
using agent_info_map_t = std::unordered_map<rocprofiler_agent_id_t, rocprofiler_agent_t>;
using kernel_iteration_t = std::unordered_map<rocprofiler_kernel_id_t, uint32_t>;
using agent_info_map_t = std::unordered_map<rocprofiler_agent_id_t, rocprofiler_agent_t>;
using kernel_iteration_t = std::unordered_map<rocprofiler_kernel_id_t, uint32_t>;
using kernel_rename_map_t = std::unordered_map<uint64_t, uint64_t>;
using kernel_rename_stack_t = std::stack<uint64_t>;
auto code_obj_data = as_pointer<common::Synchronized<code_object_data_map_t, true>>();
auto* kernel_data = as_pointer<common::Synchronized<kernel_symbol_data_map_t, true>>();
@@ -196,14 +201,20 @@ auto* tool_functions = as_pointer(tool_table{});
auto* stats_timestamp = as_pointer(timestamps_t{});
auto kernel_iteration = common::Synchronized<kernel_iteration_t, true>{};
thread_local auto thread_dispatch_rename = as_pointer<kernel_rename_stack_t>();
thread_local auto thread_dispatch_rename_dtor = common::scope_destructor{[]() {
delete thread_dispatch_rename;
thread_dispatch_rename = nullptr;
}};
bool
add_kernel_target(uint64_t _kern_id, const std::unordered_set<uint32_t>& range)
{
return target_kernels
.wlock(
[](targeted_kernels_map_t& _targets_v,
uint64_t _kern_id_v,
std::unordered_set<uint32_t> _range) {
[](targeted_kernels_map_t& _targets_v,
uint64_t _kern_id_v,
const std::unordered_set<uint32_t>& _range) {
return _targets_v.emplace(_kern_id_v, _range);
},
_kern_id,
@@ -214,42 +225,33 @@ add_kernel_target(uint64_t _kern_id, const std::unordered_set<uint32_t>& range)
bool
is_targeted_kernel(uint64_t _kern_id)
{
bool is_target_kernel = false;
std::unordered_set<uint32_t> range = {};
is_target_kernel = target_kernels.rlock(
[&range](const auto& _targets_v, uint64_t _kern_id_v) {
if(_targets_v.find(_kern_id_v) != _targets_v.end())
{
range = _targets_v.at(_kern_id_v);
return true;
}
return false;
const std::unordered_set<uint32_t>* range = target_kernels.rlock(
[](const auto& _targets_v, uint64_t _kern_id_v) -> const std::unordered_set<uint32_t>* {
if(_targets_v.find(_kern_id_v) != _targets_v.end()) return &_targets_v.at(_kern_id_v);
return nullptr;
},
_kern_id);
if(is_target_kernel)
if(range)
{
kernel_iteration.rlock(
[&](const auto& _kernel_iter,
uint64_t _kernel_id,
std::unordered_set<uint32_t> _range) {
return kernel_iteration.rlock(
[](const auto& _kernel_iter,
uint64_t _kernel_id,
const std::unordered_set<uint32_t>& _range) {
auto itr = _kernel_iter.at(_kernel_id);
// If the iteration range is not given then all iterations of the kernel is profiled
if(_range.empty())
is_target_kernel = true;
return true;
else if(_range.find(itr) != _range.end())
{
is_target_kernel = true;
}
else
is_target_kernel = false;
return true;
return false;
},
_kern_id,
range);
*range);
}
return is_target_kernel;
return false;
}
auto&
@@ -295,6 +297,26 @@ get_roctx_msg(uint64_t cid)
cid);
}
int
set_kernel_rename_correlation_id(rocprofiler_thread_id_t thr_id,
rocprofiler_context_id_t ctx_id,
rocprofiler_external_correlation_id_request_kind_t kind,
rocprofiler_tracing_operation_t op,
uint64_t internal_corr_id,
rocprofiler_user_data_t* external_corr_id,
void* user_data)
{
ROCP_FATAL_IF(kind != ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KERNEL_DISPATCH)
<< "unexpected kind: " << kind;
if(thread_dispatch_rename != nullptr && !thread_dispatch_rename->empty())
external_corr_id->value = thread_dispatch_rename->top();
common::consume_args(thr_id, ctx_id, kind, op, internal_corr_id, user_data);
return 0;
}
void
cntrl_tracing_callback(rocprofiler_callback_tracing_record_t record,
rocprofiler_user_data_t* user_data,
@@ -337,6 +359,45 @@ cntrl_tracing_callback(rocprofiler_callback_tracing_record_t record,
}
}
void
kernel_rename_callback(rocprofiler_callback_tracing_record_t record,
rocprofiler_user_data_t* user_data,
void* data)
{
if(!rocprofiler::tool::get_config().kernel_rename || thread_dispatch_rename == nullptr) return;
if(record.kind == ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API)
{
auto* marker_data =
static_cast<rocprofiler_callback_tracing_marker_api_data_t*>(record.payload);
if(record.operation == ROCPROFILER_MARKER_CORE_API_ID_roctxMarkA &&
record.phase == ROCPROFILER_CALLBACK_PHASE_EXIT && marker_data->args.roctxMarkA.message)
{
thread_dispatch_rename->emplace(
common::add_string_entry(marker_data->args.roctxMarkA.message));
}
else if(record.operation == ROCPROFILER_MARKER_CORE_API_ID_roctxRangePushA &&
record.phase == ROCPROFILER_CALLBACK_PHASE_EXIT &&
marker_data->args.roctxRangePushA.message)
{
thread_dispatch_rename->emplace(
common::add_string_entry(marker_data->args.roctxRangePushA.message));
}
else if(record.operation == ROCPROFILER_MARKER_CORE_API_ID_roctxRangePop &&
record.phase == ROCPROFILER_CALLBACK_PHASE_ENTER)
{
ROCP_FATAL_IF(thread_dispatch_rename->empty())
<< "roctxRangePop invoked more times than roctxRangePush on thread "
<< rocprofiler::common::get_tid();
thread_dispatch_rename->pop();
}
}
common::consume_args(user_data, data);
}
void
callback_tracing_callback(rocprofiler_callback_tracing_record_t record,
rocprofiler_user_data_t* user_data,
@@ -485,8 +546,6 @@ callback_tracing_callback(rocprofiler_callback_tracing_record_t record,
}
}
(void) record;
(void) user_data;
(void) data;
}
@@ -544,9 +603,14 @@ code_object_tracing_callback(rocprofiler_callback_tracing_record_t record,
std::regex exclude_regex(kernel_filter_exclude);
if(std::regex_search(kernel_info.formatted_kernel_name, include_regex))
{
if(kernel_filter_exclude.empty())
if(kernel_filter_exclude.empty() ||
!std::regex_search(kernel_info.formatted_kernel_name, exclude_regex))
add_kernel_target(sym_data->kernel_id, kernel_filter_range);
else if(!std::regex_search(kernel_info.formatted_kernel_name, exclude_regex))
}
else
{
if(kernel_filter_exclude.empty() ||
!std::regex_search(kernel_info.formatted_kernel_name, exclude_regex))
add_kernel_target(sym_data->kernel_id, kernel_filter_range);
}
}
@@ -558,8 +622,13 @@ code_object_tracing_callback(rocprofiler_callback_tracing_record_t record,
}
std::string_view
get_kernel_name(uint64_t kernel_id)
get_kernel_name(uint64_t kernel_id, uint64_t rename_id)
{
if(rename_id > 0)
{
if(const auto* _name = common::get_string_entry(rename_id)) return std::string_view{*_name};
}
return CHECK_NOTNULL(kernel_data)->rlock([kernel_id](const auto& _data) -> std::string_view {
return _data.at(kernel_id).formatted_kernel_name;
});
@@ -1180,8 +1249,8 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
{
client_finalizer = fini_func;
constexpr uint64_t buffer_size = 4096;
constexpr uint64_t buffer_watermark = 4096;
const uint64_t buffer_size = 32 * common::units::get_page_size();
const uint64_t buffer_watermark = 31 * common::units::get_page_size();
rocprofiler_get_timestamp(&(stats_timestamp->app_start_time));
@@ -1362,6 +1431,40 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
"Could not setup counting service");
}
if(tool::get_config().kernel_rename)
{
auto rename_ctx = rocprofiler_context_id_t{};
auto marker_core_api_kinds = std::array<rocprofiler_tracing_operation_t, 3>{
ROCPROFILER_MARKER_CORE_API_ID_roctxMarkA,
ROCPROFILER_MARKER_CORE_API_ID_roctxRangePushA,
ROCPROFILER_MARKER_CORE_API_ID_roctxRangePop};
ROCPROFILER_CALL(rocprofiler_create_context(&rename_ctx), "failed to create context");
ROCPROFILER_CALL(rocprofiler_configure_callback_tracing_service(
rename_ctx,
ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API,
marker_core_api_kinds.data(),
marker_core_api_kinds.size(),
kernel_rename_callback,
nullptr),
"callback tracing service failed to configure");
ROCPROFILER_CALL(rocprofiler_start_context(rename_ctx), "start context failed");
auto external_corr_id_request_kinds =
std::array<rocprofiler_external_correlation_id_request_kind_t, 1>{
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KERNEL_DISPATCH};
ROCPROFILER_CALL(rocprofiler_configure_external_correlation_id_request_service(
get_client_ctx(),
external_corr_id_request_kinds.data(),
external_corr_id_request_kinds.size(),
set_kernel_rename_correlation_id,
nullptr),
"Could not configure external correlation id request service");
}
for(auto itr : get_buffers().as_array())
{
if(itr.handle > 0)
@@ -206,8 +206,8 @@ run(int rank, int tid, int devid, int argc, char** argv)
int* in = nullptr;
int* out = nullptr;
HIP_API_CALL(hipMalloc(&in, size));
HIP_API_CALL(hipMalloc(&out, size));
HIP_API_CALL(hipMallocAsync(&in, size, stream));
HIP_API_CALL(hipMallocAsync(&out, size, stream));
HIP_API_CALL(hipMemsetAsync(in, 0, size, stream));
HIP_API_CALL(hipMemsetAsync(out, 0, size, stream));
HIP_API_CALL(hipMemcpyAsync(in, inp_matrix, size, hipMemcpyHostToDevice, stream));
@@ -238,13 +238,15 @@ run(int rank, int tid, int devid, int argc, char** argv)
print_lock.unlock();
HIP_API_CALL(hipStreamSynchronize(stream));
HIP_API_CALL(hipStreamDestroy(stream));
// cpu_transpose(matrix, out_matrix, M, N);
verify(inp_matrix, out_matrix, M, N);
HIP_API_CALL(hipFree(in));
HIP_API_CALL(hipFree(out));
HIP_API_CALL(hipFreeAsync(in, stream));
HIP_API_CALL(hipFreeAsync(out, stream));
HIP_API_CALL(hipStreamSynchronize(stream));
HIP_API_CALL(hipStreamDestroy(stream));
delete[] inp_matrix;
delete[] out_matrix;
@@ -28,3 +28,4 @@ add_subdirectory(tracing-plus-counter-collection)
add_subdirectory(tracing-hip-in-libraries)
add_subdirectory(counter-collection)
add_subdirectory(hsa-queue-dependency)
add_subdirectory(kernel-rename)
@@ -0,0 +1,114 @@
#
# rocprofv3 tool tests for kernel renaming
#
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
project(
rocprofiler-tests-rocprofv3-kernel-rename
LANGUAGES CXX
VERSION 0.0.0)
find_package(rocprofiler-sdk REQUIRED)
if(ROCPROFILER_MEMCHECK STREQUAL "LeakSanitizer")
set(LOG_LEVEL "warning") # info produces memory leak
else()
set(LOG_LEVEL "info")
endif()
string(REPLACE "LD_PRELOAD=" "ROCPROF_PRELOAD=" PRELOAD_ENV
"${ROCPROFILER_MEMCHECK_PRELOAD_ENV}")
set(kernel-rename-env "${PRELOAD_ENV}")
rocprofiler_configure_pytest_files(CONFIG pytest.ini input-kernel-rename.yml
COPY validate.py conftest.py)
##########################################################################################
#
# Command line input
#
##########################################################################################
add_test(
NAME rocprofv3-test-kernel-rename-cmd-line-execute
COMMAND
$<TARGET_FILE:rocprofiler-sdk::rocprofv3> -M --sys-trace no --hsa-trace=0
--hsa-core-trace=1 --hip-compiler-trace False --hip-runtime-trace --kernel-trace
--memory-copy-trace -d ${CMAKE_CURRENT_BINARY_DIR}/%argt%-trace-cmd-line -o out
--output-format pftrace json --log-level env --kernel-rename
--perfetto-shmem-size-hint 128 --perfetto-buffer-size 2048000
--perfetto-buffer-fill-policy ring_buffer --perfetto-backend inprocess --
$<TARGET_FILE:transpose>)
set_tests_properties(
rocprofv3-test-kernel-rename-cmd-line-execute
PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT "${kernel-rename-env}"
FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}")
add_test(
NAME rocprofv3-test-kernel-rename-cmd-line-validate
COMMAND
${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --json-input
${CMAKE_CURRENT_BINARY_DIR}/transpose-trace-cmd-line/out_results.json
--pftrace-input
${CMAKE_CURRENT_BINARY_DIR}/transpose-trace-cmd-line/out_results.pftrace)
set(VALIDATION_FILES
${CMAKE_CURRENT_BINARY_DIR}/transpose-trace-cmd-line/out_results.pftrace
${CMAKE_CURRENT_BINARY_DIR}/transpose-trace-cmd-line/out_results.json)
set_tests_properties(
rocprofv3-test-kernel-rename-cmd-line-validate
PROPERTIES TIMEOUT
45
LABELS
"integration-tests"
DEPENDS
"rocprofv3-test-kernel-rename-execute"
FAIL_REGULAR_EXPRESSION
"AssertionError"
ATTACHED_FILES_ON_FAIL
"${VALIDATION_FILES}")
##########################################################################################
#
# YAML input
#
##########################################################################################
add_test(
NAME rocprofv3-test-kernel-rename-inp-yaml-execute
COMMAND
$<TARGET_FILE:rocprofiler-sdk::rocprofv3> -i
${CMAKE_CURRENT_BINARY_DIR}/input-kernel-rename.yml -- $<TARGET_FILE:transpose>)
set_tests_properties(
rocprofv3-test-kernel-rename-inp-yaml-execute
PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT "${kernel-rename-env}"
FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}")
add_test(
NAME rocprofv3-test-kernel-rename-inp-yaml-validate
COMMAND
${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --json-input
${CMAKE_CURRENT_BINARY_DIR}/transpose-trace-inp-yaml/out_results.json
--pftrace-input
${CMAKE_CURRENT_BINARY_DIR}/transpose-trace-inp-yaml/out_results.pftrace)
set(VALIDATION_FILES
${CMAKE_CURRENT_BINARY_DIR}/transpose-trace-inp-yaml/out_results.pftrace
${CMAKE_CURRENT_BINARY_DIR}/transpose-trace-inp-yaml/out_results.json)
set_tests_properties(
rocprofv3-test-kernel-rename-inp-yaml-validate
PROPERTIES TIMEOUT
45
LABELS
"integration-tests"
DEPENDS
"rocprofv3-test-kernel-rename-inp-yaml-execute"
FAIL_REGULAR_EXPRESSION
"AssertionError"
ATTACHED_FILES_ON_FAIL
"${VALIDATION_FILES}")
@@ -0,0 +1,34 @@
#!/usr/bin/env python3
import pytest
import json
from rocprofiler_sdk.pytest_utils.dotdict import dotdict
from rocprofiler_sdk.pytest_utils import collapse_dict_list
from rocprofiler_sdk.pytest_utils.perfetto_reader import PerfettoReader
def pytest_addoption(parser):
parser.addoption(
"--json-input",
action="store",
help="Path to JSON file.",
)
parser.addoption(
"--pftrace-input",
action="store",
help="Path to Perfetto trace file.",
)
@pytest.fixture
def json_data(request):
filename = request.config.getoption("--json-input")
with open(filename, "r") as inp:
return dotdict(collapse_dict_list(json.load(inp)))
@pytest.fixture
def pftrace_data(request):
filename = request.config.getoption("--pftrace-input")
return PerfettoReader(filename).read()[0]
@@ -0,0 +1,20 @@
jobs:
- mangled_kernels: True
sys_trace: False
hsa_trace: False
hsa_core_trace: True
hsa_amd_trace: False
hip_compiler_trace: False
hip_runtime_trace: True
kernel_trace: True
memory_copy_trace: True
marker_trace: False
output_directory: "@CMAKE_CURRENT_BINARY_DIR@/%argt%-trace-inp-yaml"
output_file: out
output_format: [pftrace, json]
log_level: env
kernel_rename: True
perfetto_shmem_size_hint: 128
perfetto_buffer_size: 2048000
perfetto_buffer_fill_policy: ring_buffer
perfetto_backend: inprocess
@@ -0,0 +1,5 @@
[pytest]
addopts = --durations=20 -rA -s -vv
testpaths = validate.py
pythonpath = @ROCPROFILER_SDK_TESTS_BINARY_DIR@/pytest-packages
@@ -0,0 +1,148 @@
#!/usr/bin/env python3
import sys
import pytest
def test_hsa_api_trace(json_data):
data = json_data["rocprofiler-sdk-tool"]
def get_operation_name(kind_id, op_id):
return data["strings"]["buffer_records"][kind_id]["operations"][op_id]
def get_kind_name(kind_id):
return data["strings"]["buffer_records"][kind_id]["kind"]
valid_domain_names = ("HSA_CORE_API",)
hsa_api_data = data["buffer_records"]["hsa_api"]
functions = []
for api in hsa_api_data:
kind = get_kind_name(api["kind"])
assert kind in valid_domain_names
assert api["end_timestamp"] >= api["start_timestamp"]
functions.append(get_operation_name(api["kind"], api["operation"]))
functions = list(set(functions))
assert "hsa_amd_memory_async_copy_on_engine" not in functions
assert "hsa_signal_destroy" in functions
def test_hip_api_trace(json_data):
data = json_data["rocprofiler-sdk-tool"]
def get_operation_name(kind_id, op_id):
return data["strings"]["buffer_records"][kind_id]["operations"][op_id]
def get_kind_name(kind_id):
return data["strings"]["buffer_records"][kind_id]["kind"]
valid_domain_names = ("HIP_RUNTIME_API",)
hip_api_data = data["buffer_records"]["hip_api"]
functions = []
for api in hip_api_data:
kind = get_kind_name(api["kind"])
assert kind in valid_domain_names
assert api["end_timestamp"] >= api["start_timestamp"]
functions.append(get_operation_name(api["kind"], api["operation"]))
functions = list(set(functions))
for itr in (
"__hipPushCallConfiguration",
"__hipPopCallConfiguration",
"__hipRegisterFatBinary",
"__hipRegisterFunction",
):
assert itr not in functions, f"{itr}"
for itr in (
"hipMallocAsync",
"hipMemcpyAsync",
"hipMemsetAsync",
"hipFreeAsync",
"hipLaunchKernel",
):
assert itr in functions, f"{itr}"
def test_kernel_trace(json_data):
data = json_data["rocprofiler-sdk-tool"]
def get_kernel_name(kernel_id):
return data["kernel_symbols"][kernel_id]["formatted_kernel_name"]
def get_kernel_rename(corr_id):
for itr in data.strings.correlation_id.external:
if itr.key == corr_id:
return itr.value
return None
def get_kind_name(kind_id):
return data["strings"]["buffer_records"][kind_id]["kind"]
valid_kernel_names = ("run",)
kernel_dispatch_data = data["buffer_records"]["kernel_dispatch"]
for dispatch in kernel_dispatch_data:
assert get_kind_name(dispatch["kind"]) == "KERNEL_DISPATCH"
assert dispatch["correlation_id"]["internal"] > 0
assert dispatch["correlation_id"]["external"] > 0
dispatch_info = dispatch["dispatch_info"]
assert dispatch_info["agent_id"]["handle"] > 0
assert dispatch_info["queue_id"]["handle"] > 0
assert dispatch_info["kernel_id"] > 0
assert dispatch["end_timestamp"] >= dispatch["start_timestamp"]
kernel_name = get_kernel_name(dispatch_info["kernel_id"])
assert kernel_name not in valid_kernel_names
external_corr_id = dispatch["correlation_id"]["external"]
assert external_corr_id > 0
kernel_rename = get_kernel_rename(external_corr_id)
assert kernel_rename is not None, f"{dispatch}"
assert kernel_rename in valid_kernel_names
def test_memory_copy_json_trace(json_data):
data = json_data["rocprofiler-sdk-tool"]
buffer_records = data["buffer_records"]
agent_data = data["agents"]
memory_copy_data = buffer_records["memory_copy"]
def get_kind_name(kind_id):
return data["strings"]["buffer_records"][kind_id]["kind"]
def get_agent(node_id):
for agent in agent_data:
if agent["id"]["handle"] == node_id["handle"]:
return agent
return None
assert len(memory_copy_data) == 12
for row in memory_copy_data:
src_agent = get_agent(row["src_agent_id"])
dst_agent = get_agent(row["dst_agent_id"])
assert get_kind_name(row["kind"]) == "MEMORY_COPY"
assert src_agent is not None, f"{row}"
assert dst_agent is not None, f"{row}"
assert row["correlation_id"]["internal"] > 0
assert row["end_timestamp"] >= row["start_timestamp"]
def test_perfetto_data(pftrace_data, json_data):
import rocprofiler_sdk.tests.rocprofv3 as rocprofv3
rocprofv3.test_perfetto_data(
pftrace_data, json_data, ("hip", "hsa", "kernel", "memory_copy")
)
if __name__ == "__main__":
exit_code = pytest.main(["-x", __file__] + sys.argv[1:])
sys.exit(exit_code)