From a728a4b4cd3a999e55fba6c762cc2697d4977665 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Madsen" Date: Mon, 29 Jul 2024 14:33:50 -0500 Subject: [PATCH] rocprofv3 kernel renaming support + misc rocprofv3 updates (#992) * Increase rocprofv3 tool buffer size - 32 pages instead of 1 page * Improve rocprofv3 perfetto track labels * Preliminary kernel renaming support + misc rocprofv3 updates - add rocprofv3 option --kernel-rename - add rocprofv3 options for perfetto settings (buffer size, etc.) - add CSV columns for kernel trace - Thread_Id - Dispatch_Id - add CSV column for counter_collection - Kernel_Id [ROCm/rocprofiler-sdk commit: ebb021c59f455496055a7aac8f1f56d71c0dd56b] --- .../rocprofiler-sdk/source/bin/rocprofv3.py | 56 ++++++ .../source/lib/common/string_entry.cpp | 28 +++ .../source/lib/common/string_entry.hpp | 7 + .../lib/rocprofiler-sdk-tool/config.cpp | 4 + .../lib/rocprofiler-sdk-tool/config.hpp | 6 + .../source/lib/rocprofiler-sdk-tool/csv.hpp | 4 +- .../lib/rocprofiler-sdk-tool/generateCSV.cpp | 11 +- .../lib/rocprofiler-sdk-tool/generateJSON.cpp | 23 +++ .../rocprofiler-sdk-tool/generatePerfetto.cpp | 51 +++-- .../lib/rocprofiler-sdk-tool/helper.hpp | 2 +- .../source/lib/rocprofiler-sdk-tool/tool.cpp | 175 ++++++++++++++---- .../tests/bin/transpose/transpose.cpp | 12 +- .../tests/rocprofv3/CMakeLists.txt | 1 + .../rocprofv3/kernel-rename/CMakeLists.txt | 114 ++++++++++++ .../tests/rocprofv3/kernel-rename/conftest.py | 34 ++++ .../kernel-rename/input-kernel-rename.yml | 20 ++ .../tests/rocprofv3/kernel-rename/pytest.ini | 5 + .../tests/rocprofv3/kernel-rename/validate.py | 148 +++++++++++++++ 18 files changed, 641 insertions(+), 60 deletions(-) create mode 100644 projects/rocprofiler-sdk/tests/rocprofv3/kernel-rename/CMakeLists.txt create mode 100644 projects/rocprofiler-sdk/tests/rocprofv3/kernel-rename/conftest.py create mode 100644 projects/rocprofiler-sdk/tests/rocprofv3/kernel-rename/input-kernel-rename.yml create mode 100644 projects/rocprofiler-sdk/tests/rocprofv3/kernel-rename/pytest.ini create mode 100644 projects/rocprofiler-sdk/tests/rocprofv3/kernel-rename/validate.py diff --git a/projects/rocprofiler-sdk/source/bin/rocprofv3.py b/projects/rocprofiler-sdk/source/bin/rocprofv3.py index d765b01afa..a5d47a362a 100755 --- a/projects/rocprofiler-sdk/source/bin/rocprofv3.py +++ b/projects/rocprofiler-sdk/source/bin/rocprofv3.py @@ -168,6 +168,10 @@ For MPI applications (or other job launchers such as SLURM), place rocprofv3 ins "--list-metrics", help="List metrics for counter collection", ) + add_parser_bool_argument( + "--kernel-rename", + help="Use region names defined by roctxRangePush/roctxRangePop regions to rename the kernels", + ) parser.add_argument( "-i", "--input", @@ -232,6 +236,35 @@ For MPI applications (or other job launchers such as SLURM), place rocprofv3 ins default=os.environ.get("ROCPROF_PRELOAD", "").split(":"), nargs="*", ) + parser.add_argument( + "--perfetto-backend", + help="Perfetto data collection backend. 'system' mode requires starting traced and perfetto daemons", + default=None, + type=str, + nargs=1, + choices=("inprocess", "system"), + ) + parser.add_argument( + "--perfetto-buffer-size", + help="Size of buffer for perfetto output in KB. default: 1 GB", + default=None, + type=int, + metavar="KB", + ) + parser.add_argument( + "--perfetto-buffer-fill-policy", + help="Policy for handling new records when perfetto has reached the buffer limit", + default=None, + type=str, + choices=("discard", "ring_buffer"), + ) + parser.add_argument( + "--perfetto-shmem-size-hint", + help="Perfetto shared memory size hint in KB. default: 64 KB", + default=None, + type=int, + metavar="KB", + ) if args is None: args = sys.argv[1:] @@ -582,6 +615,29 @@ def run(app_args, args, **kwargs): args.log_level, ) + for opt, env_val in dict( + [ + ["kernel_rename", "KERNEL_RENAME"], + ] + ).items(): + val = getattr(args, f"{opt}") + if val is not None: + update_env(f"ROCPROF_{env_val}", val, overwrite_if_true=True) + + for opt, env_val in dict( + [ + ["perfetto_buffer_size", "PERFETTO_BUFFER_SIZE_KB"], + ["perfetto_shmem_size_hint", "PERFETTO_SHMEM_SIZE_HINT_KB"], + ["perfetto_fill_policy", "PERFETTO_BUFFER_FILL_POLICY"], + ["perfetto_backend", "PERFETTO_BACKEND"], + ] + ).items(): + val = getattr(args, f"{opt}") + if val is not None: + if isinstance(val, (list, tuple, set)): + val = ", ".join(val) + update_env(f"ROCPROF_{env_val}", val, overwrite=True) + def log_config(_env): existing_env = dict(os.environ) init_message = "\n- rocprofv3 configuration{}:\n".format( diff --git a/projects/rocprofiler-sdk/source/lib/common/string_entry.cpp b/projects/rocprofiler-sdk/source/lib/common/string_entry.cpp index 34dd77a9a3..76e31c5eed 100644 --- a/projects/rocprofiler-sdk/source/lib/common/string_entry.cpp +++ b/projects/rocprofiler-sdk/source/lib/common/string_entry.cpp @@ -70,5 +70,33 @@ get_string_entry(std::string_view name) ->emplace(_hash_v, std::make_unique(name)) .first->second.get(); } + +const std::string* +get_string_entry(size_t _hash_v) +{ + if(!get_string_array()) return nullptr; + + auto _lk = std::shared_lock{get_sync()}; + if(get_string_array()->count(_hash_v) > 0) return get_string_array()->at(_hash_v).get(); + + return nullptr; +} + +size_t +add_string_entry(std::string_view name) +{ + if(!get_string_array()) return 0; + + auto _hash_v = std::hash{}(name); + { + auto _lk = std::shared_lock{get_sync()}; + if(get_string_array()->count(_hash_v) > 0) return _hash_v; + } + + auto _lk = std::unique_lock{get_sync()}; + get_string_array()->emplace(_hash_v, std::make_unique(name)); + + return _hash_v; +} } // namespace common } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/common/string_entry.hpp b/projects/rocprofiler-sdk/source/lib/common/string_entry.hpp index deeb200856..9d2f30c64d 100644 --- a/projects/rocprofiler-sdk/source/lib/common/string_entry.hpp +++ b/projects/rocprofiler-sdk/source/lib/common/string_entry.hpp @@ -22,6 +22,7 @@ #pragma once +#include #include #include @@ -31,5 +32,11 @@ namespace common { const std::string* get_string_entry(std::string_view name); + +const std::string* +get_string_entry(size_t hash); + +size_t +add_string_entry(std::string_view name); } // namespace common } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.cpp index 588f2065b8..68850653ec 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.cpp @@ -284,6 +284,10 @@ config::config() << "Unsupported output format type: " << itr; } if(kernel_filter_include.empty()) kernel_filter_include = std::string(".*"); + + const auto supported_perfetto_backends = std::set{"inprocess", "system"}; + LOG_IF(FATAL, supported_perfetto_backends.count(perfetto_backend) == 0) + << "Unsupported perfetto backend type: " << perfetto_backend; } std::vector diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.hpp index 19905c1ec8..58a482c0ad 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.hpp @@ -77,8 +77,11 @@ struct config bool csv_output = false; bool json_output = false; bool pftrace_output = false; + bool kernel_rename = get_env("ROCPROF_KERNEL_RENAME", false); int mpi_size = get_mpi_size(); int mpi_rank = get_mpi_rank(); + size_t perfetto_shmem_size_hint = get_env("ROCPROF_PERFETTO_SHMEM_SIZE_HINT_KB", 64); + size_t perfetto_buffer_size = get_env("ROCPROF_PERFETTO_BUFFER_SIZE_KB", 1024000); std::string output_path = get_env("ROCPROF_OUTPUT_PATH", fs::current_path().string()); std::string output_file = get_env("ROCPROF_OUTPUT_FILE_NAME", std::to_string(getpid())); std::string tmp_directory = get_env("ROCPROF_TMPDIR", output_path); @@ -87,6 +90,9 @@ struct config get_env("ROCPROF_KERNEL_FILTER_INCLUDE_REGEX", std::string{".*"}); std::string kernel_filter_exclude = get_env("ROCPROF_KERNEL_FILTER_EXCLUDE_REGEX", std::string{}); + std::string perfetto_buffer_fill_policy = + get_env("ROCPROF_PERFETTO_BUFFER_FILL_POLICY", std::string{"discard"}); + std::string perfetto_backend = get_env("ROCPROF_PERFETTO_BACKEND", std::string{"inprocess"}); std::unordered_set kernel_filter_range = {}; std::set counters = {}; }; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/csv.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/csv.hpp index b4472e22af..cd5d71c652 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/csv.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/csv.hpp @@ -101,8 +101,8 @@ struct csv_encoder using api_csv_encoder = csv_encoder<7>; using agent_info_csv_encoder = csv_encoder<53>; -using kernel_trace_csv_encoder = csv_encoder<16>; -using counter_collection_csv_encoder = csv_encoder<15>; +using kernel_trace_csv_encoder = csv_encoder<18>; +using counter_collection_csv_encoder = csv_encoder<16>; using memory_copy_csv_encoder = csv_encoder<7>; using marker_csv_encoder = csv_encoder<7>; using list_basic_metrics_csv_encoder = csv_encoder<5>; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/generateCSV.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/generateCSV.cpp index d06cf79029..dd05c01115 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/generateCSV.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/generateCSV.cpp @@ -287,6 +287,8 @@ generate_csv(tool_table* {"Kind", "Agent_Id", "Queue_Id", + "Thread_Id", + "Dispatch_Id", "Kernel_Id", "Kernel_Name", "Correlation_Id", @@ -304,12 +306,15 @@ generate_csv(tool_table* for(const auto& record : data) { auto row_ss = std::stringstream{}; - auto kernel_name = tool_functions->tool_get_kernel_name_fn(record.dispatch_info.kernel_id); + auto kernel_name = tool_functions->tool_get_kernel_name_fn( + record.dispatch_info.kernel_id, record.correlation_id.external.value); rocprofiler::tool::csv::kernel_trace_csv_encoder::write_row( row_ss, tool_functions->tool_get_domain_name_fn(record.kind), tool_functions->tool_get_agent_node_id_fn(record.dispatch_info.agent_id), record.dispatch_info.queue_id.handle, + record.thread_id, + record.dispatch_info.dispatch_id, record.dispatch_info.kernel_id, kernel_name, record.correlation_id.internal, @@ -542,6 +547,7 @@ generate_csv(tool_table* too "Process_Id", "Thread_Id", "Grid_Size", + "Kernel_Id", "Kernel_Name", "Workgroup_Size", "LDS_Block_Size", @@ -583,7 +589,8 @@ generate_csv(tool_table* too getpid(), record.thread_id, magnitude(record.dispatch_data.dispatch_info.grid_size), - tool_functions->tool_get_kernel_name_fn(kernel_id), + record.dispatch_data.dispatch_info.kernel_id, + tool_functions->tool_get_kernel_name_fn(kernel_id, correlation_id.external.value), magnitude(record.dispatch_data.dispatch_info.workgroup_size), record.lds_block_size_v, record.dispatch_data.dispatch_info.private_segment_size, diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/generateJSON.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/generateJSON.cpp index ec08e4fac4..b0ecec970f 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/generateJSON.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/generateJSON.cpp @@ -25,6 +25,8 @@ #include "helper.hpp" #include "output_file.hpp" +#include "lib/common/string_entry.hpp" + #include #include @@ -90,6 +92,27 @@ write_json(tool_table* tool json_ar(cereal::make_nvp("buffer_records", buffer_name_info)); json_ar(cereal::make_nvp("marker_api", marker_msg_data)); + { + auto _extern_corr_id_strings = std::map{}; + if(tool::get_config().kernel_rename) + { + for(auto itr : *kernel_dispatch_deque) + { + auto _value = itr.correlation_id.external.value; + if(_value > 0) + { + const auto* _str = common::get_string_entry(_value); + if(_str) _extern_corr_id_strings.emplace(_value, *_str); + } + } + } + + json_ar.setNextName("correlation_id"); + json_ar.startNode(); + json_ar(cereal::make_nvp("external", _extern_corr_id_strings)); + json_ar.finishNode(); + } + { json_ar.setNextName("counters"); json_ar.startNode(); diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/generatePerfetto.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/generatePerfetto.cpp index 733941a1e1..638faed923 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/generatePerfetto.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/generatePerfetto.cpp @@ -25,6 +25,7 @@ #include "output_file.hpp" #include "lib/common/utility.hpp" +#include "lib/rocprofiler-sdk-tool/config.hpp" #include #include @@ -83,20 +84,37 @@ write_perfetto( auto cfg = ::perfetto::TraceConfig{}; // environment settings - auto shmem_size_hint = size_t{64}; - auto buffer_size_kb = size_t{1024000}; + auto shmem_size_hint = get_config().perfetto_shmem_size_hint; + auto buffer_size_kb = get_config().perfetto_buffer_size; auto* buffer_config = cfg.add_buffers(); buffer_config->set_size_kb(buffer_size_kb); - buffer_config->set_fill_policy( - ::perfetto::protos::gen::TraceConfig_BufferConfig_FillPolicy_DISCARD); + + if(get_config().perfetto_buffer_fill_policy == "discard" || + get_config().perfetto_buffer_fill_policy.empty()) + buffer_config->set_fill_policy( + ::perfetto::protos::gen::TraceConfig_BufferConfig_FillPolicy_DISCARD); + else if(get_config().perfetto_buffer_fill_policy == "ring_buffer") + buffer_config->set_fill_policy( + ::perfetto::protos::gen::TraceConfig_BufferConfig_FillPolicy_RING_BUFFER); + else + ROCP_FATAL << "Unsupport perfetto buffer fill policy: '" + << get_config().perfetto_buffer_fill_policy + << "'. Supported: discard, ring_buffer"; auto* ds_cfg = cfg.add_data_sources()->mutable_config(); ds_cfg->set_name("track_event"); // this MUST be track_event ds_cfg->set_track_event_config_raw(track_event_cfg.SerializeAsString()); args.shmem_size_hint_kb = shmem_size_hint; - args.backends |= ::perfetto::kInProcessBackend; + + if(get_config().perfetto_backend == "inprocess" || get_config().perfetto_backend.empty()) + args.backends |= ::perfetto::kInProcessBackend; + else if(get_config().perfetto_backend == "system") + args.backends |= ::perfetto::kSystemBackend; + else + ROCP_FATAL << "Unsupport perfetto backend: '" << get_config().perfetto_backend + << "'. Supported: inprocess, system"; ::perfetto::Tracing::Initialize(args); ::perfetto::TrackEvent::Register(); @@ -175,13 +193,15 @@ write_perfetto( for(auto titr : itr.second) { auto _namess = std::stringstream{}; - _namess << "COPY to [" << _agent->logical_node_id << "] THREAD [" - << thread_indexes.at(titr) << "]"; + _namess << "COPY to AGENT [" << _agent->logical_node_id << "] THREAD [" + << thread_indexes.at(titr) << "] "; if(_agent->type == ROCPROFILER_AGENT_TYPE_CPU) - _namess << " CPU"; + _namess << "(CPU)"; else if(_agent->type == ROCPROFILER_AGENT_TYPE_GPU) - _namess << " GPU"; + _namess << "(GPU)"; + else + _namess << "(UNK)"; auto _track = ::perfetto::Track{get_hash_id(_namess.str())}; auto _desc = _track.Serialize(); @@ -201,12 +221,15 @@ write_perfetto( const auto* _agent = _get_agent(aitr.first); auto _namess = std::stringstream{}; - _namess << "COMPUTE [" << _agent->logical_node_id << "] QUEUE [" << nqueue++ << "] "; + _namess << "COMPUTE AGENT [" << _agent->logical_node_id << "] QUEUE [" << nqueue++ + << "] "; if(_agent->type == ROCPROFILER_AGENT_TYPE_CPU) - _namess << "CPU"; + _namess << "(CPU)"; else if(_agent->type == ROCPROFILER_AGENT_TYPE_GPU) - _namess << "GPU"; + _namess << "(GPU)"; + else + _namess << "(UNK)"; auto _track = ::perfetto::Track{get_hash_id(_namess.str())}; auto _desc = _track.Serialize(); @@ -451,9 +474,9 @@ write_perfetto( const auto* _agent = _get_agent(mitr.first); if(_agent->type == ROCPROFILER_AGENT_TYPE_CPU) - _track_name << "COPY BYTES to [" << _agent->logical_node_id << "] CPU"; + _track_name << "COPY BYTES to AGENT [" << _agent->logical_node_id << "] (CPU)"; else if(_agent->type == ROCPROFILER_AGENT_TYPE_GPU) - _track_name << "COPY BYTES to [" << _agent->logical_node_id << "] GPU"; + _track_name << "COPY BYTES to AGENT [" << _agent->logical_node_id << "] (GPU)"; constexpr auto _unit = ::perfetto::CounterTrack::Unit::UNIT_SIZE_BYTES; auto& _name = mem_cpy_cnt_names.emplace_back(_track_name.str()); diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/helper.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/helper.hpp index ac8c9d34e9..dcdd60511d 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/helper.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/helper.hpp @@ -333,7 +333,7 @@ using scratch_memory_buffered_output_t = using tool_get_agent_node_id_fn_t = uint64_t (*)(rocprofiler_agent_id_t); using tool_get_app_timestamps_fn_t = timestamps_t* (*) (); -using tool_get_kernel_name_fn_t = std::string_view (*)(uint64_t); +using tool_get_kernel_name_fn_t = std::string_view (*)(uint64_t, uint64_t); using tool_get_domain_name_fn_t = std::string_view (*)(rocprofiler_buffer_tracing_kind_t); using tool_get_operation_name_fn_t = std::string_view (*)(rocprofiler_buffer_tracing_kind_t, rocprofiler_tracing_operation_t); diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp index d7b2e9096f..ba44a505bd 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp @@ -34,7 +34,10 @@ #include "lib/common/environment.hpp" #include "lib/common/filesystem.hpp" #include "lib/common/logging.hpp" +#include "lib/common/scope_destructor.hpp" +#include "lib/common/string_entry.hpp" #include "lib/common/synchronized.hpp" +#include "lib/common/units.hpp" #include "lib/common/utility.hpp" #include @@ -181,8 +184,10 @@ using targeted_kernels_map_t = std::unordered_map>; using counter_dimension_info_map_t = std::unordered_map>; -using agent_info_map_t = std::unordered_map; -using kernel_iteration_t = std::unordered_map; +using agent_info_map_t = std::unordered_map; +using kernel_iteration_t = std::unordered_map; +using kernel_rename_map_t = std::unordered_map; +using kernel_rename_stack_t = std::stack; auto code_obj_data = as_pointer>(); auto* kernel_data = as_pointer>(); @@ -196,14 +201,20 @@ auto* tool_functions = as_pointer(tool_table{}); auto* stats_timestamp = as_pointer(timestamps_t{}); auto kernel_iteration = common::Synchronized{}; +thread_local auto thread_dispatch_rename = as_pointer(); +thread_local auto thread_dispatch_rename_dtor = common::scope_destructor{[]() { + delete thread_dispatch_rename; + thread_dispatch_rename = nullptr; +}}; + bool add_kernel_target(uint64_t _kern_id, const std::unordered_set& range) { return target_kernels .wlock( - [](targeted_kernels_map_t& _targets_v, - uint64_t _kern_id_v, - std::unordered_set _range) { + [](targeted_kernels_map_t& _targets_v, + uint64_t _kern_id_v, + const std::unordered_set& _range) { return _targets_v.emplace(_kern_id_v, _range); }, _kern_id, @@ -214,42 +225,33 @@ add_kernel_target(uint64_t _kern_id, const std::unordered_set& range) bool is_targeted_kernel(uint64_t _kern_id) { - bool is_target_kernel = false; - std::unordered_set range = {}; - is_target_kernel = target_kernels.rlock( - [&range](const auto& _targets_v, uint64_t _kern_id_v) { - if(_targets_v.find(_kern_id_v) != _targets_v.end()) - { - range = _targets_v.at(_kern_id_v); - return true; - } - return false; + const std::unordered_set* range = target_kernels.rlock( + [](const auto& _targets_v, uint64_t _kern_id_v) -> const std::unordered_set* { + if(_targets_v.find(_kern_id_v) != _targets_v.end()) return &_targets_v.at(_kern_id_v); + return nullptr; }, _kern_id); - if(is_target_kernel) + if(range) { - kernel_iteration.rlock( - [&](const auto& _kernel_iter, - uint64_t _kernel_id, - std::unordered_set _range) { + return kernel_iteration.rlock( + [](const auto& _kernel_iter, + uint64_t _kernel_id, + const std::unordered_set& _range) { auto itr = _kernel_iter.at(_kernel_id); // If the iteration range is not given then all iterations of the kernel is profiled if(_range.empty()) - is_target_kernel = true; - + return true; else if(_range.find(itr) != _range.end()) - { - is_target_kernel = true; - } - else - is_target_kernel = false; + return true; + return false; }, _kern_id, - range); + *range); } - return is_target_kernel; + + return false; } auto& @@ -295,6 +297,26 @@ get_roctx_msg(uint64_t cid) cid); } +int +set_kernel_rename_correlation_id(rocprofiler_thread_id_t thr_id, + rocprofiler_context_id_t ctx_id, + rocprofiler_external_correlation_id_request_kind_t kind, + rocprofiler_tracing_operation_t op, + uint64_t internal_corr_id, + rocprofiler_user_data_t* external_corr_id, + void* user_data) +{ + ROCP_FATAL_IF(kind != ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KERNEL_DISPATCH) + << "unexpected kind: " << kind; + + if(thread_dispatch_rename != nullptr && !thread_dispatch_rename->empty()) + external_corr_id->value = thread_dispatch_rename->top(); + + common::consume_args(thr_id, ctx_id, kind, op, internal_corr_id, user_data); + + return 0; +} + void cntrl_tracing_callback(rocprofiler_callback_tracing_record_t record, rocprofiler_user_data_t* user_data, @@ -337,6 +359,45 @@ cntrl_tracing_callback(rocprofiler_callback_tracing_record_t record, } } +void +kernel_rename_callback(rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t* user_data, + void* data) +{ + if(!rocprofiler::tool::get_config().kernel_rename || thread_dispatch_rename == nullptr) return; + + if(record.kind == ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API) + { + auto* marker_data = + static_cast(record.payload); + + if(record.operation == ROCPROFILER_MARKER_CORE_API_ID_roctxMarkA && + record.phase == ROCPROFILER_CALLBACK_PHASE_EXIT && marker_data->args.roctxMarkA.message) + { + thread_dispatch_rename->emplace( + common::add_string_entry(marker_data->args.roctxMarkA.message)); + } + else if(record.operation == ROCPROFILER_MARKER_CORE_API_ID_roctxRangePushA && + record.phase == ROCPROFILER_CALLBACK_PHASE_EXIT && + marker_data->args.roctxRangePushA.message) + { + thread_dispatch_rename->emplace( + common::add_string_entry(marker_data->args.roctxRangePushA.message)); + } + else if(record.operation == ROCPROFILER_MARKER_CORE_API_ID_roctxRangePop && + record.phase == ROCPROFILER_CALLBACK_PHASE_ENTER) + { + ROCP_FATAL_IF(thread_dispatch_rename->empty()) + << "roctxRangePop invoked more times than roctxRangePush on thread " + << rocprofiler::common::get_tid(); + + thread_dispatch_rename->pop(); + } + } + + common::consume_args(user_data, data); +} + void callback_tracing_callback(rocprofiler_callback_tracing_record_t record, rocprofiler_user_data_t* user_data, @@ -485,8 +546,6 @@ callback_tracing_callback(rocprofiler_callback_tracing_record_t record, } } - (void) record; - (void) user_data; (void) data; } @@ -544,9 +603,14 @@ code_object_tracing_callback(rocprofiler_callback_tracing_record_t record, std::regex exclude_regex(kernel_filter_exclude); if(std::regex_search(kernel_info.formatted_kernel_name, include_regex)) { - if(kernel_filter_exclude.empty()) + if(kernel_filter_exclude.empty() || + !std::regex_search(kernel_info.formatted_kernel_name, exclude_regex)) add_kernel_target(sym_data->kernel_id, kernel_filter_range); - else if(!std::regex_search(kernel_info.formatted_kernel_name, exclude_regex)) + } + else + { + if(kernel_filter_exclude.empty() || + !std::regex_search(kernel_info.formatted_kernel_name, exclude_regex)) add_kernel_target(sym_data->kernel_id, kernel_filter_range); } } @@ -558,8 +622,13 @@ code_object_tracing_callback(rocprofiler_callback_tracing_record_t record, } std::string_view -get_kernel_name(uint64_t kernel_id) +get_kernel_name(uint64_t kernel_id, uint64_t rename_id) { + if(rename_id > 0) + { + if(const auto* _name = common::get_string_entry(rename_id)) return std::string_view{*_name}; + } + return CHECK_NOTNULL(kernel_data)->rlock([kernel_id](const auto& _data) -> std::string_view { return _data.at(kernel_id).formatted_kernel_name; }); @@ -1180,8 +1249,8 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) { client_finalizer = fini_func; - constexpr uint64_t buffer_size = 4096; - constexpr uint64_t buffer_watermark = 4096; + const uint64_t buffer_size = 32 * common::units::get_page_size(); + const uint64_t buffer_watermark = 31 * common::units::get_page_size(); rocprofiler_get_timestamp(&(stats_timestamp->app_start_time)); @@ -1362,6 +1431,40 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) "Could not setup counting service"); } + if(tool::get_config().kernel_rename) + { + auto rename_ctx = rocprofiler_context_id_t{}; + auto marker_core_api_kinds = std::array{ + ROCPROFILER_MARKER_CORE_API_ID_roctxMarkA, + ROCPROFILER_MARKER_CORE_API_ID_roctxRangePushA, + ROCPROFILER_MARKER_CORE_API_ID_roctxRangePop}; + + ROCPROFILER_CALL(rocprofiler_create_context(&rename_ctx), "failed to create context"); + + ROCPROFILER_CALL(rocprofiler_configure_callback_tracing_service( + rename_ctx, + ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API, + marker_core_api_kinds.data(), + marker_core_api_kinds.size(), + kernel_rename_callback, + nullptr), + "callback tracing service failed to configure"); + + ROCPROFILER_CALL(rocprofiler_start_context(rename_ctx), "start context failed"); + + auto external_corr_id_request_kinds = + std::array{ + ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KERNEL_DISPATCH}; + + ROCPROFILER_CALL(rocprofiler_configure_external_correlation_id_request_service( + get_client_ctx(), + external_corr_id_request_kinds.data(), + external_corr_id_request_kinds.size(), + set_kernel_rename_correlation_id, + nullptr), + "Could not configure external correlation id request service"); + } + for(auto itr : get_buffers().as_array()) { if(itr.handle > 0) diff --git a/projects/rocprofiler-sdk/tests/bin/transpose/transpose.cpp b/projects/rocprofiler-sdk/tests/bin/transpose/transpose.cpp index 774edec48b..ad871d4c22 100644 --- a/projects/rocprofiler-sdk/tests/bin/transpose/transpose.cpp +++ b/projects/rocprofiler-sdk/tests/bin/transpose/transpose.cpp @@ -206,8 +206,8 @@ run(int rank, int tid, int devid, int argc, char** argv) int* in = nullptr; int* out = nullptr; - HIP_API_CALL(hipMalloc(&in, size)); - HIP_API_CALL(hipMalloc(&out, size)); + HIP_API_CALL(hipMallocAsync(&in, size, stream)); + HIP_API_CALL(hipMallocAsync(&out, size, stream)); HIP_API_CALL(hipMemsetAsync(in, 0, size, stream)); HIP_API_CALL(hipMemsetAsync(out, 0, size, stream)); HIP_API_CALL(hipMemcpyAsync(in, inp_matrix, size, hipMemcpyHostToDevice, stream)); @@ -238,13 +238,15 @@ run(int rank, int tid, int devid, int argc, char** argv) print_lock.unlock(); HIP_API_CALL(hipStreamSynchronize(stream)); - HIP_API_CALL(hipStreamDestroy(stream)); // cpu_transpose(matrix, out_matrix, M, N); verify(inp_matrix, out_matrix, M, N); - HIP_API_CALL(hipFree(in)); - HIP_API_CALL(hipFree(out)); + HIP_API_CALL(hipFreeAsync(in, stream)); + HIP_API_CALL(hipFreeAsync(out, stream)); + + HIP_API_CALL(hipStreamSynchronize(stream)); + HIP_API_CALL(hipStreamDestroy(stream)); delete[] inp_matrix; delete[] out_matrix; diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/CMakeLists.txt b/projects/rocprofiler-sdk/tests/rocprofv3/CMakeLists.txt index 3ed0e68d5d..17e6e8a16a 100644 --- a/projects/rocprofiler-sdk/tests/rocprofv3/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/rocprofv3/CMakeLists.txt @@ -28,3 +28,4 @@ add_subdirectory(tracing-plus-counter-collection) add_subdirectory(tracing-hip-in-libraries) add_subdirectory(counter-collection) add_subdirectory(hsa-queue-dependency) +add_subdirectory(kernel-rename) diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/kernel-rename/CMakeLists.txt b/projects/rocprofiler-sdk/tests/rocprofv3/kernel-rename/CMakeLists.txt new file mode 100644 index 0000000000..bc7269015c --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/kernel-rename/CMakeLists.txt @@ -0,0 +1,114 @@ +# +# rocprofv3 tool tests for kernel renaming +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +project( + rocprofiler-tests-rocprofv3-kernel-rename + LANGUAGES CXX + VERSION 0.0.0) + +find_package(rocprofiler-sdk REQUIRED) + +if(ROCPROFILER_MEMCHECK STREQUAL "LeakSanitizer") + set(LOG_LEVEL "warning") # info produces memory leak +else() + set(LOG_LEVEL "info") +endif() + +string(REPLACE "LD_PRELOAD=" "ROCPROF_PRELOAD=" PRELOAD_ENV + "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}") + +set(kernel-rename-env "${PRELOAD_ENV}") + +rocprofiler_configure_pytest_files(CONFIG pytest.ini input-kernel-rename.yml + COPY validate.py conftest.py) + +########################################################################################## +# +# Command line input +# +########################################################################################## + +add_test( + NAME rocprofv3-test-kernel-rename-cmd-line-execute + COMMAND + $ -M --sys-trace no --hsa-trace=0 + --hsa-core-trace=1 --hip-compiler-trace False --hip-runtime-trace --kernel-trace + --memory-copy-trace -d ${CMAKE_CURRENT_BINARY_DIR}/%argt%-trace-cmd-line -o out + --output-format pftrace json --log-level env --kernel-rename + --perfetto-shmem-size-hint 128 --perfetto-buffer-size 2048000 + --perfetto-buffer-fill-policy ring_buffer --perfetto-backend inprocess -- + $) + +set_tests_properties( + rocprofv3-test-kernel-rename-cmd-line-execute + PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT "${kernel-rename-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") + +add_test( + NAME rocprofv3-test-kernel-rename-cmd-line-validate + COMMAND + ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --json-input + ${CMAKE_CURRENT_BINARY_DIR}/transpose-trace-cmd-line/out_results.json + --pftrace-input + ${CMAKE_CURRENT_BINARY_DIR}/transpose-trace-cmd-line/out_results.pftrace) + +set(VALIDATION_FILES + ${CMAKE_CURRENT_BINARY_DIR}/transpose-trace-cmd-line/out_results.pftrace + ${CMAKE_CURRENT_BINARY_DIR}/transpose-trace-cmd-line/out_results.json) + +set_tests_properties( + rocprofv3-test-kernel-rename-cmd-line-validate + PROPERTIES TIMEOUT + 45 + LABELS + "integration-tests" + DEPENDS + "rocprofv3-test-kernel-rename-execute" + FAIL_REGULAR_EXPRESSION + "AssertionError" + ATTACHED_FILES_ON_FAIL + "${VALIDATION_FILES}") + +########################################################################################## +# +# YAML input +# +########################################################################################## + +add_test( + NAME rocprofv3-test-kernel-rename-inp-yaml-execute + COMMAND + $ -i + ${CMAKE_CURRENT_BINARY_DIR}/input-kernel-rename.yml -- $) + +set_tests_properties( + rocprofv3-test-kernel-rename-inp-yaml-execute + PROPERTIES TIMEOUT 45 LABELS "integration-tests" ENVIRONMENT "${kernel-rename-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") + +add_test( + NAME rocprofv3-test-kernel-rename-inp-yaml-validate + COMMAND + ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --json-input + ${CMAKE_CURRENT_BINARY_DIR}/transpose-trace-inp-yaml/out_results.json + --pftrace-input + ${CMAKE_CURRENT_BINARY_DIR}/transpose-trace-inp-yaml/out_results.pftrace) + +set(VALIDATION_FILES + ${CMAKE_CURRENT_BINARY_DIR}/transpose-trace-inp-yaml/out_results.pftrace + ${CMAKE_CURRENT_BINARY_DIR}/transpose-trace-inp-yaml/out_results.json) + +set_tests_properties( + rocprofv3-test-kernel-rename-inp-yaml-validate + PROPERTIES TIMEOUT + 45 + LABELS + "integration-tests" + DEPENDS + "rocprofv3-test-kernel-rename-inp-yaml-execute" + FAIL_REGULAR_EXPRESSION + "AssertionError" + ATTACHED_FILES_ON_FAIL + "${VALIDATION_FILES}") diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/kernel-rename/conftest.py b/projects/rocprofiler-sdk/tests/rocprofv3/kernel-rename/conftest.py new file mode 100644 index 0000000000..1f9db6b26c --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/kernel-rename/conftest.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 + +import pytest +import json + +from rocprofiler_sdk.pytest_utils.dotdict import dotdict +from rocprofiler_sdk.pytest_utils import collapse_dict_list +from rocprofiler_sdk.pytest_utils.perfetto_reader import PerfettoReader + + +def pytest_addoption(parser): + parser.addoption( + "--json-input", + action="store", + help="Path to JSON file.", + ) + parser.addoption( + "--pftrace-input", + action="store", + help="Path to Perfetto trace file.", + ) + + +@pytest.fixture +def json_data(request): + filename = request.config.getoption("--json-input") + with open(filename, "r") as inp: + return dotdict(collapse_dict_list(json.load(inp))) + + +@pytest.fixture +def pftrace_data(request): + filename = request.config.getoption("--pftrace-input") + return PerfettoReader(filename).read()[0] diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/kernel-rename/input-kernel-rename.yml b/projects/rocprofiler-sdk/tests/rocprofv3/kernel-rename/input-kernel-rename.yml new file mode 100644 index 0000000000..08e4255949 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/kernel-rename/input-kernel-rename.yml @@ -0,0 +1,20 @@ +jobs: + - mangled_kernels: True + sys_trace: False + hsa_trace: False + hsa_core_trace: True + hsa_amd_trace: False + hip_compiler_trace: False + hip_runtime_trace: True + kernel_trace: True + memory_copy_trace: True + marker_trace: False + output_directory: "@CMAKE_CURRENT_BINARY_DIR@/%argt%-trace-inp-yaml" + output_file: out + output_format: [pftrace, json] + log_level: env + kernel_rename: True + perfetto_shmem_size_hint: 128 + perfetto_buffer_size: 2048000 + perfetto_buffer_fill_policy: ring_buffer + perfetto_backend: inprocess diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/kernel-rename/pytest.ini b/projects/rocprofiler-sdk/tests/rocprofv3/kernel-rename/pytest.ini new file mode 100644 index 0000000000..5e1e1c14a0 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/kernel-rename/pytest.ini @@ -0,0 +1,5 @@ + +[pytest] +addopts = --durations=20 -rA -s -vv +testpaths = validate.py +pythonpath = @ROCPROFILER_SDK_TESTS_BINARY_DIR@/pytest-packages diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/kernel-rename/validate.py b/projects/rocprofiler-sdk/tests/rocprofv3/kernel-rename/validate.py new file mode 100644 index 0000000000..b4dd1d53fc --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/kernel-rename/validate.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python3 + +import sys +import pytest + + +def test_hsa_api_trace(json_data): + data = json_data["rocprofiler-sdk-tool"] + + def get_operation_name(kind_id, op_id): + return data["strings"]["buffer_records"][kind_id]["operations"][op_id] + + def get_kind_name(kind_id): + return data["strings"]["buffer_records"][kind_id]["kind"] + + valid_domain_names = ("HSA_CORE_API",) + + hsa_api_data = data["buffer_records"]["hsa_api"] + + functions = [] + for api in hsa_api_data: + kind = get_kind_name(api["kind"]) + assert kind in valid_domain_names + assert api["end_timestamp"] >= api["start_timestamp"] + functions.append(get_operation_name(api["kind"], api["operation"])) + + functions = list(set(functions)) + assert "hsa_amd_memory_async_copy_on_engine" not in functions + assert "hsa_signal_destroy" in functions + + +def test_hip_api_trace(json_data): + data = json_data["rocprofiler-sdk-tool"] + + def get_operation_name(kind_id, op_id): + return data["strings"]["buffer_records"][kind_id]["operations"][op_id] + + def get_kind_name(kind_id): + return data["strings"]["buffer_records"][kind_id]["kind"] + + valid_domain_names = ("HIP_RUNTIME_API",) + + hip_api_data = data["buffer_records"]["hip_api"] + + functions = [] + for api in hip_api_data: + kind = get_kind_name(api["kind"]) + assert kind in valid_domain_names + assert api["end_timestamp"] >= api["start_timestamp"] + functions.append(get_operation_name(api["kind"], api["operation"])) + + functions = list(set(functions)) + for itr in ( + "__hipPushCallConfiguration", + "__hipPopCallConfiguration", + "__hipRegisterFatBinary", + "__hipRegisterFunction", + ): + assert itr not in functions, f"{itr}" + + for itr in ( + "hipMallocAsync", + "hipMemcpyAsync", + "hipMemsetAsync", + "hipFreeAsync", + "hipLaunchKernel", + ): + assert itr in functions, f"{itr}" + + +def test_kernel_trace(json_data): + data = json_data["rocprofiler-sdk-tool"] + + def get_kernel_name(kernel_id): + return data["kernel_symbols"][kernel_id]["formatted_kernel_name"] + + def get_kernel_rename(corr_id): + for itr in data.strings.correlation_id.external: + if itr.key == corr_id: + return itr.value + return None + + def get_kind_name(kind_id): + return data["strings"]["buffer_records"][kind_id]["kind"] + + valid_kernel_names = ("run",) + kernel_dispatch_data = data["buffer_records"]["kernel_dispatch"] + for dispatch in kernel_dispatch_data: + assert get_kind_name(dispatch["kind"]) == "KERNEL_DISPATCH" + assert dispatch["correlation_id"]["internal"] > 0 + assert dispatch["correlation_id"]["external"] > 0 + + dispatch_info = dispatch["dispatch_info"] + assert dispatch_info["agent_id"]["handle"] > 0 + assert dispatch_info["queue_id"]["handle"] > 0 + assert dispatch_info["kernel_id"] > 0 + assert dispatch["end_timestamp"] >= dispatch["start_timestamp"] + + kernel_name = get_kernel_name(dispatch_info["kernel_id"]) + assert kernel_name not in valid_kernel_names + + external_corr_id = dispatch["correlation_id"]["external"] + assert external_corr_id > 0 + + kernel_rename = get_kernel_rename(external_corr_id) + assert kernel_rename is not None, f"{dispatch}" + assert kernel_rename in valid_kernel_names + + +def test_memory_copy_json_trace(json_data): + data = json_data["rocprofiler-sdk-tool"] + + buffer_records = data["buffer_records"] + agent_data = data["agents"] + memory_copy_data = buffer_records["memory_copy"] + + def get_kind_name(kind_id): + return data["strings"]["buffer_records"][kind_id]["kind"] + + def get_agent(node_id): + for agent in agent_data: + if agent["id"]["handle"] == node_id["handle"]: + return agent + return None + + assert len(memory_copy_data) == 12 + + for row in memory_copy_data: + src_agent = get_agent(row["src_agent_id"]) + dst_agent = get_agent(row["dst_agent_id"]) + assert get_kind_name(row["kind"]) == "MEMORY_COPY" + assert src_agent is not None, f"{row}" + assert dst_agent is not None, f"{row}" + assert row["correlation_id"]["internal"] > 0 + assert row["end_timestamp"] >= row["start_timestamp"] + + +def test_perfetto_data(pftrace_data, json_data): + import rocprofiler_sdk.tests.rocprofv3 as rocprofv3 + + rocprofv3.test_perfetto_data( + pftrace_data, json_data, ("hip", "hsa", "kernel", "memory_copy") + ) + + +if __name__ == "__main__": + exit_code = pytest.main(["-x", __file__] + sys.argv[1:]) + sys.exit(exit_code)