diff --git a/source/docs/how-to/using-rocprofv3.rst b/source/docs/how-to/using-rocprofv3.rst index c9d364d7c4..692e39f157 100644 --- a/source/docs/how-to/using-rocprofv3.rst +++ b/source/docs/how-to/using-rocprofv3.rst @@ -162,8 +162,10 @@ The following table lists the commonly used ``rocprofv3`` command-line options c | Specifies Perfetto shared memory size hint in KB. Default: 64 KB. * - Display - - | ``-L`` [BOOL] \| ``--list-avail`` [BOOL] |br| - - | Lists the PC sampling configurations and metrics available in the counter_defs.yaml file for counter collection. In earlier ``rocprof`` versions, this was known as ``--list-basic``, ``--list-derived``, and ``--list-counters``. + - | ``-L`` [BOOL] \| ``--list-avail`` [BOOL] |br| |br| + | ``--group-by-queue`` [BOOL] + - | Lists the PC sampling configurations and metrics available in the counter_defs.yaml file for counter collection. In earlier ``rocprof`` versions, this was known as ``--list-basic``, ``--list-derived``, and ``--list-counters``. |br| |br| + | For displaying the HSA Queues that kernels and memory copy operations are submitted to rather than the default grouping of HIP Streams for perfetto. * - Other - | ``--preload`` PRELOAD |br| |br| diff --git a/source/lib/output/buffered_output.hpp b/source/lib/output/buffered_output.hpp index d93e117f77..5012f8ac5a 100644 --- a/source/lib/output/buffered_output.hpp +++ b/source/lib/output/buffered_output.hpp @@ -182,10 +182,9 @@ using rocdecode_buffered_output_t = using rocjpeg_buffered_output_t = buffered_output; using kernel_dispatch_buffered_output_with_stream_t = - buffered_output; + buffered_output; using memory_copy_buffered_output_with_stream_t = - buffered_output; + buffered_output; using pc_sampling_stochastic_buffered_output_t = buffered_output; diff --git a/source/lib/output/generateCSV.cpp b/source/lib/output/generateCSV.cpp index 9b04320c5d..5e661e8541 100644 --- a/source/lib/output/generateCSV.cpp +++ b/source/lib/output/generateCSV.cpp @@ -249,10 +249,10 @@ generate_csv(const output_config& cfg, } void -generate_csv(const output_config& cfg, - const metadata& tool_metadata, - const generator& data, - const stats_entry_t& stats) +generate_csv(const output_config& cfg, + const metadata& tool_metadata, + const generator& data, + const stats_entry_t& stats) { if(data.empty()) return; @@ -399,10 +399,10 @@ generate_csv(const output_config& cfg, } void -generate_csv(const output_config& cfg, - const metadata& tool_metadata, - const generator& data, - const stats_entry_t& stats) +generate_csv(const output_config& cfg, + const metadata& tool_metadata, + const generator& data, + const stats_entry_t& stats) { if(data.empty()) return; diff --git a/source/lib/output/generateCSV.hpp b/source/lib/output/generateCSV.hpp index 7910d68545..c675311047 100644 --- a/source/lib/output/generateCSV.hpp +++ b/source/lib/output/generateCSV.hpp @@ -40,10 +40,10 @@ generate_csv(const output_config& cfg, std::vector& data); void -generate_csv(const output_config& cfg, - const metadata& tool_metadata, - const generator& data, - const stats_entry_t& stats); +generate_csv(const output_config& cfg, + const metadata& tool_metadata, + const generator& data, + const stats_entry_t& stats); void generate_csv(const output_config& cfg, @@ -58,10 +58,10 @@ generate_csv(const output_config& cfg, const stats_entry_t& stats); void -generate_csv(const output_config& cfg, - const metadata& tool_metadata, - const generator& data, - const stats_entry_t& stats); +generate_csv(const output_config& cfg, + const metadata& tool_metadata, + const generator& data, + const stats_entry_t& stats); void generate_csv(const output_config& cfg, diff --git a/source/lib/output/generateJSON.cpp b/source/lib/output/generateJSON.cpp index 89f38765ee..290e8c8773 100644 --- a/source/lib/output/generateJSON.cpp +++ b/source/lib/output/generateJSON.cpp @@ -186,11 +186,11 @@ void write_json(json_output& json_ar, const output_config& /*cfg*/, const metadata& /*tool_metadata*/, - const domain_stats_vec_t& domain_stats, - generator&& hip_api_gen, - generator hsa_api_gen, - generator kernel_dispatch_gen, - generator memory_copy_gen, + const domain_stats_vec_t& domain_stats, + generator&& hip_api_gen, + generator hsa_api_gen, + generator kernel_dispatch_gen, + generator memory_copy_gen, generator counter_collection_gen, generator marker_api_gen, generator scratch_memory_gen, diff --git a/source/lib/output/generateJSON.hpp b/source/lib/output/generateJSON.hpp index 97d9fbab0b..d7b4b3d58f 100644 --- a/source/lib/output/generateJSON.hpp +++ b/source/lib/output/generateJSON.hpp @@ -81,14 +81,14 @@ void write_json(json_output&, const output_config& cfg, const metadata& tool_metadata, uint64_t pid); void -write_json(json_output& json_ar, - const output_config& cfg, - const metadata& tool_metadata, - const domain_stats_vec_t& domain_stats, - generator&& hip_api_gen, - generator hsa_api_gen, - generator kernel_dispatch_gen, - generator memory_copy_gen, +write_json(json_output& json_ar, + const output_config& cfg, + const metadata& tool_metadata, + const domain_stats_vec_t& domain_stats, + generator&& hip_api_gen, + generator hsa_api_gen, + generator kernel_dispatch_gen, + generator memory_copy_gen, generator counter_collection_gen, generator marker_api_gen, generator scratch_memory_gen, diff --git a/source/lib/output/generateOTF2.cpp b/source/lib/output/generateOTF2.cpp index 37f18c0813..4c5c78f620 100644 --- a/source/lib/output/generateOTF2.cpp +++ b/source/lib/output/generateOTF2.cpp @@ -356,15 +356,15 @@ create_attribute_list() void write_otf2( - const output_config& cfg, - const metadata& tool_metadata, - uint64_t pid, - const std::vector& agent_data, - std::deque* hip_api_data, - std::deque* hsa_api_data, - std::deque* kernel_dispatch_data, - std::deque* memory_copy_data, - std::deque* marker_api_data, + const output_config& cfg, + const metadata& tool_metadata, + uint64_t pid, + const std::vector& agent_data, + std::deque* hip_api_data, + std::deque* hsa_api_data, + std::deque* kernel_dispatch_data, + std::deque* memory_copy_data, + std::deque* marker_api_data, std::deque* /*scratch_memory_data*/, std::deque* rccl_api_data, std::deque* memory_allocation_data, diff --git a/source/lib/output/generateOTF2.hpp b/source/lib/output/generateOTF2.hpp index f358c567f9..020b399af2 100644 --- a/source/lib/output/generateOTF2.hpp +++ b/source/lib/output/generateOTF2.hpp @@ -36,19 +36,19 @@ namespace tool { void write_otf2( - const output_config& cfg, - const metadata& tool_metadata, - uint64_t pid, - const std::vector& agent_data, - std::deque* hip_api_data, - std::deque* hsa_api_data, - std::deque* kernel_dispatch_data, - std::deque* memory_copy_data, - std::deque* marker_api_data, - std::deque* scratch_memory_data, - std::deque* rccl_api_data, - std::deque* memory_allocation_data, - std::deque* rocdecode_api_data, - std::deque* rocjpeg_api_data); + const output_config& cfg, + const metadata& tool_metadata, + uint64_t pid, + const std::vector& agent_data, + std::deque* hip_api_data, + std::deque* hsa_api_data, + std::deque* kernel_dispatch_data, + std::deque* memory_copy_data, + std::deque* marker_api_data, + std::deque* scratch_memory_data, + std::deque* rccl_api_data, + std::deque* memory_allocation_data, + std::deque* rocdecode_api_data, + std::deque* rocjpeg_api_data); } // namespace tool } // namespace rocprofiler diff --git a/source/lib/output/generatePerfetto.cpp b/source/lib/output/generatePerfetto.cpp index ea3aefc171..819722d4fc 100644 --- a/source/lib/output/generatePerfetto.cpp +++ b/source/lib/output/generatePerfetto.cpp @@ -65,15 +65,15 @@ get_hash_id(Tp&& _val) void write_perfetto( - const output_config& ocfg, - const metadata& tool_metadata, - std::vector agent_data, - const generator& hip_api_gen, - const generator& hsa_api_gen, - const generator& kernel_dispatch_gen, - const generator& memory_copy_gen, - const generator& counter_collection_gen, - const generator& marker_api_gen, + const output_config& ocfg, + const metadata& tool_metadata, + std::vector agent_data, + const generator& hip_api_gen, + const generator& hsa_api_gen, + const generator& kernel_dispatch_gen, + const generator& memory_copy_gen, + const generator& counter_collection_gen, + const generator& marker_api_gen, const generator& /*scratch_memory_gen*/, const generator& rccl_api_gen, const generator& memory_allocation_gen, @@ -133,10 +133,12 @@ write_perfetto( tracing_session->Setup(cfg); tracing_session->StartBlocking(); - - auto tids = std::set{}; - auto demangled = std::unordered_map{}; - auto agent_thread_ids = std::unordered_map>{}; + const auto is_hip_initialized = + tool_metadata.is_runtime_initialized(ROCPROFILER_RUNTIME_INITIALIZATION_HIP); + const auto group_by_queue = ocfg.group_by_queue || !is_hip_initialized; + auto tids = std::set{}; + auto demangled = std::unordered_map{}; + auto agent_thread_ids = std::unordered_map>{}; auto agent_thread_ids_alloc = std::unordered_map>{}; auto agent_queue_ids = std::unordered_map>{}; @@ -154,12 +156,7 @@ write_perfetto( auto agent_queue_tracks = std::unordered_map>{}; - auto agent_stream_compute_tracks = - std::unordered_map>{}; - auto agent_stream_copy_tracks = - std::unordered_map>{}; + auto stream_tracks = std::unordered_map{}; auto _get_agent = [&agent_data](rocprofiler_agent_id_t _id) -> const rocprofiler_agent_t* { for(const auto& itr : agent_data) @@ -194,7 +191,7 @@ write_perfetto( { tids.emplace(itr.thread_id); agent_stream_ids[itr.dst_agent_id].emplace(itr.stream_id); - if(ocfg.group_by_queue) + if(group_by_queue) { agent_thread_ids[itr.dst_agent_id].emplace(itr.thread_id); } @@ -212,7 +209,7 @@ write_perfetto( { tids.emplace(itr.thread_id); agent_stream_ids[itr.dispatch_info.agent_id].emplace(itr.stream_id); - if(ocfg.group_by_queue) + if(group_by_queue) { agent_queue_ids[itr.dispatch_info.agent_id].emplace(itr.dispatch_info.queue_id); } @@ -297,20 +294,11 @@ write_perfetto( { for(auto sitr : aitr.second) { - const auto* _agent = _get_agent(aitr.first); - const auto stream_id = sitr.handle; + const auto stream_id = sitr.handle; { auto _namess = std::stringstream{}; - _namess << "COMPUTE AGENT [" << _agent->logical_node_id << "] STREAM [" << stream_id - << "] "; - - if(_agent->type == ROCPROFILER_AGENT_TYPE_CPU) - _namess << "(CPU)"; - else if(_agent->type == ROCPROFILER_AGENT_TYPE_GPU) - _namess << "(GPU)"; - else - _namess << "(UNK)"; + _namess << fmt::format("STREAM [\" {} \"] ", stream_id); auto _track = ::perfetto::Track{get_hash_id(_namess.str())}; auto _desc = _track.Serialize(); @@ -318,28 +306,7 @@ write_perfetto( perfetto::TrackEvent::SetTrackDescriptor(_track, _desc); - agent_stream_compute_tracks[aitr.first].emplace(sitr, _track); - } - - { - auto _namess = std::stringstream{}; - _namess << "COPY to AGENT [" << _agent->logical_node_id << "] STREAM [" << stream_id - << "] "; - - if(_agent->type == ROCPROFILER_AGENT_TYPE_CPU) - _namess << "(CPU)"; - else if(_agent->type == ROCPROFILER_AGENT_TYPE_GPU) - _namess << "(GPU)"; - else - _namess << "(UNK)"; - - auto _track = ::perfetto::Track{get_hash_id(_namess.str())}; - auto _desc = _track.Serialize(); - _desc.set_name(_namess.str()); - - perfetto::TrackEvent::SetTrackDescriptor(_track, _desc); - - agent_stream_copy_tracks[aitr.first].emplace(sitr, _track); + stream_tracks.emplace(sitr, _track); } } } @@ -570,13 +537,13 @@ write_perfetto( auto name = buffer_names.at(itr.kind, itr.operation); ::perfetto::Track* _track = nullptr; - if(ocfg.group_by_queue) + if(group_by_queue) { _track = &agent_thread_tracks.at(itr.dst_agent_id).at(itr.thread_id); } else { - _track = &agent_stream_copy_tracks.at(itr.dst_agent_id).at(itr.stream_id); + _track = &stream_tracks.at(itr.stream_id); } TRACE_EVENT_BEGIN( @@ -640,7 +607,7 @@ write_perfetto( rocprofiler_agent_id_t, std::unordered_map< rocprofiler_queue_id_t, - std::vector>>{}; + std::vector>>{}; for(auto& itr : generator) { const auto& info = itr.dispatch_info; @@ -671,14 +638,13 @@ write_perfetto( auto name = std::string_view{sym->kernel_name}; ::perfetto::Track* _track = nullptr; - if(ocfg.group_by_queue) + if(group_by_queue) { _track = &agent_queue_tracks.at(info.agent_id).at(info.queue_id); } else { - _track = - &agent_stream_compute_tracks.at(info.agent_id).at((*it)->stream_id); + _track = &stream_tracks.at((*it)->stream_id); } // Temporary fix until timestamp issues are resolved: Set timestamps to be @@ -729,6 +695,11 @@ write_perfetto( .get_agent_index(agents_map.at(info.agent_id).id, ocfg.agent_index_value) .as_string("-"), + "agent_type", + tool_metadata + .get_agent_index(agents_map.at(info.agent_id).id, + ocfg.agent_index_value) + .type, "corr_id", current.correlation_id.internal, "queue", diff --git a/source/lib/output/generatePerfetto.hpp b/source/lib/output/generatePerfetto.hpp index c4626c5327..83a7c4c32b 100644 --- a/source/lib/output/generatePerfetto.hpp +++ b/source/lib/output/generatePerfetto.hpp @@ -37,13 +37,13 @@ namespace tool { void write_perfetto( - const output_config& cfg, - const metadata& tool_metadata, - std::vector agent_data, - const generator& hip_api_gen, - const generator& hsa_api_gen, - const generator& kernel_dispatch_gen, - const generator& memory_copy_gen, + const output_config& cfg, + const metadata& tool_metadata, + std::vector agent_data, + const generator& hip_api_gen, + const generator& hsa_api_gen, + const generator& kernel_dispatch_gen, + const generator& memory_copy_gen, const generator& counter_collection_gen, const generator& marker_api_gen, const generator& scratch_memory_gen, diff --git a/source/lib/output/generateStats.cpp b/source/lib/output/generateStats.cpp index 302ce33dc4..0333640c3f 100644 --- a/source/lib/output/generateStats.cpp +++ b/source/lib/output/generateStats.cpp @@ -63,8 +63,8 @@ get_stats(const stats_map_t& data_v) stats_entry_t generate_stats(const output_config& /*cfg*/, - const metadata& tool_metadata, - const generator& data) + const metadata& tool_metadata, + const generator& data) { auto kernel_stats = stats_map_t{}; for(auto ditr : data) @@ -119,8 +119,8 @@ generate_stats(const output_config& /*cfg*/, stats_entry_t generate_stats(const output_config& /*cfg*/, - const metadata& tool_metadata, - const generator& data) + const metadata& tool_metadata, + const generator& data) { auto memory_copy_stats = stats_map_t{}; for(auto ditr : data) diff --git a/source/lib/output/generateStats.hpp b/source/lib/output/generateStats.hpp index 87790d2a87..4d2b177f3b 100644 --- a/source/lib/output/generateStats.hpp +++ b/source/lib/output/generateStats.hpp @@ -32,9 +32,9 @@ namespace rocprofiler namespace tool { stats_entry_t -generate_stats(const output_config& cfg, - const metadata& tool_metadata, - const generator& data); +generate_stats(const output_config& cfg, + const metadata& tool_metadata, + const generator& data); stats_entry_t generate_stats(const output_config& cfg, @@ -47,9 +47,9 @@ generate_stats(const output_config& cfg const generator& data); stats_entry_t -generate_stats(const output_config& cfg, - const metadata& tool_metadata, - const generator& data); +generate_stats(const output_config& cfg, + const metadata& tool_metadata, + const generator& data); stats_entry_t generate_stats(const output_config& cfg, diff --git a/source/lib/output/metadata.cpp b/source/lib/output/metadata.cpp index 87cf7b9423..c3e40bc21a 100644 --- a/source/lib/output/metadata.cpp +++ b/source/lib/output/metadata.cpp @@ -475,6 +475,26 @@ metadata::add_external_correlation_id(uint64_t val) [](auto& _data, uint64_t _val) { return _data.emplace(_val).second; }, val); } +bool +metadata::add_runtime_initialization(rocprofiler_runtime_initialization_operation_t runtime_op) +{ + return runtime_initialization_set.wlock( + [](auto& _data, rocprofiler_runtime_initialization_operation_t _runtime_op) { + return _data.emplace(_runtime_op).second; + }, + runtime_op); +} + +bool +metadata::is_runtime_initialized(rocprofiler_runtime_initialization_operation_t runtime_op) const +{ + return runtime_initialization_set.rlock( + [](const auto& _data, rocprofiler_runtime_initialization_operation_t _runtime_op) { + return _data.count(_runtime_op) > 0; + }, + runtime_op); +} + std::string_view metadata::get_marker_message(uint64_t corr_id) const { diff --git a/source/lib/output/metadata.hpp b/source/lib/output/metadata.hpp index cdab00614f..2aa8ca2c4b 100644 --- a/source/lib/output/metadata.hpp +++ b/source/lib/output/metadata.hpp @@ -90,6 +90,8 @@ using synced_map = common::Synchronized; template using synced_obj = common::Synchronized; using pc_sampling_stats_t = rocprofiler_tool_pc_sampling_stats; +using runtime_initialization_set_t = + std::unordered_set; enum class agent_indexing { @@ -126,17 +128,18 @@ struct metadata agent_counter_info_map_t agent_counter_info = {}; agent_pc_sample_config_info_map_t agent_pc_sample_config_info = {}; - sdk::buffer_name_info buffer_names = {}; - sdk::callback_name_info callback_names = {}; - synced_map code_objects = {}; - synced_map kernel_symbols = {}; - synced_map marker_messages = {}; - synced_map string_entries = {}; - synced_map external_corr_ids = {}; - synced_map host_functions = {}; - synced_map code_object_load = {}; - att_filenames_map_t att_filenames = {}; - synced_obj pc_sampling_stats = {}; + sdk::buffer_name_info buffer_names = {}; + sdk::callback_name_info callback_names = {}; + synced_map code_objects = {}; + synced_map kernel_symbols = {}; + synced_map marker_messages = {}; + synced_map string_entries = {}; + synced_map external_corr_ids = {}; + synced_map host_functions = {}; + synced_map code_object_load = {}; + att_filenames_map_t att_filenames = {}; + synced_obj pc_sampling_stats = {}; + synced_obj runtime_initialization_set = {}; metadata() = default; metadata(inprocess); @@ -182,6 +185,7 @@ struct metadata bool add_host_function(host_function_info&& func); bool add_string_entry(size_t key, std::string_view str); bool add_external_correlation_id(uint64_t); + bool add_runtime_initialization(rocprofiler_runtime_initialization_operation_t); std::string_view get_marker_message(uint64_t corr_id) const; std::string_view get_kernel_name(uint64_t kernel_id, uint64_t rename_id) const; @@ -193,6 +197,7 @@ struct metadata rocprofiler_tracing_operation_t op) const; agent_index get_agent_index(rocprofiler_agent_id_t agent, agent_indexing index) const; const std::string* get_string_entry(size_t key) const; + bool is_runtime_initialized(rocprofiler_runtime_initialization_operation_t) const; private: bool inprocess_init = false; diff --git a/source/lib/output/stream_info.hpp b/source/lib/output/stream_info.hpp index 186e500c20..84c2614d67 100644 --- a/source/lib/output/stream_info.hpp +++ b/source/lib/output/stream_info.hpp @@ -35,56 +35,55 @@ namespace rocprofiler { namespace tool { -struct tool_buffer_tracing_kernel_dispatch_with_stream_record_t +struct tool_buffer_tracing_kernel_dispatch_ext_record_t : rocprofiler_buffer_tracing_kernel_dispatch_record_t { using base_type = rocprofiler_buffer_tracing_kernel_dispatch_record_t; - tool_buffer_tracing_kernel_dispatch_with_stream_record_t( - const base_type& _base, - const rocprofiler_stream_id_t& _stream_id, - const uint64_t& _kernel_rename_val) + tool_buffer_tracing_kernel_dispatch_ext_record_t(const base_type& _base, + const rocprofiler_stream_id_t& _stream_id, + const uint64_t& _kernel_rename_val) : base_type{_base} , stream_id{_stream_id} , kernel_rename_val{_kernel_rename_val} {} - tool_buffer_tracing_kernel_dispatch_with_stream_record_t(); - ~tool_buffer_tracing_kernel_dispatch_with_stream_record_t() = default; - tool_buffer_tracing_kernel_dispatch_with_stream_record_t( - const tool_buffer_tracing_kernel_dispatch_with_stream_record_t&) = default; - tool_buffer_tracing_kernel_dispatch_with_stream_record_t( - tool_buffer_tracing_kernel_dispatch_with_stream_record_t&&) noexcept = default; - tool_buffer_tracing_kernel_dispatch_with_stream_record_t& operator =( - const tool_buffer_tracing_kernel_dispatch_with_stream_record_t&) = default; - tool_buffer_tracing_kernel_dispatch_with_stream_record_t& operator =( - tool_buffer_tracing_kernel_dispatch_with_stream_record_t&&) noexcept = default; + tool_buffer_tracing_kernel_dispatch_ext_record_t(); + ~tool_buffer_tracing_kernel_dispatch_ext_record_t() = default; + tool_buffer_tracing_kernel_dispatch_ext_record_t( + const tool_buffer_tracing_kernel_dispatch_ext_record_t&) = default; + tool_buffer_tracing_kernel_dispatch_ext_record_t( + tool_buffer_tracing_kernel_dispatch_ext_record_t&&) noexcept = default; + tool_buffer_tracing_kernel_dispatch_ext_record_t& operator =( + const tool_buffer_tracing_kernel_dispatch_ext_record_t&) = default; + tool_buffer_tracing_kernel_dispatch_ext_record_t& operator =( + tool_buffer_tracing_kernel_dispatch_ext_record_t&&) noexcept = default; rocprofiler_stream_id_t stream_id = {}; uint64_t kernel_rename_val = {}; }; -struct tool_buffer_tracing_memory_copy_with_stream_record_t +struct tool_buffer_tracing_memory_copy_ext_record_t : rocprofiler_buffer_tracing_memory_copy_record_t { using base_type = rocprofiler_buffer_tracing_memory_copy_record_t; - tool_buffer_tracing_memory_copy_with_stream_record_t(const base_type& _base, - const rocprofiler_stream_id_t& _stream_id) + tool_buffer_tracing_memory_copy_ext_record_t(const base_type& _base, + const rocprofiler_stream_id_t& _stream_id) : base_type{_base} , stream_id{_stream_id} {} - tool_buffer_tracing_memory_copy_with_stream_record_t(); - ~tool_buffer_tracing_memory_copy_with_stream_record_t() = default; - tool_buffer_tracing_memory_copy_with_stream_record_t( - const tool_buffer_tracing_memory_copy_with_stream_record_t&) = default; - tool_buffer_tracing_memory_copy_with_stream_record_t( - tool_buffer_tracing_memory_copy_with_stream_record_t&&) noexcept = default; - tool_buffer_tracing_memory_copy_with_stream_record_t& operator =( - const tool_buffer_tracing_memory_copy_with_stream_record_t&) = default; - tool_buffer_tracing_memory_copy_with_stream_record_t& operator =( - tool_buffer_tracing_memory_copy_with_stream_record_t&&) noexcept = default; + tool_buffer_tracing_memory_copy_ext_record_t(); + ~tool_buffer_tracing_memory_copy_ext_record_t() = default; + tool_buffer_tracing_memory_copy_ext_record_t( + const tool_buffer_tracing_memory_copy_ext_record_t&) = default; + tool_buffer_tracing_memory_copy_ext_record_t( + tool_buffer_tracing_memory_copy_ext_record_t&&) noexcept = default; + tool_buffer_tracing_memory_copy_ext_record_t& operator =( + const tool_buffer_tracing_memory_copy_ext_record_t&) = default; + tool_buffer_tracing_memory_copy_ext_record_t& operator =( + tool_buffer_tracing_memory_copy_ext_record_t&&) noexcept = default; rocprofiler_stream_id_t stream_id = {}; }; @@ -97,8 +96,8 @@ namespace cereal template void -save(ArchiveT& ar, - const ::rocprofiler::tool::tool_buffer_tracing_kernel_dispatch_with_stream_record_t& data) +save(ArchiveT& ar, + const ::rocprofiler::tool::tool_buffer_tracing_kernel_dispatch_ext_record_t& data) { cereal::save(ar, static_cast(data)); SAVE_DATA_FIELD(stream_id); @@ -107,8 +106,7 @@ save(ArchiveT& template void -save(ArchiveT& ar, - const ::rocprofiler::tool::tool_buffer_tracing_memory_copy_with_stream_record_t& data) +save(ArchiveT& ar, const ::rocprofiler::tool::tool_buffer_tracing_memory_copy_ext_record_t& data) { cereal::save(ar, static_cast(data)); SAVE_DATA_FIELD(stream_id); diff --git a/source/lib/rocprofiler-sdk-tool/stream_stack.cpp b/source/lib/rocprofiler-sdk-tool/stream_stack.cpp index 8cc1cf3139..db3eadc56e 100644 --- a/source/lib/rocprofiler-sdk-tool/stream_stack.cpp +++ b/source/lib/rocprofiler-sdk-tool/stream_stack.cpp @@ -60,7 +60,8 @@ pop_stream_id() rocprofiler_stream_id_t get_stream_id() { - return CHECK_NOTNULL(get_stream_stack())->back(); + return (CHECK_NOTNULL(get_stream_stack())->empty()) ? rocprofiler_stream_id_t{.handle = 0} + : get_stream_stack()->back(); } bool diff --git a/source/lib/rocprofiler-sdk-tool/tool.cpp b/source/lib/rocprofiler-sdk-tool/tool.cpp index b4645ae0d0..9f9b09368b 100644 --- a/source/lib/rocprofiler-sdk-tool/tool.cpp +++ b/source/lib/rocprofiler-sdk-tool/tool.cpp @@ -249,6 +249,7 @@ using kernel_rename_stack_t = std::stack; auto* tool_metadata = as_pointer(tool::metadata::inprocess{}); auto target_kernels = common::Synchronized{}; std::mutex att_shader_data; +auto counter_collection_ctx = rocprofiler_context_id_t{0}; thread_local auto thread_dispatch_rename = as_pointer(); thread_local auto thread_dispatch_rename_dtor = common::scope_destructor{[]() { @@ -262,25 +263,6 @@ struct kernel_rename_and_stream_display_pair uint64_t kernel_rename_val{0}; rocprofiler_stream_id_t stream_id{.handle = 0}; }; -auto kernel_rename_and_stream_display_pair_dtors = - new std::vector{}; - -auto -get_kernel_rename_and_stream_display_pair_lock() -{ - static auto _mutex = std::mutex{}; - return std::unique_lock{_mutex}; -} - -void -add_kernel_rename_and_stream_display_pairs(kernel_rename_and_stream_display_pair* ptr) -{ - auto lock = get_kernel_rename_and_stream_display_pair_lock(); - if(ptr != nullptr && kernel_rename_and_stream_display_pair_dtors != nullptr) - { - kernel_rename_and_stream_display_pair_dtors->emplace_back(ptr); - } -} bool add_kernel_target(uint64_t _kern_id, const std::unordered_set& range) @@ -455,10 +437,19 @@ set_kernel_rename_and_stream_display_correlation_id( tool::get_config().kernel_rename && thread_dispatch_rename != nullptr && !thread_dispatch_rename->empty() && kind == ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KERNEL_DISPATCH; - const bool hip_stream_display_enabled = - !tool::get_config().group_by_queue && - kernel_rename_and_stream_display_pair_dtors != nullptr && - !rocprofiler::tool::stream::stream_stack_empty(); + const bool hip_stream_display_enabled = !tool::get_config().group_by_queue; + if(kernel_rename_service_enabled && ctx_id == counter_collection_ctx) + { + auto val = thread_dispatch_rename->top(); + if(tool_metadata) tool_metadata->add_external_correlation_id(val); + external_corr_id->value = val; + return 0; + } + // End early if counter collection context and no kernel rename service is enabled + if(ctx_id == counter_collection_ctx) + { + return 0; + } kernel_rename_and_stream_display_pair* kernel_rename_and_stream_display_vals = nullptr; if(kernel_rename_service_enabled || hip_stream_display_enabled) @@ -475,12 +466,11 @@ set_kernel_rename_and_stream_display_correlation_id( // Get stream ID from stream HIP display service if(hip_stream_display_enabled && kernel_rename_and_stream_display_vals != nullptr) { - auto stream_id = rocprofiler::tool::stream::get_stream_id(); + auto stream_id = tool::stream::get_stream_id(); kernel_rename_and_stream_display_vals->stream_id = stream_id; } // Set the external correlation id service to point to struct external_corr_id->ptr = kernel_rename_and_stream_display_vals; - add_kernel_rename_and_stream_display_pairs(kernel_rename_and_stream_display_vals); common::consume_args(thr_id, ctx_id, kind, op, internal_corr_id, user_data); @@ -615,6 +605,24 @@ hip_stream_display_callback(rocprofiler_callback_tracing_record_t record, common::consume_args(user_data, data); } +// Stores which runtimes have been initialized in metadata +void +runtime_initialization_callback(rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t* user_data, + void* data) +{ + if(record.kind != ROCPROFILER_CALLBACK_TRACING_RUNTIME_INITIALIZATION) return; + ROCP_CI_LOG_IF(WARNING, tool_metadata == nullptr) + << fmt::format("tool cannot record runtime initialization for {}", + tool_metadata->get_operation_name(record.kind, record.operation)); + if(tool_metadata) + { + tool_metadata->add_runtime_initialization( + static_cast(record.operation)); + } + common::consume_args(user_data, data); +} + void callback_tracing_callback(rocprofiler_callback_tracing_record_t record, rocprofiler_user_data_t* user_data, @@ -942,8 +950,10 @@ buffered_tracing_callback(rocprofiler_context_id_t /*context*/, record->correlation_id.external.ptr); stream_id = kernel_stream_pair_ptr->stream_id; kernel_rename_val = kernel_stream_pair_ptr->kernel_rename_val; + delete kernel_stream_pair_ptr; + record->correlation_id.external.value = kernel_rename_val; } - rocprofiler::tool::tool_buffer_tracing_kernel_dispatch_with_stream_record_t + rocprofiler::tool::tool_buffer_tracing_kernel_dispatch_ext_record_t record_with_stream{*record, stream_id, kernel_rename_val}; tool::write_ring_buffer(record_with_stream, domain_type::KERNEL_DISPATCH); } @@ -971,9 +981,11 @@ buffered_tracing_callback(rocprofiler_context_id_t /*context*/, static_cast( record->correlation_id.external.ptr); stream_id = kernel_stream_pair_ptr->stream_id; + delete kernel_stream_pair_ptr; + record->correlation_id.external.ptr = nullptr; } - rocprofiler::tool::tool_buffer_tracing_memory_copy_with_stream_record_t - record_with_stream{*record, stream_id}; + rocprofiler::tool::tool_buffer_tracing_memory_copy_ext_record_t record_with_stream{ + *record, stream_id}; tool::write_ring_buffer(record_with_stream, domain_type::MEMORY_COPY); } else if(header->kind == ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION) @@ -1366,16 +1378,10 @@ counter_record_callback(rocprofiler_dispatch_counting_service_data_t dispatch_da auto counter_record = tool::tool_counter_record_t{}; - counter_record.dispatch_data = dispatch_data; - counter_record.thread_id = user_data.value; - if(dispatch_data.correlation_id.external.ptr != nullptr) - { - // Extract the kernel id - auto* kernel_stream_pair_ptr = static_cast( - dispatch_data.correlation_id.external.ptr); - counter_record.kernel_rename_val = kernel_stream_pair_ptr->kernel_rename_val; - } - + counter_record.dispatch_data = dispatch_data; + counter_record.thread_id = user_data.value; + counter_record.kernel_rename_val = dispatch_data.correlation_id.external.value; + counter_record.dispatch_data.correlation_id.external.value = counter_record.kernel_rename_val; auto serialized_records = std::vector{}; serialized_records.reserve(record_count); @@ -1805,10 +1811,16 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) if(tool::get_config().counter_collection) { + ROCPROFILER_CALL(rocprofiler_create_context(&counter_collection_ctx), + "failed to create context"); ROCPROFILER_CALL( - rocprofiler_configure_callback_dispatch_counting_service( - get_client_ctx(), dispatch_callback, nullptr, counter_record_callback, nullptr), + rocprofiler_configure_callback_dispatch_counting_service(counter_collection_ctx, + dispatch_callback, + nullptr, + counter_record_callback, + nullptr), "Could not setup counting service"); + ROCPROFILER_CALL(rocprofiler_start_context(counter_collection_ctx), "start context failed"); } if(tool::get_config().rocdecode_api_trace) @@ -1873,6 +1885,7 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) } if(!tool::get_config().group_by_queue) { + // Track stream ID information via callback service auto hip_stream_display_ctx = rocprofiler_context_id_t{0}; ROCPROFILER_CALL(rocprofiler_create_context(&hip_stream_display_ctx), @@ -1887,6 +1900,21 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) nullptr), "stream tracing configure failed"); ROCPROFILER_CALL(rocprofiler_start_context(hip_stream_display_ctx), "start context failed"); + + // Track if HIP runtime has been initialized via runtime_intialization service + auto runtime_initialization_ctx = rocprofiler_context_id_t{0}; + ROCPROFILER_CALL(rocprofiler_create_context(&runtime_initialization_ctx), + "failed to create context"); + ROCPROFILER_CALL(rocprofiler_configure_callback_tracing_service( + runtime_initialization_ctx, + ROCPROFILER_CALLBACK_TRACING_RUNTIME_INITIALIZATION, + nullptr, + 0, + runtime_initialization_callback, + nullptr), + "stream tracing configure failed"); + ROCPROFILER_CALL(rocprofiler_start_context(runtime_initialization_ctx), + "start context failed"); } if(tool::get_config().kernel_rename || !tool::get_config().group_by_queue) { @@ -1902,6 +1930,21 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) set_kernel_rename_and_stream_display_correlation_id, nullptr), "Could not configure external correlation id request service"); + + if(tool::get_config().counter_collection) + { + auto counter_external_corr_id_request_kinds = + std::array{ + ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KERNEL_DISPATCH}; + + ROCPROFILER_CALL(rocprofiler_configure_external_correlation_id_request_service( + counter_collection_ctx, + counter_external_corr_id_request_kinds.data(), + counter_external_corr_id_request_kinds.size(), + set_kernel_rename_and_stream_display_correlation_id, + nullptr), + "Could not configure external correlation id request service"); + } } if(tool::get_config().pc_sampling_host_trap) @@ -2229,17 +2272,6 @@ tool_fini(void* /*tool_data*/) run_cleanup(); - if(kernel_rename_and_stream_display_pair_dtors != nullptr) - { - for(auto& itr : *kernel_rename_and_stream_display_pair_dtors) - { - delete itr; - itr = nullptr; - } - delete kernel_rename_and_stream_display_pair_dtors; - kernel_rename_and_stream_display_pair_dtors = nullptr; - } - if(destructors) { for(const auto& itr : *destructors) diff --git a/source/lib/rocprofiler-sdk/hip/stream.cpp b/source/lib/rocprofiler-sdk/hip/stream.cpp index 2b527f798c..9469239e14 100644 --- a/source/lib/rocprofiler-sdk/hip/stream.cpp +++ b/source/lib/rocprofiler-sdk/hip/stream.cpp @@ -34,7 +34,6 @@ #include "lib/rocprofiler-sdk/hsa/queue_controller.hpp" #include "lib/rocprofiler-sdk/registration.hpp" #include "lib/rocprofiler-sdk/tracing/tracing.hpp" -#include "rocprofiler-sdk/hip/compiler_api_id.h" #include #include @@ -367,28 +366,11 @@ enable_stream_stack() { if(itr->is_tracing_one_of(ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY, ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API, - ROCPROFILER_CALLBACK_TRACING_HIP_COMPILER_API, ROCPROFILER_CALLBACK_TRACING_HIP_STREAM, ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, ROCPROFILER_BUFFER_TRACING_HIP_RUNTIME_API, - ROCPROFILER_BUFFER_TRACING_HIP_COMPILER_API, ROCPROFILER_BUFFER_TRACING_HIP_STREAM, - ROCPROFILER_BUFFER_TRACING_HIP_RUNTIME_API_EXT, - ROCPROFILER_BUFFER_TRACING_HIP_COMPILER_API_EXT)) - return true; - } - - return false; -} - -bool -enable_compiler_stream_stack() -{ - for(const auto& itr : context::get_registered_contexts()) - { - if(itr->is_tracing_one_of(ROCPROFILER_CALLBACK_TRACING_HIP_COMPILER_API, - ROCPROFILER_BUFFER_TRACING_HIP_COMPILER_API, - ROCPROFILER_BUFFER_TRACING_HIP_COMPILER_API_EXT)) + ROCPROFILER_BUFFER_TRACING_HIP_RUNTIME_API_EXT)) return true; } @@ -415,11 +397,6 @@ update_table(Tp* _orig, std::integral_constant) ROCP_TRACE << "updating table entry for " << _info.name; constexpr auto num_args = function_args_type::size(); - constexpr auto is_hip_pop_call_config_func = - std::is_same::value && - (static_cast(info_type::operation_idx) == - ROCPROFILER_HIP_COMPILER_API_ID___hipPopCallConfiguration); if constexpr(common::mpl::is_one_of::value) { @@ -433,12 +410,6 @@ update_table(Tp* _orig, std::integral_constant) static_assert(stream_idx == (num_args - rstream_idx - 1), "function has more than one stream argument"); - // don't wrap the compiler API functions unless HIP compiler API tracing is enabled - if constexpr(TableIdx == ROCPROFILER_HIP_TABLE_ID_Compiler) - { - if(!enable_compiler_stream_stack()) return; - } - // 1. get the sub-table containing the function pointer in original table // 2. get reference to function pointer in sub-table in original table // 3. update function pointer with wrapper @@ -460,8 +431,7 @@ update_table(Tp* _orig, std::integral_constant) _func = create_read_functor(_func); } } - else if constexpr(common::mpl::is_one_of::value && - !is_hip_pop_call_config_func) + else if constexpr(common::mpl::is_one_of::value) { constexpr auto stream_idx = common::mpl::index_of::value; @@ -474,12 +444,6 @@ update_table(Tp* _orig, std::integral_constant) static_assert(stream_idx == (num_args - rstream_idx - 1), "function has more than one stream argument"); - // don't wrap the compiler API functions unless HIP compiler API tracing is enabled - if constexpr(TableIdx == ROCPROFILER_HIP_TABLE_ID_Compiler) - { - if(!enable_compiler_stream_stack()) return; - } - ROCP_INFO << _info.name << " has been designated as a stream create function"; // 1. get the sub-table containing the function pointer in original table @@ -541,7 +505,6 @@ using hip_op_args_cb_t = rocprofiler_callback_tracing_operation_args_cb_t; template void update_table(TABLE_TYPE * _tbl); INSTANTIATE_HIP_TABLE_FUNC(hip_runtime_api_table_t, ROCPROFILER_HIP_TABLE_ID_Runtime) -INSTANTIATE_HIP_TABLE_FUNC(hip_compiler_api_table_t, ROCPROFILER_HIP_TABLE_ID_Compiler) } // namespace stream } // namespace hip } // namespace rocprofiler diff --git a/source/lib/rocprofiler-sdk/hip/stream.hpp b/source/lib/rocprofiler-sdk/hip/stream.hpp index 3b46405720..5f079fe682 100644 --- a/source/lib/rocprofiler-sdk/hip/stream.hpp +++ b/source/lib/rocprofiler-sdk/hip/stream.hpp @@ -36,8 +36,7 @@ namespace hip { namespace stream { -using hip_compiler_api_table_t = HipCompilerDispatchTable; -using hip_runtime_api_table_t = HipDispatchTable; +using hip_runtime_api_table_t = HipDispatchTable; rocprofiler_stream_id_t get_stream_id(); diff --git a/source/lib/rocprofiler-sdk/hsa/queue.cpp b/source/lib/rocprofiler-sdk/hsa/queue.cpp index 6483df24dd..c9d977205a 100644 --- a/source/lib/rocprofiler-sdk/hsa/queue.cpp +++ b/source/lib/rocprofiler-sdk/hsa/queue.cpp @@ -253,6 +253,14 @@ WriteInterceptor(const void* packets, tracing::populate_contexts(ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH, ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH, tracing_data_v); + // these are for the services (dispatch counter collection, pc sampling, ATT) which use + // the queue/queue_controller callback mechanism + const auto queue_callback_context_filter = [](const context::context* ctx) { + return (ctx->counter_collection || ctx->pc_sampler || ctx->dispatch_thread_trace); + }; + + for(const auto* itr : context::get_active_contexts(queue_callback_context_filter)) + tracing_data_v.external_correlation_ids.emplace(itr, tracing::empty_user_data); const auto* packets_arr = static_cast(packets); auto transformed_packets = std::vector{}; diff --git a/source/lib/rocprofiler-sdk/registration.cpp b/source/lib/rocprofiler-sdk/registration.cpp index b3dbd9d5d4..1a10b55a8e 100644 --- a/source/lib/rocprofiler-sdk/registration.cpp +++ b/source/lib/rocprofiler-sdk/registration.cpp @@ -858,9 +858,6 @@ rocprofiler_set_api_table(const char* name, // install rocprofiler API wrappers rocprofiler::hip::update_table(hip_compiler_api_table); - // install HIP stream deduction wrappers - rocprofiler::hip::stream::update_table(hip_compiler_api_table); - // allow tools to install API wrappers rocprofiler::intercept_table::notify_intercept_table_registration( ROCPROFILER_HIP_COMPILER_TABLE, diff --git a/tests/bin/CMakeLists.txt b/tests/bin/CMakeLists.txt index 05a6d0cfb8..f8ddfeac0d 100644 --- a/tests/bin/CMakeLists.txt +++ b/tests/bin/CMakeLists.txt @@ -38,3 +38,4 @@ if(rocJPEG_FOUND AND rocJPEG_VERSION VERSION_GREATER 0.6.0) add_subdirectory(rocjpeg) endif() add_subdirectory(hsa-code-object) +add_subdirectory(hip-streams) diff --git a/tests/bin/hip-streams/CMakeLists.txt b/tests/bin/hip-streams/CMakeLists.txt new file mode 100644 index 0000000000..0fa506c53f --- /dev/null +++ b/tests/bin/hip-streams/CMakeLists.txt @@ -0,0 +1,41 @@ +# +# +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +if(NOT CMAKE_HIP_COMPILER) + find_program( + amdclangpp_EXECUTABLE + NAMES amdclang++ + HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATH_SUFFIXES bin llvm/bin NO_CACHE) + mark_as_advanced(amdclangpp_EXECUTABLE) + + if(amdclangpp_EXECUTABLE) + set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}") + endif() +endif() + +project(rocprofiler-sdk-tests-bin-hip-streams LANGUAGES CXX HIP) + +foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO) + if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "") + set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}") + endif() +endforeach() + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_HIP_STANDARD 17) +set(CMAKE_HIP_EXTENSIONS OFF) +set(CMAKE_HIP_STANDARD_REQUIRED ON) + +set_source_files_properties(compute_comm_overlap.cpp PROPERTIES LANGUAGE HIP) +add_executable(hip-streams) +target_sources(hip-streams PRIVATE compute_comm_overlap.cpp) +target_link_libraries(hip-streams PRIVATE rocprofiler-sdk::tests-build-flags) + +find_package(Threads REQUIRED) +target_link_libraries(hip-streams PRIVATE Threads::Threads) diff --git a/tests/bin/hip-streams/compute_comm_overlap.cpp b/tests/bin/hip-streams/compute_comm_overlap.cpp new file mode 100644 index 0000000000..6b3dbad2e9 --- /dev/null +++ b/tests/bin/hip-streams/compute_comm_overlap.cpp @@ -0,0 +1,138 @@ +#include +#include +#include +#include "hip/hip_runtime.h" + +#define BLOCKDIM 64 + +/* Macro for checking GPU API return values */ +#define HIP_ASSERT(call) \ + do \ + { \ + hipError_t gpuErr = call; \ + if(hipSuccess != gpuErr) \ + { \ + printf( \ + "GPU API Error - %s:%d: '%s'\n", __FILE__, __LINE__, hipGetErrorString(gpuErr)); \ + exit(1); \ + } \ + } while(0) + +// HIP kernel. Each thread takes care of one element of input +__global__ void +cube(double* input, double* output, size_t offset, size_t elements_per_stream) +{ + size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + size_t gstride = blockDim.x * gridDim.x; + + // Span all elements assigned to this stream + for(size_t id = tid + offset; id < offset + elements_per_stream; id += gstride) + for(size_t i = 0; i < 1000; ++i) + output[id] = input[id] * input[id] * input[id]; +} + +int +main() +{ + // number of streams + const int num_streams = 8; + + // Number of threads in each thread block + const int blockSize = 512; + + // Size of vectors + int n = 100000000; + int elements_per_stream = n / num_streams; + int bytes_per_stream = elements_per_stream * sizeof(double); + + // Host input vectors + double* h_input1{nullptr}; + // Host output vector + double* h_output1{nullptr}; + + // Device input vectors + double* d_input1{nullptr}; + // Device output vector + double* d_output1{nullptr}; + + // Creating events for timers + hipEvent_t start{}, stop{}; + HIP_ASSERT(hipEventCreate(&start)); + HIP_ASSERT(hipEventCreate(&stop)); + + // Creating streams + hipStream_t streams[num_streams]; + for(int i = 0; i < num_streams; ++i) + { + HIP_ASSERT(hipStreamCreate(&streams[i])); + } + + // Size, in bytes, of each vector + size_t bytes = n * sizeof(double); + + // Allocate page locked memory for these vectors on host + HIP_ASSERT(hipHostMalloc(&h_input1, bytes)); + HIP_ASSERT(hipHostMalloc(&h_output1, bytes)); + + // Allocate memory for each vector on GPU + HIP_ASSERT(hipMalloc(&d_input1, bytes)); + HIP_ASSERT(hipMalloc(&d_output1, bytes)); + + // Initialize vectors on host + for(int i = 0; i < n; i++) + { + h_input1[i] = sin(i); + } + + // Number of thread blocks in grid + const int gridSizePerStream = 104; //(int)ceil((float)elements_per_stream/blockSize); + + HIP_ASSERT(hipEventRecord(start)); + // split H2D copies and kernel calls into separate loops + for(int i = 0; i < num_streams; i++) + { + int offset = i * elements_per_stream; + HIP_ASSERT(hipMemcpyAsync(&d_input1[offset], + &h_input1[offset], + bytes_per_stream, + hipMemcpyHostToDevice, + streams[i])); + } + for(int i = 0; i < num_streams; i++) + { + int offset = i * elements_per_stream; + cube<<>>( + d_input1, d_output1, offset, elements_per_stream); + } + for(int i = 0; i < num_streams; i++) + { + int offset = i * elements_per_stream; + HIP_ASSERT(hipMemcpyAsync(&h_output1[offset], + &d_output1[offset], + bytes_per_stream, + hipMemcpyDeviceToHost, + streams[i])); + } + + HIP_ASSERT(hipEventRecord(stop)); + HIP_ASSERT(hipEventSynchronize(stop)); + + float milliseconds = 0; + HIP_ASSERT(hipEventElapsedTime(&milliseconds, start, stop)); + + // Release device memory + HIP_ASSERT(hipFree(d_input1)); + HIP_ASSERT(hipFree(d_output1)); + + // Release host memory + HIP_ASSERT(hipHostFree(h_input1)); + HIP_ASSERT(hipHostFree(h_output1)); + + // Destroy streams + for(int i = 0; i < num_streams; ++i) + { + HIP_ASSERT(hipStreamDestroy(streams[i])); + } + + return 0; +} diff --git a/tests/rocprofv3/hip-stream-display/CMakeLists.txt b/tests/rocprofv3/hip-stream-display/CMakeLists.txt index 782d6d4ce6..34c6721e59 100644 --- a/tests/rocprofv3/hip-stream-display/CMakeLists.txt +++ b/tests/rocprofv3/hip-stream-display/CMakeLists.txt @@ -29,7 +29,6 @@ project( VERSION 0.0.0) find_package(rocprofiler-sdk REQUIRED) -find_package(rocDecode) rocprofiler_configure_pytest_files(CONFIG pytest.ini COPY validate.py conftest.py) @@ -41,9 +40,9 @@ set(hip-stream-display-env "${PRELOAD_ENV}") add_test( NAME rocprofv3-test-hip-stream-display-execute COMMAND - $ --kernel-rename -d + $ --kernel-rename -s -d ${CMAKE_CURRENT_BINARY_DIR}/%tag%-trace -o out --output-format json pftrace - --log-level env -- $) + --log-level env -- $) set_tests_properties( rocprofv3-test-hip-stream-display-execute @@ -56,14 +55,14 @@ set_tests_properties( FAIL_REGULAR_EXPRESSION "threw an exception" DISABLED - $>) + $>) add_test( NAME rocprofv3-test-hip-stream-display-validate COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --json-input - ${CMAKE_CURRENT_BINARY_DIR}/hip-stream-display/out_results.json --pftrace-input - ${CMAKE_CURRENT_BINARY_DIR}/hip-stream-display/out_results.pftrace) + ${CMAKE_CURRENT_BINARY_DIR}/hip-streams-trace/out_results.json --pftrace-input + ${CMAKE_CURRENT_BINARY_DIR}/hip-streams-trace/out_results.pftrace) set_tests_properties( rocprofv3-test-hip-stream-display-validate @@ -76,4 +75,4 @@ set_tests_properties( FAIL_REGULAR_EXPRESSION "AssertionError" DISABLED - $>) + $>) diff --git a/tests/rocprofv3/hip-stream-display/validate.py b/tests/rocprofv3/hip-stream-display/validate.py index c84c9303e2..f5c9ea2c48 100644 --- a/tests/rocprofv3/hip-stream-display/validate.py +++ b/tests/rocprofv3/hip-stream-display/validate.py @@ -54,15 +54,17 @@ def test_stream_trace(json_data): buffer_records = data["buffer_records"] kernel_dispatch_data = buffer_records["kernel_dispatch"] - memory_copies_data = buffer_records["memory_copies"] + memory_copies_data = buffer_records["memory_copy"] assert len(kernel_dispatch_data) > 0 assert len(memory_copies_data) > 0 - # Expect stream ids to be set to 1 or 2 for transpose executable - expected_stream_ids = set((1, 2)) + # Expect stream ids to be set between 1 and 8 inclusive for transpose executable + expected_stream_ids = set([i for i in range(1, 9)]) + # check buffering data for titr in (kernel_dispatch_data, memory_copies_data): - for node in rocdecode_data: + stream_id_set = set() + for node in titr: assert "size" in node assert "kind" in node assert "operation" in node @@ -70,7 +72,7 @@ def test_stream_trace(json_data): assert "end_timestamp" in node assert "start_timestamp" in node assert "thread_id" in node - assert "_stream_id" in node + assert "stream_id" in node assert node.size > 0 assert node.thread_id > 0 @@ -78,13 +80,15 @@ def test_stream_trace(json_data): assert node.end_timestamp > 0 assert node.start_timestamp < node.end_timestamp - assert node._stream_id.handle in expected_stream_ids + stream_id = node.stream_id.handle + stream_id_set.add(stream_id) + assert stream_id_set == expected_stream_ids def test_perfetto_data(pftrace_data, json_data): import rocprofiler_sdk.tests.rocprofv3 as rocprofv3 - assert pftrace_data != None + assert pftrace_data.empty == False rocprofv3.test_perfetto_data( pftrace_data, json_data,