Fix HIP Streams Duplication Error (#313)

* Fix stream duplication and fixed tests

* Added comments to explain stream.cpp code, change stream nullptr check to occur in update table to prevent readding null stream, simplified hip-streams bin file code, add destroyStreams to hip-streams bin file code

* Removed roctx from CMakeLists.txt

* Updated documentation

* Fix documentation

* Removed update_table for HIP compiler table and updated stream.cpp to remove support for HIP compiler table

* Added runtime initialization check for HIP

* Changed tool name, working on fixing memory management

* Added context for counter collection kernel rename combination

* Changed name from map to set and changed description

* Fix documentation description for group-by-queue

* Merged memory copy and kernel operations onto a single track when on the same stream

* Updated perfetto output to remove hardware information from track name to merge all memory copy and kernel operations on the same stream to the same track:

* Most pr comments addressed

* Added filter for counter collection and removed kernel buffer tracing hack

* Added PR comment fixes

---------

Co-authored-by: Madsen, Jonathan <Jonathan.Madsen@amd.com>
Este commit está contenido en:
Trowbridge, Ian
2025-05-01 00:56:15 -05:00
cometido por GitHub
padre 4f03ebc360
commit e626df43eb
Se han modificado 26 ficheros con 465 adiciones y 287 borrados
+4 -2
Ver fichero
@@ -162,8 +162,10 @@ The following table lists the commonly used ``rocprofv3`` command-line options c
| Specifies Perfetto shared memory size hint in KB. Default: 64 KB.
* - Display
- | ``-L`` [BOOL] \| ``--list-avail`` [BOOL] |br|
- | Lists the PC sampling configurations and metrics available in the counter_defs.yaml file for counter collection. In earlier ``rocprof`` versions, this was known as ``--list-basic``, ``--list-derived``, and ``--list-counters``.
- | ``-L`` [BOOL] \| ``--list-avail`` [BOOL] |br| |br|
| ``--group-by-queue`` [BOOL]
- | Lists the PC sampling configurations and metrics available in the counter_defs.yaml file for counter collection. In earlier ``rocprof`` versions, this was known as ``--list-basic``, ``--list-derived``, and ``--list-counters``. |br| |br|
| For displaying the HSA Queues that kernels and memory copy operations are submitted to rather than the default grouping of HIP Streams for perfetto.
* - Other
- | ``--preload`` PRELOAD |br| |br|
+2 -3
Ver fichero
@@ -182,10 +182,9 @@ using rocdecode_buffered_output_t =
using rocjpeg_buffered_output_t =
buffered_output<rocprofiler_buffer_tracing_rocjpeg_api_record_t, domain_type::ROCJPEG>;
using kernel_dispatch_buffered_output_with_stream_t =
buffered_output<tool_buffer_tracing_kernel_dispatch_with_stream_record_t,
domain_type::KERNEL_DISPATCH>;
buffered_output<tool_buffer_tracing_kernel_dispatch_ext_record_t, domain_type::KERNEL_DISPATCH>;
using memory_copy_buffered_output_with_stream_t =
buffered_output<tool_buffer_tracing_memory_copy_with_stream_record_t, domain_type::MEMORY_COPY>;
buffered_output<tool_buffer_tracing_memory_copy_ext_record_t, domain_type::MEMORY_COPY>;
using pc_sampling_stochastic_buffered_output_t =
buffered_output<rocprofiler::tool::rocprofiler_tool_pc_sampling_stochastic_record_t,
domain_type::PC_SAMPLING_STOCHASTIC>;
+8 -8
Ver fichero
@@ -249,10 +249,10 @@ generate_csv(const output_config& cfg,
}
void
generate_csv(const output_config& cfg,
const metadata& tool_metadata,
const generator<tool_buffer_tracing_kernel_dispatch_with_stream_record_t>& data,
const stats_entry_t& stats)
generate_csv(const output_config& cfg,
const metadata& tool_metadata,
const generator<tool_buffer_tracing_kernel_dispatch_ext_record_t>& data,
const stats_entry_t& stats)
{
if(data.empty()) return;
@@ -399,10 +399,10 @@ generate_csv(const output_config& cfg,
}
void
generate_csv(const output_config& cfg,
const metadata& tool_metadata,
const generator<tool_buffer_tracing_memory_copy_with_stream_record_t>& data,
const stats_entry_t& stats)
generate_csv(const output_config& cfg,
const metadata& tool_metadata,
const generator<tool_buffer_tracing_memory_copy_ext_record_t>& data,
const stats_entry_t& stats)
{
if(data.empty()) return;
+8 -8
Ver fichero
@@ -40,10 +40,10 @@ generate_csv(const output_config& cfg,
std::vector<agent_info>& data);
void
generate_csv(const output_config& cfg,
const metadata& tool_metadata,
const generator<tool_buffer_tracing_kernel_dispatch_with_stream_record_t>& data,
const stats_entry_t& stats);
generate_csv(const output_config& cfg,
const metadata& tool_metadata,
const generator<tool_buffer_tracing_kernel_dispatch_ext_record_t>& data,
const stats_entry_t& stats);
void
generate_csv(const output_config& cfg,
@@ -58,10 +58,10 @@ generate_csv(const output_config& cfg,
const stats_entry_t& stats);
void
generate_csv(const output_config& cfg,
const metadata& tool_metadata,
const generator<tool_buffer_tracing_memory_copy_with_stream_record_t>& data,
const stats_entry_t& stats);
generate_csv(const output_config& cfg,
const metadata& tool_metadata,
const generator<tool_buffer_tracing_memory_copy_ext_record_t>& data,
const stats_entry_t& stats);
void
generate_csv(const output_config& cfg,
+5 -5
Ver fichero
@@ -186,11 +186,11 @@ void
write_json(json_output& json_ar,
const output_config& /*cfg*/,
const metadata& /*tool_metadata*/,
const domain_stats_vec_t& domain_stats,
generator<rocprofiler_buffer_tracing_hip_api_ext_record_t>&& hip_api_gen,
generator<rocprofiler_buffer_tracing_hsa_api_record_t> hsa_api_gen,
generator<tool_buffer_tracing_kernel_dispatch_with_stream_record_t> kernel_dispatch_gen,
generator<tool_buffer_tracing_memory_copy_with_stream_record_t> memory_copy_gen,
const domain_stats_vec_t& domain_stats,
generator<rocprofiler_buffer_tracing_hip_api_ext_record_t>&& hip_api_gen,
generator<rocprofiler_buffer_tracing_hsa_api_record_t> hsa_api_gen,
generator<tool_buffer_tracing_kernel_dispatch_ext_record_t> kernel_dispatch_gen,
generator<tool_buffer_tracing_memory_copy_ext_record_t> memory_copy_gen,
generator<tool_counter_record_t> counter_collection_gen,
generator<rocprofiler_buffer_tracing_marker_api_record_t> marker_api_gen,
generator<rocprofiler_buffer_tracing_scratch_memory_record_t> scratch_memory_gen,
+8 -8
Ver fichero
@@ -81,14 +81,14 @@ void
write_json(json_output&, const output_config& cfg, const metadata& tool_metadata, uint64_t pid);
void
write_json(json_output& json_ar,
const output_config& cfg,
const metadata& tool_metadata,
const domain_stats_vec_t& domain_stats,
generator<rocprofiler_buffer_tracing_hip_api_ext_record_t>&& hip_api_gen,
generator<rocprofiler_buffer_tracing_hsa_api_record_t> hsa_api_gen,
generator<tool_buffer_tracing_kernel_dispatch_with_stream_record_t> kernel_dispatch_gen,
generator<tool_buffer_tracing_memory_copy_with_stream_record_t> memory_copy_gen,
write_json(json_output& json_ar,
const output_config& cfg,
const metadata& tool_metadata,
const domain_stats_vec_t& domain_stats,
generator<rocprofiler_buffer_tracing_hip_api_ext_record_t>&& hip_api_gen,
generator<rocprofiler_buffer_tracing_hsa_api_record_t> hsa_api_gen,
generator<tool_buffer_tracing_kernel_dispatch_ext_record_t> kernel_dispatch_gen,
generator<tool_buffer_tracing_memory_copy_ext_record_t> memory_copy_gen,
generator<tool_counter_record_t> counter_collection_gen,
generator<rocprofiler_buffer_tracing_marker_api_record_t> marker_api_gen,
generator<rocprofiler_buffer_tracing_scratch_memory_record_t> scratch_memory_gen,
+9 -9
Ver fichero
@@ -356,15 +356,15 @@ create_attribute_list()
void
write_otf2(
const output_config& cfg,
const metadata& tool_metadata,
uint64_t pid,
const std::vector<agent_info>& agent_data,
std::deque<rocprofiler_buffer_tracing_hip_api_ext_record_t>* hip_api_data,
std::deque<rocprofiler_buffer_tracing_hsa_api_record_t>* hsa_api_data,
std::deque<tool_buffer_tracing_kernel_dispatch_with_stream_record_t>* kernel_dispatch_data,
std::deque<tool_buffer_tracing_memory_copy_with_stream_record_t>* memory_copy_data,
std::deque<rocprofiler_buffer_tracing_marker_api_record_t>* marker_api_data,
const output_config& cfg,
const metadata& tool_metadata,
uint64_t pid,
const std::vector<agent_info>& agent_data,
std::deque<rocprofiler_buffer_tracing_hip_api_ext_record_t>* hip_api_data,
std::deque<rocprofiler_buffer_tracing_hsa_api_record_t>* hsa_api_data,
std::deque<tool_buffer_tracing_kernel_dispatch_ext_record_t>* kernel_dispatch_data,
std::deque<tool_buffer_tracing_memory_copy_ext_record_t>* memory_copy_data,
std::deque<rocprofiler_buffer_tracing_marker_api_record_t>* marker_api_data,
std::deque<rocprofiler_buffer_tracing_scratch_memory_record_t>* /*scratch_memory_data*/,
std::deque<rocprofiler_buffer_tracing_rccl_api_record_t>* rccl_api_data,
std::deque<rocprofiler_buffer_tracing_memory_allocation_record_t>* memory_allocation_data,
+14 -14
Ver fichero
@@ -36,19 +36,19 @@ namespace tool
{
void
write_otf2(
const output_config& cfg,
const metadata& tool_metadata,
uint64_t pid,
const std::vector<agent_info>& agent_data,
std::deque<rocprofiler_buffer_tracing_hip_api_ext_record_t>* hip_api_data,
std::deque<rocprofiler_buffer_tracing_hsa_api_record_t>* hsa_api_data,
std::deque<tool_buffer_tracing_kernel_dispatch_with_stream_record_t>* kernel_dispatch_data,
std::deque<tool_buffer_tracing_memory_copy_with_stream_record_t>* memory_copy_data,
std::deque<rocprofiler_buffer_tracing_marker_api_record_t>* marker_api_data,
std::deque<rocprofiler_buffer_tracing_scratch_memory_record_t>* scratch_memory_data,
std::deque<rocprofiler_buffer_tracing_rccl_api_record_t>* rccl_api_data,
std::deque<rocprofiler_buffer_tracing_memory_allocation_record_t>* memory_allocation_data,
std::deque<rocprofiler_buffer_tracing_rocdecode_api_ext_record_t>* rocdecode_api_data,
std::deque<rocprofiler_buffer_tracing_rocjpeg_api_record_t>* rocjpeg_api_data);
const output_config& cfg,
const metadata& tool_metadata,
uint64_t pid,
const std::vector<agent_info>& agent_data,
std::deque<rocprofiler_buffer_tracing_hip_api_ext_record_t>* hip_api_data,
std::deque<rocprofiler_buffer_tracing_hsa_api_record_t>* hsa_api_data,
std::deque<tool_buffer_tracing_kernel_dispatch_ext_record_t>* kernel_dispatch_data,
std::deque<tool_buffer_tracing_memory_copy_ext_record_t>* memory_copy_data,
std::deque<rocprofiler_buffer_tracing_marker_api_record_t>* marker_api_data,
std::deque<rocprofiler_buffer_tracing_scratch_memory_record_t>* scratch_memory_data,
std::deque<rocprofiler_buffer_tracing_rccl_api_record_t>* rccl_api_data,
std::deque<rocprofiler_buffer_tracing_memory_allocation_record_t>* memory_allocation_data,
std::deque<rocprofiler_buffer_tracing_rocdecode_api_ext_record_t>* rocdecode_api_data,
std::deque<rocprofiler_buffer_tracing_rocjpeg_api_record_t>* rocjpeg_api_data);
} // namespace tool
} // namespace rocprofiler
+31 -60
Ver fichero
@@ -65,15 +65,15 @@ get_hash_id(Tp&& _val)
void
write_perfetto(
const output_config& ocfg,
const metadata& tool_metadata,
std::vector<agent_info> agent_data,
const generator<rocprofiler_buffer_tracing_hip_api_ext_record_t>& hip_api_gen,
const generator<rocprofiler_buffer_tracing_hsa_api_record_t>& hsa_api_gen,
const generator<tool_buffer_tracing_kernel_dispatch_with_stream_record_t>& kernel_dispatch_gen,
const generator<tool_buffer_tracing_memory_copy_with_stream_record_t>& memory_copy_gen,
const generator<tool_counter_record_t>& counter_collection_gen,
const generator<rocprofiler_buffer_tracing_marker_api_record_t>& marker_api_gen,
const output_config& ocfg,
const metadata& tool_metadata,
std::vector<agent_info> agent_data,
const generator<rocprofiler_buffer_tracing_hip_api_ext_record_t>& hip_api_gen,
const generator<rocprofiler_buffer_tracing_hsa_api_record_t>& hsa_api_gen,
const generator<tool_buffer_tracing_kernel_dispatch_ext_record_t>& kernel_dispatch_gen,
const generator<tool_buffer_tracing_memory_copy_ext_record_t>& memory_copy_gen,
const generator<tool_counter_record_t>& counter_collection_gen,
const generator<rocprofiler_buffer_tracing_marker_api_record_t>& marker_api_gen,
const generator<rocprofiler_buffer_tracing_scratch_memory_record_t>& /*scratch_memory_gen*/,
const generator<rocprofiler_buffer_tracing_rccl_api_record_t>& rccl_api_gen,
const generator<rocprofiler_buffer_tracing_memory_allocation_record_t>& memory_allocation_gen,
@@ -133,10 +133,12 @@ write_perfetto(
tracing_session->Setup(cfg);
tracing_session->StartBlocking();
auto tids = std::set<rocprofiler_thread_id_t>{};
auto demangled = std::unordered_map<std::string_view, std::string>{};
auto agent_thread_ids = std::unordered_map<rocprofiler_agent_id_t, std::set<uint64_t>>{};
const auto is_hip_initialized =
tool_metadata.is_runtime_initialized(ROCPROFILER_RUNTIME_INITIALIZATION_HIP);
const auto group_by_queue = ocfg.group_by_queue || !is_hip_initialized;
auto tids = std::set<rocprofiler_thread_id_t>{};
auto demangled = std::unordered_map<std::string_view, std::string>{};
auto agent_thread_ids = std::unordered_map<rocprofiler_agent_id_t, std::set<uint64_t>>{};
auto agent_thread_ids_alloc = std::unordered_map<rocprofiler_agent_id_t, std::set<uint64_t>>{};
auto agent_queue_ids =
std::unordered_map<rocprofiler_agent_id_t, std::unordered_set<rocprofiler_queue_id_t>>{};
@@ -154,12 +156,7 @@ write_perfetto(
auto agent_queue_tracks =
std::unordered_map<rocprofiler_agent_id_t,
std::unordered_map<rocprofiler_queue_id_t, ::perfetto::Track>>{};
auto agent_stream_compute_tracks =
std::unordered_map<rocprofiler_agent_id_t,
std::unordered_map<rocprofiler_stream_id_t, ::perfetto::Track>>{};
auto agent_stream_copy_tracks =
std::unordered_map<rocprofiler_agent_id_t,
std::unordered_map<rocprofiler_stream_id_t, ::perfetto::Track>>{};
auto stream_tracks = std::unordered_map<rocprofiler_stream_id_t, ::perfetto::Track>{};
auto _get_agent = [&agent_data](rocprofiler_agent_id_t _id) -> const rocprofiler_agent_t* {
for(const auto& itr : agent_data)
@@ -194,7 +191,7 @@ write_perfetto(
{
tids.emplace(itr.thread_id);
agent_stream_ids[itr.dst_agent_id].emplace(itr.stream_id);
if(ocfg.group_by_queue)
if(group_by_queue)
{
agent_thread_ids[itr.dst_agent_id].emplace(itr.thread_id);
}
@@ -212,7 +209,7 @@ write_perfetto(
{
tids.emplace(itr.thread_id);
agent_stream_ids[itr.dispatch_info.agent_id].emplace(itr.stream_id);
if(ocfg.group_by_queue)
if(group_by_queue)
{
agent_queue_ids[itr.dispatch_info.agent_id].emplace(itr.dispatch_info.queue_id);
}
@@ -297,20 +294,11 @@ write_perfetto(
{
for(auto sitr : aitr.second)
{
const auto* _agent = _get_agent(aitr.first);
const auto stream_id = sitr.handle;
const auto stream_id = sitr.handle;
{
auto _namess = std::stringstream{};
_namess << "COMPUTE AGENT [" << _agent->logical_node_id << "] STREAM [" << stream_id
<< "] ";
if(_agent->type == ROCPROFILER_AGENT_TYPE_CPU)
_namess << "(CPU)";
else if(_agent->type == ROCPROFILER_AGENT_TYPE_GPU)
_namess << "(GPU)";
else
_namess << "(UNK)";
_namess << fmt::format("STREAM [\" {} \"] ", stream_id);
auto _track = ::perfetto::Track{get_hash_id(_namess.str())};
auto _desc = _track.Serialize();
@@ -318,28 +306,7 @@ write_perfetto(
perfetto::TrackEvent::SetTrackDescriptor(_track, _desc);
agent_stream_compute_tracks[aitr.first].emplace(sitr, _track);
}
{
auto _namess = std::stringstream{};
_namess << "COPY to AGENT [" << _agent->logical_node_id << "] STREAM [" << stream_id
<< "] ";
if(_agent->type == ROCPROFILER_AGENT_TYPE_CPU)
_namess << "(CPU)";
else if(_agent->type == ROCPROFILER_AGENT_TYPE_GPU)
_namess << "(GPU)";
else
_namess << "(UNK)";
auto _track = ::perfetto::Track{get_hash_id(_namess.str())};
auto _desc = _track.Serialize();
_desc.set_name(_namess.str());
perfetto::TrackEvent::SetTrackDescriptor(_track, _desc);
agent_stream_copy_tracks[aitr.first].emplace(sitr, _track);
stream_tracks.emplace(sitr, _track);
}
}
}
@@ -570,13 +537,13 @@ write_perfetto(
auto name = buffer_names.at(itr.kind, itr.operation);
::perfetto::Track* _track = nullptr;
if(ocfg.group_by_queue)
if(group_by_queue)
{
_track = &agent_thread_tracks.at(itr.dst_agent_id).at(itr.thread_id);
}
else
{
_track = &agent_stream_copy_tracks.at(itr.dst_agent_id).at(itr.stream_id);
_track = &stream_tracks.at(itr.stream_id);
}
TRACE_EVENT_BEGIN(
@@ -640,7 +607,7 @@ write_perfetto(
rocprofiler_agent_id_t,
std::unordered_map<
rocprofiler_queue_id_t,
std::vector<tool_buffer_tracing_kernel_dispatch_with_stream_record_t*>>>{};
std::vector<tool_buffer_tracing_kernel_dispatch_ext_record_t*>>>{};
for(auto& itr : generator)
{
const auto& info = itr.dispatch_info;
@@ -671,14 +638,13 @@ write_perfetto(
auto name = std::string_view{sym->kernel_name};
::perfetto::Track* _track = nullptr;
if(ocfg.group_by_queue)
if(group_by_queue)
{
_track = &agent_queue_tracks.at(info.agent_id).at(info.queue_id);
}
else
{
_track =
&agent_stream_compute_tracks.at(info.agent_id).at((*it)->stream_id);
_track = &stream_tracks.at((*it)->stream_id);
}
// Temporary fix until timestamp issues are resolved: Set timestamps to be
@@ -729,6 +695,11 @@ write_perfetto(
.get_agent_index(agents_map.at(info.agent_id).id,
ocfg.agent_index_value)
.as_string("-"),
"agent_type",
tool_metadata
.get_agent_index(agents_map.at(info.agent_id).id,
ocfg.agent_index_value)
.type,
"corr_id",
current.correlation_id.internal,
"queue",
+7 -7
Ver fichero
@@ -37,13 +37,13 @@ namespace tool
{
void
write_perfetto(
const output_config& cfg,
const metadata& tool_metadata,
std::vector<agent_info> agent_data,
const generator<rocprofiler_buffer_tracing_hip_api_ext_record_t>& hip_api_gen,
const generator<rocprofiler_buffer_tracing_hsa_api_record_t>& hsa_api_gen,
const generator<tool_buffer_tracing_kernel_dispatch_with_stream_record_t>& kernel_dispatch_gen,
const generator<tool_buffer_tracing_memory_copy_with_stream_record_t>& memory_copy_gen,
const output_config& cfg,
const metadata& tool_metadata,
std::vector<agent_info> agent_data,
const generator<rocprofiler_buffer_tracing_hip_api_ext_record_t>& hip_api_gen,
const generator<rocprofiler_buffer_tracing_hsa_api_record_t>& hsa_api_gen,
const generator<tool_buffer_tracing_kernel_dispatch_ext_record_t>& kernel_dispatch_gen,
const generator<tool_buffer_tracing_memory_copy_ext_record_t>& memory_copy_gen,
const generator<tool_counter_record_t>& counter_collection_gen,
const generator<rocprofiler_buffer_tracing_marker_api_record_t>& marker_api_gen,
const generator<rocprofiler_buffer_tracing_scratch_memory_record_t>& scratch_memory_gen,
+4 -4
Ver fichero
@@ -63,8 +63,8 @@ get_stats(const stats_map_t& data_v)
stats_entry_t
generate_stats(const output_config& /*cfg*/,
const metadata& tool_metadata,
const generator<tool_buffer_tracing_kernel_dispatch_with_stream_record_t>& data)
const metadata& tool_metadata,
const generator<tool_buffer_tracing_kernel_dispatch_ext_record_t>& data)
{
auto kernel_stats = stats_map_t{};
for(auto ditr : data)
@@ -119,8 +119,8 @@ generate_stats(const output_config& /*cfg*/,
stats_entry_t
generate_stats(const output_config& /*cfg*/,
const metadata& tool_metadata,
const generator<tool_buffer_tracing_memory_copy_with_stream_record_t>& data)
const metadata& tool_metadata,
const generator<tool_buffer_tracing_memory_copy_ext_record_t>& data)
{
auto memory_copy_stats = stats_map_t{};
for(auto ditr : data)
+6 -6
Ver fichero
@@ -32,9 +32,9 @@ namespace rocprofiler
namespace tool
{
stats_entry_t
generate_stats(const output_config& cfg,
const metadata& tool_metadata,
const generator<tool_buffer_tracing_kernel_dispatch_with_stream_record_t>& data);
generate_stats(const output_config& cfg,
const metadata& tool_metadata,
const generator<tool_buffer_tracing_kernel_dispatch_ext_record_t>& data);
stats_entry_t
generate_stats(const output_config& cfg,
@@ -47,9 +47,9 @@ generate_stats(const output_config& cfg
const generator<rocprofiler_buffer_tracing_hsa_api_record_t>& data);
stats_entry_t
generate_stats(const output_config& cfg,
const metadata& tool_metadata,
const generator<tool_buffer_tracing_memory_copy_with_stream_record_t>& data);
generate_stats(const output_config& cfg,
const metadata& tool_metadata,
const generator<tool_buffer_tracing_memory_copy_ext_record_t>& data);
stats_entry_t
generate_stats(const output_config& cfg,
+20
Ver fichero
@@ -475,6 +475,26 @@ metadata::add_external_correlation_id(uint64_t val)
[](auto& _data, uint64_t _val) { return _data.emplace(_val).second; }, val);
}
bool
metadata::add_runtime_initialization(rocprofiler_runtime_initialization_operation_t runtime_op)
{
return runtime_initialization_set.wlock(
[](auto& _data, rocprofiler_runtime_initialization_operation_t _runtime_op) {
return _data.emplace(_runtime_op).second;
},
runtime_op);
}
bool
metadata::is_runtime_initialized(rocprofiler_runtime_initialization_operation_t runtime_op) const
{
return runtime_initialization_set.rlock(
[](const auto& _data, rocprofiler_runtime_initialization_operation_t _runtime_op) {
return _data.count(_runtime_op) > 0;
},
runtime_op);
}
std::string_view
metadata::get_marker_message(uint64_t corr_id) const
{
+16 -11
Ver fichero
@@ -90,6 +90,8 @@ using synced_map = common::Synchronized<Tp, true>;
template <typename Tp>
using synced_obj = common::Synchronized<Tp, true>;
using pc_sampling_stats_t = rocprofiler_tool_pc_sampling_stats;
using runtime_initialization_set_t =
std::unordered_set<rocprofiler_runtime_initialization_operation_t>;
enum class agent_indexing
{
@@ -126,17 +128,18 @@ struct metadata
agent_counter_info_map_t agent_counter_info = {};
agent_pc_sample_config_info_map_t agent_pc_sample_config_info = {};
sdk::buffer_name_info buffer_names = {};
sdk::callback_name_info callback_names = {};
synced_map<code_object_data_map_t> code_objects = {};
synced_map<kernel_symbol_data_map_t> kernel_symbols = {};
synced_map<marker_message_map_t> marker_messages = {};
synced_map<string_entry_map_t> string_entries = {};
synced_map<external_corr_id_set_t> external_corr_ids = {};
synced_map<host_function_info_map_t> host_functions = {};
synced_map<code_object_load_info_vec_t> code_object_load = {};
att_filenames_map_t att_filenames = {};
synced_obj<pc_sampling_stats_t> pc_sampling_stats = {};
sdk::buffer_name_info buffer_names = {};
sdk::callback_name_info callback_names = {};
synced_map<code_object_data_map_t> code_objects = {};
synced_map<kernel_symbol_data_map_t> kernel_symbols = {};
synced_map<marker_message_map_t> marker_messages = {};
synced_map<string_entry_map_t> string_entries = {};
synced_map<external_corr_id_set_t> external_corr_ids = {};
synced_map<host_function_info_map_t> host_functions = {};
synced_map<code_object_load_info_vec_t> code_object_load = {};
att_filenames_map_t att_filenames = {};
synced_obj<pc_sampling_stats_t> pc_sampling_stats = {};
synced_obj<runtime_initialization_set_t> runtime_initialization_set = {};
metadata() = default;
metadata(inprocess);
@@ -182,6 +185,7 @@ struct metadata
bool add_host_function(host_function_info&& func);
bool add_string_entry(size_t key, std::string_view str);
bool add_external_correlation_id(uint64_t);
bool add_runtime_initialization(rocprofiler_runtime_initialization_operation_t);
std::string_view get_marker_message(uint64_t corr_id) const;
std::string_view get_kernel_name(uint64_t kernel_id, uint64_t rename_id) const;
@@ -193,6 +197,7 @@ struct metadata
rocprofiler_tracing_operation_t op) const;
agent_index get_agent_index(rocprofiler_agent_id_t agent, agent_indexing index) const;
const std::string* get_string_entry(size_t key) const;
bool is_runtime_initialized(rocprofiler_runtime_initialization_operation_t) const;
private:
bool inprocess_init = false;
+30 -32
Ver fichero
@@ -35,56 +35,55 @@ namespace rocprofiler
{
namespace tool
{
struct tool_buffer_tracing_kernel_dispatch_with_stream_record_t
struct tool_buffer_tracing_kernel_dispatch_ext_record_t
: rocprofiler_buffer_tracing_kernel_dispatch_record_t
{
using base_type = rocprofiler_buffer_tracing_kernel_dispatch_record_t;
tool_buffer_tracing_kernel_dispatch_with_stream_record_t(
const base_type& _base,
const rocprofiler_stream_id_t& _stream_id,
const uint64_t& _kernel_rename_val)
tool_buffer_tracing_kernel_dispatch_ext_record_t(const base_type& _base,
const rocprofiler_stream_id_t& _stream_id,
const uint64_t& _kernel_rename_val)
: base_type{_base}
, stream_id{_stream_id}
, kernel_rename_val{_kernel_rename_val}
{}
tool_buffer_tracing_kernel_dispatch_with_stream_record_t();
~tool_buffer_tracing_kernel_dispatch_with_stream_record_t() = default;
tool_buffer_tracing_kernel_dispatch_with_stream_record_t(
const tool_buffer_tracing_kernel_dispatch_with_stream_record_t&) = default;
tool_buffer_tracing_kernel_dispatch_with_stream_record_t(
tool_buffer_tracing_kernel_dispatch_with_stream_record_t&&) noexcept = default;
tool_buffer_tracing_kernel_dispatch_with_stream_record_t& operator =(
const tool_buffer_tracing_kernel_dispatch_with_stream_record_t&) = default;
tool_buffer_tracing_kernel_dispatch_with_stream_record_t& operator =(
tool_buffer_tracing_kernel_dispatch_with_stream_record_t&&) noexcept = default;
tool_buffer_tracing_kernel_dispatch_ext_record_t();
~tool_buffer_tracing_kernel_dispatch_ext_record_t() = default;
tool_buffer_tracing_kernel_dispatch_ext_record_t(
const tool_buffer_tracing_kernel_dispatch_ext_record_t&) = default;
tool_buffer_tracing_kernel_dispatch_ext_record_t(
tool_buffer_tracing_kernel_dispatch_ext_record_t&&) noexcept = default;
tool_buffer_tracing_kernel_dispatch_ext_record_t& operator =(
const tool_buffer_tracing_kernel_dispatch_ext_record_t&) = default;
tool_buffer_tracing_kernel_dispatch_ext_record_t& operator =(
tool_buffer_tracing_kernel_dispatch_ext_record_t&&) noexcept = default;
rocprofiler_stream_id_t stream_id = {};
uint64_t kernel_rename_val = {};
};
struct tool_buffer_tracing_memory_copy_with_stream_record_t
struct tool_buffer_tracing_memory_copy_ext_record_t
: rocprofiler_buffer_tracing_memory_copy_record_t
{
using base_type = rocprofiler_buffer_tracing_memory_copy_record_t;
tool_buffer_tracing_memory_copy_with_stream_record_t(const base_type& _base,
const rocprofiler_stream_id_t& _stream_id)
tool_buffer_tracing_memory_copy_ext_record_t(const base_type& _base,
const rocprofiler_stream_id_t& _stream_id)
: base_type{_base}
, stream_id{_stream_id}
{}
tool_buffer_tracing_memory_copy_with_stream_record_t();
~tool_buffer_tracing_memory_copy_with_stream_record_t() = default;
tool_buffer_tracing_memory_copy_with_stream_record_t(
const tool_buffer_tracing_memory_copy_with_stream_record_t&) = default;
tool_buffer_tracing_memory_copy_with_stream_record_t(
tool_buffer_tracing_memory_copy_with_stream_record_t&&) noexcept = default;
tool_buffer_tracing_memory_copy_with_stream_record_t& operator =(
const tool_buffer_tracing_memory_copy_with_stream_record_t&) = default;
tool_buffer_tracing_memory_copy_with_stream_record_t& operator =(
tool_buffer_tracing_memory_copy_with_stream_record_t&&) noexcept = default;
tool_buffer_tracing_memory_copy_ext_record_t();
~tool_buffer_tracing_memory_copy_ext_record_t() = default;
tool_buffer_tracing_memory_copy_ext_record_t(
const tool_buffer_tracing_memory_copy_ext_record_t&) = default;
tool_buffer_tracing_memory_copy_ext_record_t(
tool_buffer_tracing_memory_copy_ext_record_t&&) noexcept = default;
tool_buffer_tracing_memory_copy_ext_record_t& operator =(
const tool_buffer_tracing_memory_copy_ext_record_t&) = default;
tool_buffer_tracing_memory_copy_ext_record_t& operator =(
tool_buffer_tracing_memory_copy_ext_record_t&&) noexcept = default;
rocprofiler_stream_id_t stream_id = {};
};
@@ -97,8 +96,8 @@ namespace cereal
template <typename ArchiveT>
void
save(ArchiveT& ar,
const ::rocprofiler::tool::tool_buffer_tracing_kernel_dispatch_with_stream_record_t& data)
save(ArchiveT& ar,
const ::rocprofiler::tool::tool_buffer_tracing_kernel_dispatch_ext_record_t& data)
{
cereal::save(ar, static_cast<const rocprofiler_buffer_tracing_kernel_dispatch_record_t&>(data));
SAVE_DATA_FIELD(stream_id);
@@ -107,8 +106,7 @@ save(ArchiveT&
template <typename ArchiveT>
void
save(ArchiveT& ar,
const ::rocprofiler::tool::tool_buffer_tracing_memory_copy_with_stream_record_t& data)
save(ArchiveT& ar, const ::rocprofiler::tool::tool_buffer_tracing_memory_copy_ext_record_t& data)
{
cereal::save(ar, static_cast<const rocprofiler_buffer_tracing_memory_copy_record_t&>(data));
SAVE_DATA_FIELD(stream_id);
@@ -60,7 +60,8 @@ pop_stream_id()
rocprofiler_stream_id_t
get_stream_id()
{
return CHECK_NOTNULL(get_stream_stack())->back();
return (CHECK_NOTNULL(get_stream_stack())->empty()) ? rocprofiler_stream_id_t{.handle = 0}
: get_stream_stack()->back();
}
bool
+83 -51
Ver fichero
@@ -249,6 +249,7 @@ using kernel_rename_stack_t = std::stack<uint64_t>;
auto* tool_metadata = as_pointer<tool::metadata>(tool::metadata::inprocess{});
auto target_kernels = common::Synchronized<targeted_kernels_map_t>{};
std::mutex att_shader_data;
auto counter_collection_ctx = rocprofiler_context_id_t{0};
thread_local auto thread_dispatch_rename = as_pointer<kernel_rename_stack_t>();
thread_local auto thread_dispatch_rename_dtor = common::scope_destructor{[]() {
@@ -262,25 +263,6 @@ struct kernel_rename_and_stream_display_pair
uint64_t kernel_rename_val{0};
rocprofiler_stream_id_t stream_id{.handle = 0};
};
auto kernel_rename_and_stream_display_pair_dtors =
new std::vector<kernel_rename_and_stream_display_pair*>{};
auto
get_kernel_rename_and_stream_display_pair_lock()
{
static auto _mutex = std::mutex{};
return std::unique_lock<std::mutex>{_mutex};
}
void
add_kernel_rename_and_stream_display_pairs(kernel_rename_and_stream_display_pair* ptr)
{
auto lock = get_kernel_rename_and_stream_display_pair_lock();
if(ptr != nullptr && kernel_rename_and_stream_display_pair_dtors != nullptr)
{
kernel_rename_and_stream_display_pair_dtors->emplace_back(ptr);
}
}
bool
add_kernel_target(uint64_t _kern_id, const std::unordered_set<size_t>& range)
@@ -455,10 +437,19 @@ set_kernel_rename_and_stream_display_correlation_id(
tool::get_config().kernel_rename && thread_dispatch_rename != nullptr &&
!thread_dispatch_rename->empty() &&
kind == ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KERNEL_DISPATCH;
const bool hip_stream_display_enabled =
!tool::get_config().group_by_queue &&
kernel_rename_and_stream_display_pair_dtors != nullptr &&
!rocprofiler::tool::stream::stream_stack_empty();
const bool hip_stream_display_enabled = !tool::get_config().group_by_queue;
if(kernel_rename_service_enabled && ctx_id == counter_collection_ctx)
{
auto val = thread_dispatch_rename->top();
if(tool_metadata) tool_metadata->add_external_correlation_id(val);
external_corr_id->value = val;
return 0;
}
// End early if counter collection context and no kernel rename service is enabled
if(ctx_id == counter_collection_ctx)
{
return 0;
}
kernel_rename_and_stream_display_pair* kernel_rename_and_stream_display_vals = nullptr;
if(kernel_rename_service_enabled || hip_stream_display_enabled)
@@ -475,12 +466,11 @@ set_kernel_rename_and_stream_display_correlation_id(
// Get stream ID from stream HIP display service
if(hip_stream_display_enabled && kernel_rename_and_stream_display_vals != nullptr)
{
auto stream_id = rocprofiler::tool::stream::get_stream_id();
auto stream_id = tool::stream::get_stream_id();
kernel_rename_and_stream_display_vals->stream_id = stream_id;
}
// Set the external correlation id service to point to struct
external_corr_id->ptr = kernel_rename_and_stream_display_vals;
add_kernel_rename_and_stream_display_pairs(kernel_rename_and_stream_display_vals);
common::consume_args(thr_id, ctx_id, kind, op, internal_corr_id, user_data);
@@ -615,6 +605,24 @@ hip_stream_display_callback(rocprofiler_callback_tracing_record_t record,
common::consume_args(user_data, data);
}
// Stores which runtimes have been initialized in metadata
void
runtime_initialization_callback(rocprofiler_callback_tracing_record_t record,
rocprofiler_user_data_t* user_data,
void* data)
{
if(record.kind != ROCPROFILER_CALLBACK_TRACING_RUNTIME_INITIALIZATION) return;
ROCP_CI_LOG_IF(WARNING, tool_metadata == nullptr)
<< fmt::format("tool cannot record runtime initialization for {}",
tool_metadata->get_operation_name(record.kind, record.operation));
if(tool_metadata)
{
tool_metadata->add_runtime_initialization(
static_cast<rocprofiler_runtime_initialization_operation_t>(record.operation));
}
common::consume_args(user_data, data);
}
void
callback_tracing_callback(rocprofiler_callback_tracing_record_t record,
rocprofiler_user_data_t* user_data,
@@ -942,8 +950,10 @@ buffered_tracing_callback(rocprofiler_context_id_t /*context*/,
record->correlation_id.external.ptr);
stream_id = kernel_stream_pair_ptr->stream_id;
kernel_rename_val = kernel_stream_pair_ptr->kernel_rename_val;
delete kernel_stream_pair_ptr;
record->correlation_id.external.value = kernel_rename_val;
}
rocprofiler::tool::tool_buffer_tracing_kernel_dispatch_with_stream_record_t
rocprofiler::tool::tool_buffer_tracing_kernel_dispatch_ext_record_t
record_with_stream{*record, stream_id, kernel_rename_val};
tool::write_ring_buffer(record_with_stream, domain_type::KERNEL_DISPATCH);
}
@@ -971,9 +981,11 @@ buffered_tracing_callback(rocprofiler_context_id_t /*context*/,
static_cast<kernel_rename_and_stream_display_pair*>(
record->correlation_id.external.ptr);
stream_id = kernel_stream_pair_ptr->stream_id;
delete kernel_stream_pair_ptr;
record->correlation_id.external.ptr = nullptr;
}
rocprofiler::tool::tool_buffer_tracing_memory_copy_with_stream_record_t
record_with_stream{*record, stream_id};
rocprofiler::tool::tool_buffer_tracing_memory_copy_ext_record_t record_with_stream{
*record, stream_id};
tool::write_ring_buffer(record_with_stream, domain_type::MEMORY_COPY);
}
else if(header->kind == ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION)
@@ -1366,16 +1378,10 @@ counter_record_callback(rocprofiler_dispatch_counting_service_data_t dispatch_da
auto counter_record = tool::tool_counter_record_t{};
counter_record.dispatch_data = dispatch_data;
counter_record.thread_id = user_data.value;
if(dispatch_data.correlation_id.external.ptr != nullptr)
{
// Extract the kernel id
auto* kernel_stream_pair_ptr = static_cast<kernel_rename_and_stream_display_pair*>(
dispatch_data.correlation_id.external.ptr);
counter_record.kernel_rename_val = kernel_stream_pair_ptr->kernel_rename_val;
}
counter_record.dispatch_data = dispatch_data;
counter_record.thread_id = user_data.value;
counter_record.kernel_rename_val = dispatch_data.correlation_id.external.value;
counter_record.dispatch_data.correlation_id.external.value = counter_record.kernel_rename_val;
auto serialized_records = std::vector<tool::tool_counter_value_t>{};
serialized_records.reserve(record_count);
@@ -1805,10 +1811,16 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
if(tool::get_config().counter_collection)
{
ROCPROFILER_CALL(rocprofiler_create_context(&counter_collection_ctx),
"failed to create context");
ROCPROFILER_CALL(
rocprofiler_configure_callback_dispatch_counting_service(
get_client_ctx(), dispatch_callback, nullptr, counter_record_callback, nullptr),
rocprofiler_configure_callback_dispatch_counting_service(counter_collection_ctx,
dispatch_callback,
nullptr,
counter_record_callback,
nullptr),
"Could not setup counting service");
ROCPROFILER_CALL(rocprofiler_start_context(counter_collection_ctx), "start context failed");
}
if(tool::get_config().rocdecode_api_trace)
@@ -1873,6 +1885,7 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
}
if(!tool::get_config().group_by_queue)
{
// Track stream ID information via callback service
auto hip_stream_display_ctx = rocprofiler_context_id_t{0};
ROCPROFILER_CALL(rocprofiler_create_context(&hip_stream_display_ctx),
@@ -1887,6 +1900,21 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
nullptr),
"stream tracing configure failed");
ROCPROFILER_CALL(rocprofiler_start_context(hip_stream_display_ctx), "start context failed");
// Track if HIP runtime has been initialized via runtime_intialization service
auto runtime_initialization_ctx = rocprofiler_context_id_t{0};
ROCPROFILER_CALL(rocprofiler_create_context(&runtime_initialization_ctx),
"failed to create context");
ROCPROFILER_CALL(rocprofiler_configure_callback_tracing_service(
runtime_initialization_ctx,
ROCPROFILER_CALLBACK_TRACING_RUNTIME_INITIALIZATION,
nullptr,
0,
runtime_initialization_callback,
nullptr),
"stream tracing configure failed");
ROCPROFILER_CALL(rocprofiler_start_context(runtime_initialization_ctx),
"start context failed");
}
if(tool::get_config().kernel_rename || !tool::get_config().group_by_queue)
{
@@ -1902,6 +1930,21 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
set_kernel_rename_and_stream_display_correlation_id,
nullptr),
"Could not configure external correlation id request service");
if(tool::get_config().counter_collection)
{
auto counter_external_corr_id_request_kinds =
std::array<rocprofiler_external_correlation_id_request_kind_t, 1>{
ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KERNEL_DISPATCH};
ROCPROFILER_CALL(rocprofiler_configure_external_correlation_id_request_service(
counter_collection_ctx,
counter_external_corr_id_request_kinds.data(),
counter_external_corr_id_request_kinds.size(),
set_kernel_rename_and_stream_display_correlation_id,
nullptr),
"Could not configure external correlation id request service");
}
}
if(tool::get_config().pc_sampling_host_trap)
@@ -2229,17 +2272,6 @@ tool_fini(void* /*tool_data*/)
run_cleanup();
if(kernel_rename_and_stream_display_pair_dtors != nullptr)
{
for(auto& itr : *kernel_rename_and_stream_display_pair_dtors)
{
delete itr;
itr = nullptr;
}
delete kernel_rename_and_stream_display_pair_dtors;
kernel_rename_and_stream_display_pair_dtors = nullptr;
}
if(destructors)
{
for(const auto& itr : *destructors)
+2 -39
Ver fichero
@@ -34,7 +34,6 @@
#include "lib/rocprofiler-sdk/hsa/queue_controller.hpp"
#include "lib/rocprofiler-sdk/registration.hpp"
#include "lib/rocprofiler-sdk/tracing/tracing.hpp"
#include "rocprofiler-sdk/hip/compiler_api_id.h"
#include <rocprofiler-sdk/buffer.h>
#include <rocprofiler-sdk/callback_tracing.h>
@@ -367,28 +366,11 @@ enable_stream_stack()
{
if(itr->is_tracing_one_of(ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY,
ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API,
ROCPROFILER_CALLBACK_TRACING_HIP_COMPILER_API,
ROCPROFILER_CALLBACK_TRACING_HIP_STREAM,
ROCPROFILER_BUFFER_TRACING_MEMORY_COPY,
ROCPROFILER_BUFFER_TRACING_HIP_RUNTIME_API,
ROCPROFILER_BUFFER_TRACING_HIP_COMPILER_API,
ROCPROFILER_BUFFER_TRACING_HIP_STREAM,
ROCPROFILER_BUFFER_TRACING_HIP_RUNTIME_API_EXT,
ROCPROFILER_BUFFER_TRACING_HIP_COMPILER_API_EXT))
return true;
}
return false;
}
bool
enable_compiler_stream_stack()
{
for(const auto& itr : context::get_registered_contexts())
{
if(itr->is_tracing_one_of(ROCPROFILER_CALLBACK_TRACING_HIP_COMPILER_API,
ROCPROFILER_BUFFER_TRACING_HIP_COMPILER_API,
ROCPROFILER_BUFFER_TRACING_HIP_COMPILER_API_EXT))
ROCPROFILER_BUFFER_TRACING_HIP_RUNTIME_API_EXT))
return true;
}
@@ -415,11 +397,6 @@ update_table(Tp* _orig, std::integral_constant<size_t, OpIdx>)
ROCP_TRACE << "updating table entry for " << _info.name;
constexpr auto num_args = function_args_type::size();
constexpr auto is_hip_pop_call_config_func =
std::is_same<decltype(info_type::operation_idx),
rocprofiler_hip_compiler_api_id_t>::value &&
(static_cast<rocprofiler_hip_compiler_api_id_t>(info_type::operation_idx) ==
ROCPROFILER_HIP_COMPILER_API_ID___hipPopCallConfiguration);
if constexpr(common::mpl::is_one_of<hipStream_t, function_args_type>::value)
{
@@ -433,12 +410,6 @@ update_table(Tp* _orig, std::integral_constant<size_t, OpIdx>)
static_assert(stream_idx == (num_args - rstream_idx - 1),
"function has more than one stream argument");
// don't wrap the compiler API functions unless HIP compiler API tracing is enabled
if constexpr(TableIdx == ROCPROFILER_HIP_TABLE_ID_Compiler)
{
if(!enable_compiler_stream_stack()) return;
}
// 1. get the sub-table containing the function pointer in original table
// 2. get reference to function pointer in sub-table in original table
// 3. update function pointer with wrapper
@@ -460,8 +431,7 @@ update_table(Tp* _orig, std::integral_constant<size_t, OpIdx>)
_func = create_read_functor<TableIdx, OpIdx>(_func);
}
}
else if constexpr(common::mpl::is_one_of<hipStream_t*, function_args_type>::value &&
!is_hip_pop_call_config_func)
else if constexpr(common::mpl::is_one_of<hipStream_t*, function_args_type>::value)
{
constexpr auto stream_idx =
common::mpl::index_of<hipStream_t*, function_args_type>::value;
@@ -474,12 +444,6 @@ update_table(Tp* _orig, std::integral_constant<size_t, OpIdx>)
static_assert(stream_idx == (num_args - rstream_idx - 1),
"function has more than one stream argument");
// don't wrap the compiler API functions unless HIP compiler API tracing is enabled
if constexpr(TableIdx == ROCPROFILER_HIP_TABLE_ID_Compiler)
{
if(!enable_compiler_stream_stack()) return;
}
ROCP_INFO << _info.name << " has been designated as a stream create function";
// 1. get the sub-table containing the function pointer in original table
@@ -541,7 +505,6 @@ using hip_op_args_cb_t = rocprofiler_callback_tracing_operation_args_cb_t;
template void update_table<TABLE_TYPE>(TABLE_TYPE * _tbl);
INSTANTIATE_HIP_TABLE_FUNC(hip_runtime_api_table_t, ROCPROFILER_HIP_TABLE_ID_Runtime)
INSTANTIATE_HIP_TABLE_FUNC(hip_compiler_api_table_t, ROCPROFILER_HIP_TABLE_ID_Compiler)
} // namespace stream
} // namespace hip
} // namespace rocprofiler
+1 -2
Ver fichero
@@ -36,8 +36,7 @@ namespace hip
{
namespace stream
{
using hip_compiler_api_table_t = HipCompilerDispatchTable;
using hip_runtime_api_table_t = HipDispatchTable;
using hip_runtime_api_table_t = HipDispatchTable;
rocprofiler_stream_id_t
get_stream_id();
+8
Ver fichero
@@ -253,6 +253,14 @@ WriteInterceptor(const void* packets,
tracing::populate_contexts(ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH,
ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH,
tracing_data_v);
// these are for the services (dispatch counter collection, pc sampling, ATT) which use
// the queue/queue_controller callback mechanism
const auto queue_callback_context_filter = [](const context::context* ctx) {
return (ctx->counter_collection || ctx->pc_sampler || ctx->dispatch_thread_trace);
};
for(const auto* itr : context::get_active_contexts(queue_callback_context_filter))
tracing_data_v.external_correlation_ids.emplace(itr, tracing::empty_user_data);
const auto* packets_arr = static_cast<const rocprofiler_packet*>(packets);
auto transformed_packets = std::vector<rocprofiler_packet>{};
@@ -858,9 +858,6 @@ rocprofiler_set_api_table(const char* name,
// install rocprofiler API wrappers
rocprofiler::hip::update_table(hip_compiler_api_table);
// install HIP stream deduction wrappers
rocprofiler::hip::stream::update_table(hip_compiler_api_table);
// allow tools to install API wrappers
rocprofiler::intercept_table::notify_intercept_table_registration(
ROCPROFILER_HIP_COMPILER_TABLE,
+1
Ver fichero
@@ -38,3 +38,4 @@ if(rocJPEG_FOUND AND rocJPEG_VERSION VERSION_GREATER 0.6.0)
add_subdirectory(rocjpeg)
endif()
add_subdirectory(hsa-code-object)
add_subdirectory(hip-streams)
+41
Ver fichero
@@ -0,0 +1,41 @@
#
#
#
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
if(NOT CMAKE_HIP_COMPILER)
find_program(
amdclangpp_EXECUTABLE
NAMES amdclang++
HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
PATH_SUFFIXES bin llvm/bin NO_CACHE)
mark_as_advanced(amdclangpp_EXECUTABLE)
if(amdclangpp_EXECUTABLE)
set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}")
endif()
endif()
project(rocprofiler-sdk-tests-bin-hip-streams LANGUAGES CXX HIP)
foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO)
if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "")
set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}")
endif()
endforeach()
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_HIP_STANDARD 17)
set(CMAKE_HIP_EXTENSIONS OFF)
set(CMAKE_HIP_STANDARD_REQUIRED ON)
set_source_files_properties(compute_comm_overlap.cpp PROPERTIES LANGUAGE HIP)
add_executable(hip-streams)
target_sources(hip-streams PRIVATE compute_comm_overlap.cpp)
target_link_libraries(hip-streams PRIVATE rocprofiler-sdk::tests-build-flags)
find_package(Threads REQUIRED)
target_link_libraries(hip-streams PRIVATE Threads::Threads)
@@ -0,0 +1,138 @@
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "hip/hip_runtime.h"
#define BLOCKDIM 64
/* Macro for checking GPU API return values */
#define HIP_ASSERT(call) \
do \
{ \
hipError_t gpuErr = call; \
if(hipSuccess != gpuErr) \
{ \
printf( \
"GPU API Error - %s:%d: '%s'\n", __FILE__, __LINE__, hipGetErrorString(gpuErr)); \
exit(1); \
} \
} while(0)
// HIP kernel. Each thread takes care of one element of input
__global__ void
cube(double* input, double* output, size_t offset, size_t elements_per_stream)
{
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
size_t gstride = blockDim.x * gridDim.x;
// Span all elements assigned to this stream
for(size_t id = tid + offset; id < offset + elements_per_stream; id += gstride)
for(size_t i = 0; i < 1000; ++i)
output[id] = input[id] * input[id] * input[id];
}
int
main()
{
// number of streams
const int num_streams = 8;
// Number of threads in each thread block
const int blockSize = 512;
// Size of vectors
int n = 100000000;
int elements_per_stream = n / num_streams;
int bytes_per_stream = elements_per_stream * sizeof(double);
// Host input vectors
double* h_input1{nullptr};
// Host output vector
double* h_output1{nullptr};
// Device input vectors
double* d_input1{nullptr};
// Device output vector
double* d_output1{nullptr};
// Creating events for timers
hipEvent_t start{}, stop{};
HIP_ASSERT(hipEventCreate(&start));
HIP_ASSERT(hipEventCreate(&stop));
// Creating streams
hipStream_t streams[num_streams];
for(int i = 0; i < num_streams; ++i)
{
HIP_ASSERT(hipStreamCreate(&streams[i]));
}
// Size, in bytes, of each vector
size_t bytes = n * sizeof(double);
// Allocate page locked memory for these vectors on host
HIP_ASSERT(hipHostMalloc(&h_input1, bytes));
HIP_ASSERT(hipHostMalloc(&h_output1, bytes));
// Allocate memory for each vector on GPU
HIP_ASSERT(hipMalloc(&d_input1, bytes));
HIP_ASSERT(hipMalloc(&d_output1, bytes));
// Initialize vectors on host
for(int i = 0; i < n; i++)
{
h_input1[i] = sin(i);
}
// Number of thread blocks in grid
const int gridSizePerStream = 104; //(int)ceil((float)elements_per_stream/blockSize);
HIP_ASSERT(hipEventRecord(start));
// split H2D copies and kernel calls into separate loops
for(int i = 0; i < num_streams; i++)
{
int offset = i * elements_per_stream;
HIP_ASSERT(hipMemcpyAsync(&d_input1[offset],
&h_input1[offset],
bytes_per_stream,
hipMemcpyHostToDevice,
streams[i]));
}
for(int i = 0; i < num_streams; i++)
{
int offset = i * elements_per_stream;
cube<<<gridSizePerStream, blockSize, 0, streams[i]>>>(
d_input1, d_output1, offset, elements_per_stream);
}
for(int i = 0; i < num_streams; i++)
{
int offset = i * elements_per_stream;
HIP_ASSERT(hipMemcpyAsync(&h_output1[offset],
&d_output1[offset],
bytes_per_stream,
hipMemcpyDeviceToHost,
streams[i]));
}
HIP_ASSERT(hipEventRecord(stop));
HIP_ASSERT(hipEventSynchronize(stop));
float milliseconds = 0;
HIP_ASSERT(hipEventElapsedTime(&milliseconds, start, stop));
// Release device memory
HIP_ASSERT(hipFree(d_input1));
HIP_ASSERT(hipFree(d_output1));
// Release host memory
HIP_ASSERT(hipHostFree(h_input1));
HIP_ASSERT(hipHostFree(h_output1));
// Destroy streams
for(int i = 0; i < num_streams; ++i)
{
HIP_ASSERT(hipStreamDestroy(streams[i]));
}
return 0;
}
@@ -29,7 +29,6 @@ project(
VERSION 0.0.0)
find_package(rocprofiler-sdk REQUIRED)
find_package(rocDecode)
rocprofiler_configure_pytest_files(CONFIG pytest.ini COPY validate.py conftest.py)
@@ -41,9 +40,9 @@ set(hip-stream-display-env "${PRELOAD_ENV}")
add_test(
NAME rocprofv3-test-hip-stream-display-execute
COMMAND
$<TARGET_FILE:rocprofiler-sdk::rocprofv3> --kernel-rename -d
$<TARGET_FILE:rocprofiler-sdk::rocprofv3> --kernel-rename -s -d
${CMAKE_CURRENT_BINARY_DIR}/%tag%-trace -o out --output-format json pftrace
--log-level env -- $<TARGET_FILE:transpose>)
--log-level env -- $<TARGET_FILE:hip-streams>)
set_tests_properties(
rocprofv3-test-hip-stream-display-execute
@@ -56,14 +55,14 @@ set_tests_properties(
FAIL_REGULAR_EXPRESSION
"threw an exception"
DISABLED
$<NOT:$<TARGET_EXISTS:transpose>>)
$<NOT:$<TARGET_EXISTS:hip-streams>>)
add_test(
NAME rocprofv3-test-hip-stream-display-validate
COMMAND
${Python3_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/validate.py --json-input
${CMAKE_CURRENT_BINARY_DIR}/hip-stream-display/out_results.json --pftrace-input
${CMAKE_CURRENT_BINARY_DIR}/hip-stream-display/out_results.pftrace)
${CMAKE_CURRENT_BINARY_DIR}/hip-streams-trace/out_results.json --pftrace-input
${CMAKE_CURRENT_BINARY_DIR}/hip-streams-trace/out_results.pftrace)
set_tests_properties(
rocprofv3-test-hip-stream-display-validate
@@ -76,4 +75,4 @@ set_tests_properties(
FAIL_REGULAR_EXPRESSION
"AssertionError"
DISABLED
$<NOT:$<TARGET_EXISTS:transpose>>)
$<NOT:$<TARGET_EXISTS:hip-streams>>)
+11 -7
Ver fichero
@@ -54,15 +54,17 @@ def test_stream_trace(json_data):
buffer_records = data["buffer_records"]
kernel_dispatch_data = buffer_records["kernel_dispatch"]
memory_copies_data = buffer_records["memory_copies"]
memory_copies_data = buffer_records["memory_copy"]
assert len(kernel_dispatch_data) > 0
assert len(memory_copies_data) > 0
# Expect stream ids to be set to 1 or 2 for transpose executable
expected_stream_ids = set((1, 2))
# Expect stream ids to be set between 1 and 8 inclusive for transpose executable
expected_stream_ids = set([i for i in range(1, 9)])
# check buffering data
for titr in (kernel_dispatch_data, memory_copies_data):
for node in rocdecode_data:
stream_id_set = set()
for node in titr:
assert "size" in node
assert "kind" in node
assert "operation" in node
@@ -70,7 +72,7 @@ def test_stream_trace(json_data):
assert "end_timestamp" in node
assert "start_timestamp" in node
assert "thread_id" in node
assert "_stream_id" in node
assert "stream_id" in node
assert node.size > 0
assert node.thread_id > 0
@@ -78,13 +80,15 @@ def test_stream_trace(json_data):
assert node.end_timestamp > 0
assert node.start_timestamp < node.end_timestamp
assert node._stream_id.handle in expected_stream_ids
stream_id = node.stream_id.handle
stream_id_set.add(stream_id)
assert stream_id_set == expected_stream_ids
def test_perfetto_data(pftrace_data, json_data):
import rocprofiler_sdk.tests.rocprofv3 as rocprofv3
assert pftrace_data != None
assert pftrace_data.empty == False
rocprofv3.test_perfetto_data(
pftrace_data,
json_data,