From e49b501e9aa09d7e6a4413687e85f2bf5b21902a Mon Sep 17 00:00:00 2001 From: lloginov-amd Date: Mon, 19 Jan 2026 16:24:30 +0100 Subject: [PATCH] Add scratch memory support (#2211) --- .../source/lib/core/rocprofiler-sdk.cpp | 3 +- .../lib/core/trace_cache/cache_manager.hpp | 2 +- .../core/trace_cache/perfetto_processor.cpp | 66 ++++++++ .../core/trace_cache/perfetto_processor.hpp | 1 + .../lib/core/trace_cache/rocpd_processor.cpp | 120 +++++++++----- .../lib/core/trace_cache/rocpd_processor.hpp | 1 + .../lib/core/trace_cache/sample_processor.hpp | 18 +++ .../lib/core/trace_cache/sample_type.hpp | 78 +++++++++ .../rocprof-sys/library/rocprofiler-sdk.cpp | 152 +++++++++++++++++- .../library/rocprofiler-sdk/fwd.hpp | 12 +- 10 files changed, 398 insertions(+), 55 deletions(-) diff --git a/projects/rocprofiler-systems/source/lib/core/rocprofiler-sdk.cpp b/projects/rocprofiler-systems/source/lib/core/rocprofiler-sdk.cpp index 52b9f2607e..a0df719742 100644 --- a/projects/rocprofiler-systems/source/lib/core/rocprofiler-sdk.cpp +++ b/projects/rocprofiler-systems/source/lib/core/rocprofiler-sdk.cpp @@ -367,7 +367,6 @@ config_settings(const std::shared_ptr& _config) _skip_domains.emplace("kernel_dispatch"); _skip_domains.emplace("page_migration"); - _skip_domains.emplace("scratch_memory"); _add_operation_settings( "MARKER_API", callback_tracing_info[ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API], @@ -652,7 +651,7 @@ get_backtrace_operations(rocprofiler_callback_tracing_kind_t kindv) { if(callback_operation_option_names.count(kindv) == 0) { - LOG_CRITICAL("callback_operation_operation_names does not have value for {}", + LOG_CRITICAL("callback_operation_option_names does not have value for {}", static_cast(kindv)); ::rocprofsys::set_state(::rocprofsys::State::Finalized); std::abort(); diff --git a/projects/rocprofiler-systems/source/lib/core/trace_cache/cache_manager.hpp b/projects/rocprofiler-systems/source/lib/core/trace_cache/cache_manager.hpp index 47e8cd2a5a..98c1a2880c 100644 --- a/projects/rocprofiler-systems/source/lib/core/trace_cache/cache_manager.hpp +++ b/projects/rocprofiler-systems/source/lib/core/trace_cache/cache_manager.hpp @@ -41,7 +41,7 @@ using storage_parser_t = storage_parser; + backtrace_region_sample, scratch_memory_sample>; using buffer_storage_t = buffer_storage; diff --git a/projects/rocprofiler-systems/source/lib/core/trace_cache/perfetto_processor.cpp b/projects/rocprofiler-systems/source/lib/core/trace_cache/perfetto_processor.cpp index 92a25255e5..6303824aa8 100644 --- a/projects/rocprofiler-systems/source/lib/core/trace_cache/perfetto_processor.cpp +++ b/projects/rocprofiler-systems/source/lib/core/trace_cache/perfetto_processor.cpp @@ -540,6 +540,72 @@ perfetto_processor_t::handle([[maybe_unused]] const kernel_dispatch_sample& _kds #endif } +void +perfetto_processor_t::handle([[maybe_unused]] const scratch_memory_sample& _sms) +{ +#if ROCPROFSYS_USE_ROCM > 0 + auto _corr_id = _sms.correlation_id_internal; + auto _stream_id = _sms.stream_handle; + auto _queue_id_handle = _sms.queue_id_handle; + const auto& _t_info = thread_info::get(_sms.thread_id, SystemTID); + const auto _thread_id_sequent = _t_info->index_data->sequent_value; + auto _beg_ts = _sms.start_timestamp; + auto _end_ts = _sms.end_timestamp; + + auto _agent_device_id = + m_agent_manager.get_agent_by_handle(_sms.agent_id_handle).device_type_index; + auto _name = std::string{ m_metadata.get_buffer_name_info().at( + static_cast(_sms.kind), + static_cast(_sms.operation)) }; + +// Scratch memory samples from SDK versions prior to 7.0.2 do not include +// allocation_size field, so counter tracks are not needed +# if ROCPROFSYS_ROCM_VERSION >= 70002 + using counter_track = + perfetto_counter_track; + + if(!counter_track::exists(_agent_device_id)) + { + auto _track_desc_alloc_size = JOIN("", "GPU Scratch Memory [", _agent_device_id, + "] Thread ", _thread_id_sequent); + counter_track::emplace(_agent_device_id, _track_desc_alloc_size, "bytes"); + } + + if(_sms.operation == ROCPROFILER_SCRATCH_MEMORY_ALLOC) + { + TRACE_COUNTER("rocm_scratch_memory", counter_track::at(_agent_device_id, 0), + _beg_ts, _sms.allocation_size); + } +# endif + + auto _track_desc_events = [&]() { + return JOIN("", "GPU Scratch Memory Events Thread ", _thread_id_sequent); + }; + + const auto _track = + tracing::get_perfetto_track(category::rocm_scratch_memory{}, _track_desc_events); + + auto add_perfetto_annotations = [&](::perfetto::EventContext ctx) { + if(!m_use_annotations) return; + + annotate_perfetto(ctx, { { "begin_ns", _beg_ts }, + { "end_ns", _end_ts }, + { "corr_id", _corr_id }, + { "stream_id", _stream_id }, + { "queue", _queue_id_handle }, + { "allocation_size", _sms.allocation_size }, + { "agent_id", _agent_device_id }, + { "operation", _name }, + { "flags", _sms.flags } }); + }; + + tracing::push_perfetto(category::rocm_scratch_memory{}, _name.c_str(), _track, + _beg_ts, ::perfetto::Flow::ProcessScoped(_corr_id), + add_perfetto_annotations); + tracing::pop_perfetto(category::rocm_scratch_memory{}, "", _track, _end_ts); +#endif +} + void perfetto_processor_t::handle([[maybe_unused]] const memory_copy_sample& _mcs) { diff --git a/projects/rocprofiler-systems/source/lib/core/trace_cache/perfetto_processor.hpp b/projects/rocprofiler-systems/source/lib/core/trace_cache/perfetto_processor.hpp index 5ddb2410b6..4dd487a657 100644 --- a/projects/rocprofiler-systems/source/lib/core/trace_cache/perfetto_processor.hpp +++ b/projects/rocprofiler-systems/source/lib/core/trace_cache/perfetto_processor.hpp @@ -56,6 +56,7 @@ public: void finalize_processing(); void handle(const kernel_dispatch_sample& sample); + void handle(const scratch_memory_sample& sample); void handle(const memory_copy_sample& sample); void handle(const memory_allocate_sample& sample); void handle(const region_sample& sample); diff --git a/projects/rocprofiler-systems/source/lib/core/trace_cache/rocpd_processor.cpp b/projects/rocprofiler-systems/source/lib/core/trace_cache/rocpd_processor.cpp index 567b1de230..cded11c3a8 100644 --- a/projects/rocprofiler-systems/source/lib/core/trace_cache/rocpd_processor.cpp +++ b/projects/rocprofiler-systems/source/lib/core/trace_cache/rocpd_processor.cpp @@ -66,6 +66,37 @@ get_handle_from_code_object( # endif } #endif + +#if ROCPROFSYS_USE_ROCM > 0 +using memory_operation = std::string; +using memory_type = std::string; +std::pair +parse_memory_operation_name(std::string_view memory_operation_name) +{ + static const std::unordered_map> + parsing_map{ + { "MEMORY_ALLOCATION_NONE", { "NONE", "REAL" } }, + { "MEMORY_ALLOCATION_ALLOCATE", { "ALLOC", "REAL" } }, + { "MEMORY_ALLOCATION_VMEM_ALLOCATE", { "ALLOC", "VIRTUAL" } }, + { "MEMORY_ALLOCATION_FREE", { "FREE", "REAL" } }, + { "MEMORY_ALLOCATION_VMEM_FREE", { "FREE", "VIRTUAL" } }, + { "SCRATCH_MEMORY_NONE", { "NONE", "SCRATCH" } }, + { "SCRATCH_MEMORY_ALLOC", { "ALLOC", "SCRATCH" } }, + { "SCRATCH_MEMORY_FREE", { "FREE", "SCRATCH" } }, + { "SCRATCH_MEMORY_ASYNC_RECLAIM", { "ASYNC_RECLAIM", "SCRATCH" } }, + }; + + auto item = parsing_map.find(memory_operation_name); + if(item == parsing_map.end()) + { + LOG_WARNING("Unknown memory operation name: {}", memory_operation_name); + return { "UNKNOWN", "UNKNOWN" }; + } + + return item->second; +} +#endif } // namespace void @@ -110,6 +141,46 @@ rocpd_processor_t::handle([[maybe_unused]] const kernel_dispatch_sample& _kds) #endif } +void +rocpd_processor_t::handle([[maybe_unused]] const scratch_memory_sample& _sms) +{ +#if ROCPROFSYS_USE_ROCM > 0 + auto& n_info = node_info::get_instance(); + auto process = m_metadata->get_process_info(); + + const auto* _name = m_metadata->get_buffer_name_info().at( + static_cast(_sms.kind), + static_cast(_sms.operation)); + + auto agent_primary_key = + m_agent_manager->get_agent_by_handle(_sms.agent_id_handle).base_id; + + auto thread_primary_key = + m_data_processor->map_thread_id_to_primary_key(_sms.thread_id); + + auto category_primary_key = m_data_processor->insert_string( + trait::name::value); + + auto stack_id = _sms.correlation_id_internal; + auto parent_stack_id = _sms.correlation_id_ancestor; + auto correlation_id = 0; + auto address_value = 0; + + auto event_primary_key = m_data_processor->insert_event( + category_primary_key, stack_id, parent_stack_id, correlation_id); + + auto [memory_operation, memory_type] = parse_memory_operation_name(_name); + + auto extdata_json_str = JOIN("", "{\"flags\": ", _sms.flags, "}"); + + m_data_processor->insert_memory_alloc( + n_info.id, process.pid, thread_primary_key, agent_primary_key, + memory_operation.c_str(), memory_type.c_str(), _sms.start_timestamp, + _sms.end_timestamp, address_value, _sms.allocation_size, _sms.queue_id_handle, + _sms.stream_handle, event_primary_key, extdata_json_str.c_str()); +#endif +} + void rocpd_processor_t::handle([[maybe_unused]] const memory_copy_sample& _mcs) { @@ -153,46 +224,6 @@ void rocpd_processor_t::handle([[maybe_unused]] const memory_allocate_sample& _mas) { #if ROCPROFSYS_USE_ROCM > 0 && (ROCPROFILER_VERSION >= 600) - static auto memtype_to_db = - [](std::string_view memory_type) -> std::pair { - constexpr auto MEMORY_PREFIX = std::string_view{ "MEMORY_ALLOCATION_" }; - constexpr auto SCRATCH_PREFIX = std::string_view{ "SCRATCH_MEMORY_" }; - constexpr auto VMEM_PREFIX = std::string_view{ "VMEM_" }; - constexpr auto ASYNC_PREFIX = std::string_view{ "ASYNC_" }; - - std::string _type; - std::string _level; - if(memory_type.find(MEMORY_PREFIX) == 0) - { - _type = memory_type.substr(MEMORY_PREFIX.length()); - if(_type.find(VMEM_PREFIX) == 0) - { - _type = _type.substr(VMEM_PREFIX.length()); - _level = "VIRTUAL"; - } - else - { - _level = "REAL"; - } - } - else if(memory_type.find(SCRATCH_PREFIX) == 0) - { - _type = memory_type.substr(SCRATCH_PREFIX.length()); - _level = "SCRATCH"; - if(memory_type.find(ASYNC_PREFIX) == 0) - { - _type = memory_type.substr(ASYNC_PREFIX.length()); // RECLAIM - } - } - - if(_type == "ALLOCATE") - { - _type = "ALLOC"; - } - - return std::make_pair(_type, _level); - }; - auto& n_info = node_info::get_instance(); auto process = m_metadata->get_process_info(); auto thread_primary_key = @@ -210,7 +241,7 @@ rocpd_processor_t::handle([[maybe_unused]] const memory_allocate_sample& _mas) static_cast(_mas.kind), static_cast(_mas.operation)); - auto [type, level] = memtype_to_db(_name); + auto [memory_operation, memory_type] = parse_memory_operation_name(_name); auto stack_id = _mas.correlation_id_internal; auto parent_stack_id = _mas.correlation_id_ancestor; @@ -224,9 +255,10 @@ rocpd_processor_t::handle([[maybe_unused]] const memory_allocate_sample& _mas) category_primary_key, stack_id, parent_stack_id, correlation_id); m_data_processor->insert_memory_alloc( - n_info.id, process.pid, thread_primary_key, agent_primary_key, type.c_str(), - level.c_str(), _mas.start_timestamp, _mas.end_timestamp, _mas.address_value, - _mas.allocation_size, queue_id, _mas.stream_handle, event_primary_key); + n_info.id, process.pid, thread_primary_key, agent_primary_key, + memory_operation.c_str(), memory_type.c_str(), _mas.start_timestamp, + _mas.end_timestamp, _mas.address_value, _mas.allocation_size, queue_id, + _mas.stream_handle, event_primary_key); } #endif } diff --git a/projects/rocprofiler-systems/source/lib/core/trace_cache/rocpd_processor.hpp b/projects/rocprofiler-systems/source/lib/core/trace_cache/rocpd_processor.hpp index 68f299a3e6..78c1d25670 100644 --- a/projects/rocprofiler-systems/source/lib/core/trace_cache/rocpd_processor.hpp +++ b/projects/rocprofiler-systems/source/lib/core/trace_cache/rocpd_processor.hpp @@ -45,6 +45,7 @@ public: void finalize_processing(); void handle(const kernel_dispatch_sample& sample); + void handle(const scratch_memory_sample& sample); void handle(const memory_copy_sample& sample); void handle(const memory_allocate_sample& sample); void handle(const region_sample& sample); diff --git a/projects/rocprofiler-systems/source/lib/core/trace_cache/sample_processor.hpp b/projects/rocprofiler-systems/source/lib/core/trace_cache/sample_processor.hpp index b55d80740b..b94232ef16 100644 --- a/projects/rocprofiler-systems/source/lib/core/trace_cache/sample_processor.hpp +++ b/projects/rocprofiler-systems/source/lib/core/trace_cache/sample_processor.hpp @@ -43,6 +43,11 @@ struct processor_t static_cast(this)->handle(sample); } + void handle(const scratch_memory_sample& sample) + { + static_cast(this)->handle(sample); + } + void handle(const memory_copy_sample& sample) { static_cast(this)->handle(sample); @@ -84,6 +89,7 @@ protected: struct processor_view_t { using kernel_dispatch_fn_t = void (*)(void*, const kernel_dispatch_sample&) noexcept; + using scratch_memory_fn_t = void (*)(void*, const scratch_memory_sample&) noexcept; using memory_copy_fn_t = void (*)(void*, const memory_copy_sample&) noexcept; #if(ROCPROFILER_VERSION >= 600) using memory_allocate_fn_t = void (*)(void*, const memory_allocate_sample&) noexcept; @@ -101,6 +107,7 @@ struct processor_view_t struct vtable_t { kernel_dispatch_fn_t handle_kernel_dispatch; + scratch_memory_fn_t handle_scratch_memory; memory_copy_fn_t handle_memory_copy; #if(ROCPROFILER_VERSION >= 600) memory_allocate_fn_t handle_memory_allocate; @@ -134,6 +141,11 @@ struct processor_view_t m_vtable->handle_kernel_dispatch(m_object, sample); } + ROCPROFSYS_INLINE void handle(const scratch_memory_sample& sample) const noexcept + { + m_vtable->handle_scratch_memory(m_object, sample); + } + ROCPROFSYS_INLINE void handle(const memory_copy_sample& sample) const noexcept { m_vtable->handle_memory_copy(m_object, sample); @@ -194,6 +206,9 @@ private: +[](void* obj, const kernel_dispatch_sample& sample) noexcept { static_cast(obj)->handle(sample); }, + +[](void* obj, const scratch_memory_sample& sample) noexcept { + static_cast(obj)->handle(sample); + }, +[](void* obj, const memory_copy_sample& sample) noexcept { static_cast(obj)->handle(sample); }, @@ -275,6 +290,9 @@ struct sample_processor_t case type_identifier_t::kernel_dispatch: handle_sample(static_cast(sample)); break; + case type_identifier_t::scratch_memory: + handle_sample(static_cast(sample)); + break; case type_identifier_t::memory_copy: handle_sample(static_cast(sample)); break; diff --git a/projects/rocprofiler-systems/source/lib/core/trace_cache/sample_type.hpp b/projects/rocprofiler-systems/source/lib/core/trace_cache/sample_type.hpp index 9bd20bfc8c..d1119a44e4 100644 --- a/projects/rocprofiler-systems/source/lib/core/trace_cache/sample_type.hpp +++ b/projects/rocprofiler-systems/source/lib/core/trace_cache/sample_type.hpp @@ -46,6 +46,7 @@ enum class type_identifier_t : uint32_t amd_smi_sample = 0x0006, cpu_freq_sample = 0x0007, backtrace_region_sample = 0x0008, + scratch_memory = 0x0009, fragmented_space = 0xFFFF }; @@ -148,6 +149,83 @@ get_size(const kernel_dispatch_sample& item) item.grid_size_z, static_cast(item.stream_handle)); } +struct scratch_memory_sample : cacheable_t +{ + static constexpr type_identifier_t type_identifier = + type_identifier_t::scratch_memory; + + scratch_memory_sample() = default; + scratch_memory_sample(uint64_t _start_timestamp, uint64_t _end_timestamp, + uint64_t _thread_id, uint64_t _agent_id_handle, + uint64_t _queue_id_handle, int32_t _kind, int32_t _operation, + int32_t _flags, uint64_t _allocation_size, + uint64_t _correlation_id_internal, + uint64_t _correlation_id_ancestor, size_t _stream_handle) + : start_timestamp(_start_timestamp) + , end_timestamp(_end_timestamp) + , thread_id(_thread_id) + , agent_id_handle(_agent_id_handle) + , queue_id_handle(_queue_id_handle) + , kind(_kind) + , operation(_operation) + , flags(_flags) + , allocation_size(_allocation_size) + , correlation_id_internal(_correlation_id_internal) + , correlation_id_ancestor(_correlation_id_ancestor) + , stream_handle(_stream_handle) + {} + + uint64_t start_timestamp; + uint64_t end_timestamp; + uint64_t thread_id; + uint64_t agent_id_handle; + uint64_t queue_id_handle; + int32_t kind; + int32_t operation; + int32_t flags; + uint64_t allocation_size; + uint64_t correlation_id_internal; + uint64_t correlation_id_ancestor; + size_t stream_handle; +}; + +template <> +inline void +serialize(uint8_t* buffer, const scratch_memory_sample& item) +{ + utility::store_value(buffer, item.start_timestamp, item.end_timestamp, item.thread_id, + item.agent_id_handle, item.queue_id_handle, item.kind, + item.operation, item.flags, item.allocation_size, + item.correlation_id_internal, item.correlation_id_ancestor, + static_cast(item.stream_handle)); +} + +template <> +inline scratch_memory_sample +deserialize(uint8_t*& buffer) +{ + scratch_memory_sample item; + uint64_t stream_handle; + utility::parse_value(buffer, item.start_timestamp, item.end_timestamp, item.thread_id, + item.agent_id_handle, item.queue_id_handle, item.kind, + item.operation, item.flags, item.allocation_size, + item.correlation_id_internal, item.correlation_id_ancestor, + stream_handle); + item.stream_handle = stream_handle; + return item; +} + +template <> +inline size_t +get_size(const scratch_memory_sample& item) +{ + return utility::get_size(item.start_timestamp, item.end_timestamp, item.thread_id, + item.agent_id_handle, item.queue_id_handle, item.kind, + item.operation, item.flags, item.allocation_size, + item.correlation_id_internal, item.correlation_id_ancestor, + static_cast(item.stream_handle)); +} + struct memory_copy_sample : cacheable_t { static constexpr type_identifier_t type_identifier = type_identifier_t::memory_copy; diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp index d30368650b..0a337b2400 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp @@ -567,6 +567,18 @@ get_mem_alloc_address( } #endif +uint64_t +get_scratch_mem_alloc_size( + [[maybe_unused]] const rocprofiler_buffer_tracing_scratch_memory_record_t& record) +{ +// Scratch memory samples from SDK versions prior to 7.0.2 do not include allocation_size +#if(ROCPROFSYS_USE_ROCM > 0 && ROCPROFSYS_ROCM_VERSION >= 70002) + return record.allocation_size; +#else + return 0; +#endif +} + void cache_region(const rocprofiler_callback_tracing_record_t* record, const rocprofiler_timestamp_t start_timestamp, @@ -615,13 +627,26 @@ cache_kernel_dispatch(rocprofiler_buffer_tracing_kernel_dispatch_record_t* recor record->dispatch_info.grid_size.z, stream_handle }); } +void +cache_scratch_memory(rocprofiler_buffer_tracing_scratch_memory_record_t* record, + uint64_t stream_handle) +{ + trace_cache::get_metadata_registry().add_stream(stream_handle); + trace_cache::get_buffer_storage().store(trace_cache::scratch_memory_sample{ + record->start_timestamp, record->end_timestamp, record->thread_id, + record->agent_id.handle, record->queue_id.handle, + static_cast(record->kind), static_cast(record->operation), + static_cast(record->flags), get_scratch_mem_alloc_size(*record), + record->correlation_id.internal, get_parent_stack_id(record->correlation_id), + stream_handle }); +} + void cache_memory_copy(rocprofiler_buffer_tracing_memory_copy_record_t* record, uint64_t stream_handle) { trace_cache::get_metadata_registry().add_stream(stream_handle); trace_cache::get_buffer_storage().store(trace_cache::memory_copy_sample{ - record->start_timestamp, record->end_timestamp, record->thread_id, record->dst_agent_id.handle, record->src_agent_id.handle, static_cast(record->kind), static_cast(record->operation), @@ -1759,6 +1784,120 @@ tool_tracing_buffered(rocprofiler_context_id_t /*context*/, } } } + else if(header->kind == ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY) + { + auto* record = + static_cast( + header->payload); + + bool _group_by_queue = _default_group_by_queue; + + const auto* agent = tool_data->get_gpu_tool_agent(record->agent_id); + auto device_id = static_cast(agent->device_id); + + const auto& t_info = thread_info::get(record->thread_id, SystemTID); + auto thread_id_sequent = t_info->index_data->sequent_value; + + auto _corr_id = record->correlation_id.internal; + auto _beg_ns = record->start_timestamp; + auto _end_ns = record->end_timestamp; + auto _name = + tool_data->buffered_tracing_info.at(record->kind, record->operation); + + auto _stream_id = get_stream_id(record).handle; + if(_stream_id == 0) + { + // Scratch memory event is not associated with a HIP stream + _group_by_queue = true; + } + + { + auto track_name = JOIN("", "GPU Scratch Memory [", device_id, + "] Thread ", record->thread_id); + cache_category(); + cache_add_thread_info(record->thread_id); + cache_add_track(track_name.c_str(), record->thread_id); + cache_scratch_memory(record, _stream_id); + } + + if(get_use_timemory()) + { + auto _bundle = kernel_dispatch_bundle_t{ _name }; + + _bundle.push(thread_id_sequent).start().stop(); + _bundle.get([_beg_ns, _end_ns](tim::component::wall_clock* _wc) { + _wc->set_value(_end_ns - _beg_ns); + _wc->set_accum(_end_ns - _beg_ns); + }); + _bundle.pop(); + } + + if(get_use_perfetto()) + { +// Scratch memory samples from SDK versions prior to 7.0.2 do not include +// allocation_size field, so counter tracks are not needed +#if(ROCPROFSYS_USE_ROCM > 0 && ROCPROFSYS_ROCM_VERSION >= 70002) + using counter_track = perfetto_counter_track< + rocprofiler_buffer_tracing_scratch_memory_record_t>; + + if(!counter_track::exists(device_id)) + { + auto track_name_alloc_size = + JOIN("", "GPU Scratch Memory [", device_id, "] (S) Thread ", + thread_id_sequent); + counter_track::emplace(device_id, track_name_alloc_size, "bytes"); + } + + if(record->operation == ROCPROFILER_SCRATCH_MEMORY_ALLOC) + { + TRACE_COUNTER("rocm_scratch_memory", + counter_track::at(device_id, 0), _beg_ns, + record->allocation_size); + } +#endif + auto add_perfetto_annotations = [&](::perfetto::EventContext ctx) { + if(config::get_perfetto_annotations()) + { + tracing::add_perfetto_annotation(ctx, "begin_ns", _beg_ns); + tracing::add_perfetto_annotation(ctx, "end_ns", _end_ns); + tracing::add_perfetto_annotation(ctx, "corr_id", _corr_id); + tracing::add_perfetto_annotation(ctx, "stream_id", + _stream_id); + } + }; + + if(_group_by_queue) + { + auto track_name_events = [&]() { + return JOIN("", "GPU Scratch Memory (S) Events Thread ", + thread_id_sequent); + }; + const auto _track = tracing::get_perfetto_track( + category::rocm_scratch_memory{}, track_name_events); + + tracing::push_perfetto(category::rocm_scratch_memory{}, + _name.data(), _track, _beg_ns, + ::perfetto::Flow::ProcessScoped(_corr_id), + add_perfetto_annotations); + + tracing::pop_perfetto(category::rocm_scratch_memory{}, "", _track, + _end_ns); + } + else + { + const auto _track = tracing::get_perfetto_track( + category::rocm_hip_stream{}, _track_desc_stream, _stream_id); + + tracing::push_perfetto(category::rocm_hip_stream{}, _name.data(), + _track, _beg_ns, + ::perfetto::Flow::ProcessScoped(_corr_id), + add_perfetto_annotations); + + tracing::pop_perfetto(category::rocm_hip_stream{}, "", _track, + _end_ns); + } + } + } else if(header->kind == ROCPROFILER_BUFFER_TRACING_MEMORY_COPY) { auto* record = @@ -2249,6 +2388,17 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* user_data) _data->primary_ctx, ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, nullptr, 0, _data->memory_copy_buffer)); } + if(_buffered_domain.count(ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY) > 0) + { + ROCPROFILER_CALL(rocprofiler_create_buffer( + _data->primary_ctx, buffer_size, watermark, + ROCPROFILER_BUFFER_POLICY_LOSSLESS, tool_tracing_buffered, tool_data, + &_data->scratch_memory_buffer)); + + ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service( + _data->primary_ctx, ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY, nullptr, 0, + _data->scratch_memory_buffer)); + } #if(ROCPROFILER_VERSION >= 600) if(_buffered_domain.count(ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION) > 0) diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/fwd.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/fwd.hpp index fa75982db4..48d4714044 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/fwd.hpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/fwd.hpp @@ -122,7 +122,7 @@ using backtrace_operation_map_t = struct client_data { - static constexpr size_t num_buffers = 4; + static constexpr size_t num_buffers = 5; static constexpr size_t num_contexts = 2; using buffer_name_info_t = rocprofiler::sdk::buffer_name_info_t; @@ -138,6 +138,7 @@ struct client_data rocprofiler_context_id_t primary_ctx = { 0 }; rocprofiler_context_id_t counter_ctx = { 0 }; rocprofiler_buffer_id_t kernel_dispatch_buffer = { 0 }; + rocprofiler_buffer_id_t scratch_memory_buffer = { 0 }; rocprofiler_buffer_id_t memory_copy_buffer = { 0 }; rocprofiler_buffer_id_t memory_alloc_buffer = { 0 }; rocprofiler_buffer_id_t counter_collection_buffer = { 0 }; @@ -179,12 +180,9 @@ client_data::get_contexts() const inline client_data::buffer_id_vec_t client_data::get_buffers() const { - return buffer_id_vec_t{ - kernel_dispatch_buffer, - memory_copy_buffer, - memory_alloc_buffer, - counter_collection_buffer, - }; + return buffer_id_vec_t{ kernel_dispatch_buffer, scratch_memory_buffer, + memory_copy_buffer, memory_alloc_buffer, + counter_collection_buffer }; } inline const rocprofsys_agent_t*