Add scratch memory support (#2211)
이 커밋은 다음에 포함됨:
@@ -367,7 +367,6 @@ config_settings(const std::shared_ptr<settings>& _config)
|
||||
|
||||
_skip_domains.emplace("kernel_dispatch");
|
||||
_skip_domains.emplace("page_migration");
|
||||
_skip_domains.emplace("scratch_memory");
|
||||
|
||||
_add_operation_settings(
|
||||
"MARKER_API", callback_tracing_info[ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API],
|
||||
@@ -652,7 +651,7 @@ get_backtrace_operations(rocprofiler_callback_tracing_kind_t kindv)
|
||||
{
|
||||
if(callback_operation_option_names.count(kindv) == 0)
|
||||
{
|
||||
LOG_CRITICAL("callback_operation_operation_names does not have value for {}",
|
||||
LOG_CRITICAL("callback_operation_option_names does not have value for {}",
|
||||
static_cast<int>(kindv));
|
||||
::rocprofsys::set_state(::rocprofsys::State::Finalized);
|
||||
std::abort();
|
||||
|
||||
@@ -41,7 +41,7 @@ using storage_parser_t =
|
||||
storage_parser<type_identifier_t, kernel_dispatch_sample, memory_copy_sample,
|
||||
memory_allocate_sample, region_sample, in_time_sample,
|
||||
pmc_event_with_sample, amd_smi_sample, cpu_freq_sample,
|
||||
backtrace_region_sample>;
|
||||
backtrace_region_sample, scratch_memory_sample>;
|
||||
|
||||
using buffer_storage_t = buffer_storage<flush_worker_factory_t, type_identifier_t>;
|
||||
|
||||
|
||||
@@ -540,6 +540,72 @@ perfetto_processor_t::handle([[maybe_unused]] const kernel_dispatch_sample& _kds
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
perfetto_processor_t::handle([[maybe_unused]] const scratch_memory_sample& _sms)
|
||||
{
|
||||
#if ROCPROFSYS_USE_ROCM > 0
|
||||
auto _corr_id = _sms.correlation_id_internal;
|
||||
auto _stream_id = _sms.stream_handle;
|
||||
auto _queue_id_handle = _sms.queue_id_handle;
|
||||
const auto& _t_info = thread_info::get(_sms.thread_id, SystemTID);
|
||||
const auto _thread_id_sequent = _t_info->index_data->sequent_value;
|
||||
auto _beg_ts = _sms.start_timestamp;
|
||||
auto _end_ts = _sms.end_timestamp;
|
||||
|
||||
auto _agent_device_id =
|
||||
m_agent_manager.get_agent_by_handle(_sms.agent_id_handle).device_type_index;
|
||||
auto _name = std::string{ m_metadata.get_buffer_name_info().at(
|
||||
static_cast<rocprofiler_buffer_tracing_kind_t>(_sms.kind),
|
||||
static_cast<rocprofiler_tracing_operation_t>(_sms.operation)) };
|
||||
|
||||
// Scratch memory samples from SDK versions prior to 7.0.2 do not include
|
||||
// allocation_size field, so counter tracks are not needed
|
||||
# if ROCPROFSYS_ROCM_VERSION >= 70002
|
||||
using counter_track =
|
||||
perfetto_counter_track<rocprofiler_buffer_tracing_scratch_memory_record_t>;
|
||||
|
||||
if(!counter_track::exists(_agent_device_id))
|
||||
{
|
||||
auto _track_desc_alloc_size = JOIN("", "GPU Scratch Memory [", _agent_device_id,
|
||||
"] Thread ", _thread_id_sequent);
|
||||
counter_track::emplace(_agent_device_id, _track_desc_alloc_size, "bytes");
|
||||
}
|
||||
|
||||
if(_sms.operation == ROCPROFILER_SCRATCH_MEMORY_ALLOC)
|
||||
{
|
||||
TRACE_COUNTER("rocm_scratch_memory", counter_track::at(_agent_device_id, 0),
|
||||
_beg_ts, _sms.allocation_size);
|
||||
}
|
||||
# endif
|
||||
|
||||
auto _track_desc_events = [&]() {
|
||||
return JOIN("", "GPU Scratch Memory Events Thread ", _thread_id_sequent);
|
||||
};
|
||||
|
||||
const auto _track =
|
||||
tracing::get_perfetto_track(category::rocm_scratch_memory{}, _track_desc_events);
|
||||
|
||||
auto add_perfetto_annotations = [&](::perfetto::EventContext ctx) {
|
||||
if(!m_use_annotations) return;
|
||||
|
||||
annotate_perfetto(ctx, { { "begin_ns", _beg_ts },
|
||||
{ "end_ns", _end_ts },
|
||||
{ "corr_id", _corr_id },
|
||||
{ "stream_id", _stream_id },
|
||||
{ "queue", _queue_id_handle },
|
||||
{ "allocation_size", _sms.allocation_size },
|
||||
{ "agent_id", _agent_device_id },
|
||||
{ "operation", _name },
|
||||
{ "flags", _sms.flags } });
|
||||
};
|
||||
|
||||
tracing::push_perfetto(category::rocm_scratch_memory{}, _name.c_str(), _track,
|
||||
_beg_ts, ::perfetto::Flow::ProcessScoped(_corr_id),
|
||||
add_perfetto_annotations);
|
||||
tracing::pop_perfetto(category::rocm_scratch_memory{}, "", _track, _end_ts);
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
perfetto_processor_t::handle([[maybe_unused]] const memory_copy_sample& _mcs)
|
||||
{
|
||||
|
||||
@@ -56,6 +56,7 @@ public:
|
||||
void finalize_processing();
|
||||
|
||||
void handle(const kernel_dispatch_sample& sample);
|
||||
void handle(const scratch_memory_sample& sample);
|
||||
void handle(const memory_copy_sample& sample);
|
||||
void handle(const memory_allocate_sample& sample);
|
||||
void handle(const region_sample& sample);
|
||||
|
||||
@@ -66,6 +66,37 @@ get_handle_from_code_object(
|
||||
# endif
|
||||
}
|
||||
#endif
|
||||
|
||||
#if ROCPROFSYS_USE_ROCM > 0
|
||||
using memory_operation = std::string;
|
||||
using memory_type = std::string;
|
||||
std::pair<memory_operation, memory_type>
|
||||
parse_memory_operation_name(std::string_view memory_operation_name)
|
||||
{
|
||||
static const std::unordered_map<std::string_view,
|
||||
std::pair<memory_operation, memory_type>>
|
||||
parsing_map{
|
||||
{ "MEMORY_ALLOCATION_NONE", { "NONE", "REAL" } },
|
||||
{ "MEMORY_ALLOCATION_ALLOCATE", { "ALLOC", "REAL" } },
|
||||
{ "MEMORY_ALLOCATION_VMEM_ALLOCATE", { "ALLOC", "VIRTUAL" } },
|
||||
{ "MEMORY_ALLOCATION_FREE", { "FREE", "REAL" } },
|
||||
{ "MEMORY_ALLOCATION_VMEM_FREE", { "FREE", "VIRTUAL" } },
|
||||
{ "SCRATCH_MEMORY_NONE", { "NONE", "SCRATCH" } },
|
||||
{ "SCRATCH_MEMORY_ALLOC", { "ALLOC", "SCRATCH" } },
|
||||
{ "SCRATCH_MEMORY_FREE", { "FREE", "SCRATCH" } },
|
||||
{ "SCRATCH_MEMORY_ASYNC_RECLAIM", { "ASYNC_RECLAIM", "SCRATCH" } },
|
||||
};
|
||||
|
||||
auto item = parsing_map.find(memory_operation_name);
|
||||
if(item == parsing_map.end())
|
||||
{
|
||||
LOG_WARNING("Unknown memory operation name: {}", memory_operation_name);
|
||||
return { "UNKNOWN", "UNKNOWN" };
|
||||
}
|
||||
|
||||
return item->second;
|
||||
}
|
||||
#endif
|
||||
} // namespace
|
||||
|
||||
void
|
||||
@@ -110,6 +141,46 @@ rocpd_processor_t::handle([[maybe_unused]] const kernel_dispatch_sample& _kds)
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
rocpd_processor_t::handle([[maybe_unused]] const scratch_memory_sample& _sms)
|
||||
{
|
||||
#if ROCPROFSYS_USE_ROCM > 0
|
||||
auto& n_info = node_info::get_instance();
|
||||
auto process = m_metadata->get_process_info();
|
||||
|
||||
const auto* _name = m_metadata->get_buffer_name_info().at(
|
||||
static_cast<rocprofiler_buffer_tracing_kind_t>(_sms.kind),
|
||||
static_cast<rocprofiler_tracing_operation_t>(_sms.operation));
|
||||
|
||||
auto agent_primary_key =
|
||||
m_agent_manager->get_agent_by_handle(_sms.agent_id_handle).base_id;
|
||||
|
||||
auto thread_primary_key =
|
||||
m_data_processor->map_thread_id_to_primary_key(_sms.thread_id);
|
||||
|
||||
auto category_primary_key = m_data_processor->insert_string(
|
||||
trait::name<category::rocm_scratch_memory>::value);
|
||||
|
||||
auto stack_id = _sms.correlation_id_internal;
|
||||
auto parent_stack_id = _sms.correlation_id_ancestor;
|
||||
auto correlation_id = 0;
|
||||
auto address_value = 0;
|
||||
|
||||
auto event_primary_key = m_data_processor->insert_event(
|
||||
category_primary_key, stack_id, parent_stack_id, correlation_id);
|
||||
|
||||
auto [memory_operation, memory_type] = parse_memory_operation_name(_name);
|
||||
|
||||
auto extdata_json_str = JOIN("", "{\"flags\": ", _sms.flags, "}");
|
||||
|
||||
m_data_processor->insert_memory_alloc(
|
||||
n_info.id, process.pid, thread_primary_key, agent_primary_key,
|
||||
memory_operation.c_str(), memory_type.c_str(), _sms.start_timestamp,
|
||||
_sms.end_timestamp, address_value, _sms.allocation_size, _sms.queue_id_handle,
|
||||
_sms.stream_handle, event_primary_key, extdata_json_str.c_str());
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
rocpd_processor_t::handle([[maybe_unused]] const memory_copy_sample& _mcs)
|
||||
{
|
||||
@@ -153,46 +224,6 @@ void
|
||||
rocpd_processor_t::handle([[maybe_unused]] const memory_allocate_sample& _mas)
|
||||
{
|
||||
#if ROCPROFSYS_USE_ROCM > 0 && (ROCPROFILER_VERSION >= 600)
|
||||
static auto memtype_to_db =
|
||||
[](std::string_view memory_type) -> std::pair<std::string, std::string> {
|
||||
constexpr auto MEMORY_PREFIX = std::string_view{ "MEMORY_ALLOCATION_" };
|
||||
constexpr auto SCRATCH_PREFIX = std::string_view{ "SCRATCH_MEMORY_" };
|
||||
constexpr auto VMEM_PREFIX = std::string_view{ "VMEM_" };
|
||||
constexpr auto ASYNC_PREFIX = std::string_view{ "ASYNC_" };
|
||||
|
||||
std::string _type;
|
||||
std::string _level;
|
||||
if(memory_type.find(MEMORY_PREFIX) == 0)
|
||||
{
|
||||
_type = memory_type.substr(MEMORY_PREFIX.length());
|
||||
if(_type.find(VMEM_PREFIX) == 0)
|
||||
{
|
||||
_type = _type.substr(VMEM_PREFIX.length());
|
||||
_level = "VIRTUAL";
|
||||
}
|
||||
else
|
||||
{
|
||||
_level = "REAL";
|
||||
}
|
||||
}
|
||||
else if(memory_type.find(SCRATCH_PREFIX) == 0)
|
||||
{
|
||||
_type = memory_type.substr(SCRATCH_PREFIX.length());
|
||||
_level = "SCRATCH";
|
||||
if(memory_type.find(ASYNC_PREFIX) == 0)
|
||||
{
|
||||
_type = memory_type.substr(ASYNC_PREFIX.length()); // RECLAIM
|
||||
}
|
||||
}
|
||||
|
||||
if(_type == "ALLOCATE")
|
||||
{
|
||||
_type = "ALLOC";
|
||||
}
|
||||
|
||||
return std::make_pair(_type, _level);
|
||||
};
|
||||
|
||||
auto& n_info = node_info::get_instance();
|
||||
auto process = m_metadata->get_process_info();
|
||||
auto thread_primary_key =
|
||||
@@ -210,7 +241,7 @@ rocpd_processor_t::handle([[maybe_unused]] const memory_allocate_sample& _mas)
|
||||
static_cast<rocprofiler_buffer_tracing_kind_t>(_mas.kind),
|
||||
static_cast<rocprofiler_tracing_operation_t>(_mas.operation));
|
||||
|
||||
auto [type, level] = memtype_to_db(_name);
|
||||
auto [memory_operation, memory_type] = parse_memory_operation_name(_name);
|
||||
|
||||
auto stack_id = _mas.correlation_id_internal;
|
||||
auto parent_stack_id = _mas.correlation_id_ancestor;
|
||||
@@ -224,9 +255,10 @@ rocpd_processor_t::handle([[maybe_unused]] const memory_allocate_sample& _mas)
|
||||
category_primary_key, stack_id, parent_stack_id, correlation_id);
|
||||
|
||||
m_data_processor->insert_memory_alloc(
|
||||
n_info.id, process.pid, thread_primary_key, agent_primary_key, type.c_str(),
|
||||
level.c_str(), _mas.start_timestamp, _mas.end_timestamp, _mas.address_value,
|
||||
_mas.allocation_size, queue_id, _mas.stream_handle, event_primary_key);
|
||||
n_info.id, process.pid, thread_primary_key, agent_primary_key,
|
||||
memory_operation.c_str(), memory_type.c_str(), _mas.start_timestamp,
|
||||
_mas.end_timestamp, _mas.address_value, _mas.allocation_size, queue_id,
|
||||
_mas.stream_handle, event_primary_key);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -45,6 +45,7 @@ public:
|
||||
void finalize_processing();
|
||||
|
||||
void handle(const kernel_dispatch_sample& sample);
|
||||
void handle(const scratch_memory_sample& sample);
|
||||
void handle(const memory_copy_sample& sample);
|
||||
void handle(const memory_allocate_sample& sample);
|
||||
void handle(const region_sample& sample);
|
||||
|
||||
@@ -43,6 +43,11 @@ struct processor_t
|
||||
static_cast<T*>(this)->handle(sample);
|
||||
}
|
||||
|
||||
void handle(const scratch_memory_sample& sample)
|
||||
{
|
||||
static_cast<T*>(this)->handle(sample);
|
||||
}
|
||||
|
||||
void handle(const memory_copy_sample& sample)
|
||||
{
|
||||
static_cast<T*>(this)->handle(sample);
|
||||
@@ -84,6 +89,7 @@ protected:
|
||||
struct processor_view_t
|
||||
{
|
||||
using kernel_dispatch_fn_t = void (*)(void*, const kernel_dispatch_sample&) noexcept;
|
||||
using scratch_memory_fn_t = void (*)(void*, const scratch_memory_sample&) noexcept;
|
||||
using memory_copy_fn_t = void (*)(void*, const memory_copy_sample&) noexcept;
|
||||
#if(ROCPROFILER_VERSION >= 600)
|
||||
using memory_allocate_fn_t = void (*)(void*, const memory_allocate_sample&) noexcept;
|
||||
@@ -101,6 +107,7 @@ struct processor_view_t
|
||||
struct vtable_t
|
||||
{
|
||||
kernel_dispatch_fn_t handle_kernel_dispatch;
|
||||
scratch_memory_fn_t handle_scratch_memory;
|
||||
memory_copy_fn_t handle_memory_copy;
|
||||
#if(ROCPROFILER_VERSION >= 600)
|
||||
memory_allocate_fn_t handle_memory_allocate;
|
||||
@@ -134,6 +141,11 @@ struct processor_view_t
|
||||
m_vtable->handle_kernel_dispatch(m_object, sample);
|
||||
}
|
||||
|
||||
/// Dispatches a scratch-memory sample to the wrapped object through the
/// handle_scratch_memory entry of the vtable.
ROCPROFSYS_INLINE void handle(const scratch_memory_sample& sample) const noexcept
{
    const auto& _entry = m_vtable->handle_scratch_memory;
    _entry(m_object, sample);
}
|
||||
|
||||
ROCPROFSYS_INLINE void handle(const memory_copy_sample& sample) const noexcept
|
||||
{
|
||||
m_vtable->handle_memory_copy(m_object, sample);
|
||||
@@ -194,6 +206,9 @@ private:
|
||||
+[](void* obj, const kernel_dispatch_sample& sample) noexcept {
|
||||
static_cast<T*>(obj)->handle(sample);
|
||||
},
|
||||
+[](void* obj, const scratch_memory_sample& sample) noexcept {
|
||||
static_cast<T*>(obj)->handle(sample);
|
||||
},
|
||||
+[](void* obj, const memory_copy_sample& sample) noexcept {
|
||||
static_cast<T*>(obj)->handle(sample);
|
||||
},
|
||||
@@ -275,6 +290,9 @@ struct sample_processor_t
|
||||
case type_identifier_t::kernel_dispatch:
|
||||
handle_sample(static_cast<const kernel_dispatch_sample&>(sample));
|
||||
break;
|
||||
case type_identifier_t::scratch_memory:
|
||||
handle_sample(static_cast<const scratch_memory_sample&>(sample));
|
||||
break;
|
||||
case type_identifier_t::memory_copy:
|
||||
handle_sample(static_cast<const memory_copy_sample&>(sample));
|
||||
break;
|
||||
|
||||
@@ -46,6 +46,7 @@ enum class type_identifier_t : uint32_t
|
||||
amd_smi_sample = 0x0006,
|
||||
cpu_freq_sample = 0x0007,
|
||||
backtrace_region_sample = 0x0008,
|
||||
scratch_memory = 0x0009,
|
||||
fragmented_space = 0xFFFF
|
||||
};
|
||||
|
||||
@@ -148,6 +149,83 @@ get_size(const kernel_dispatch_sample& item)
|
||||
item.grid_size_z, static_cast<uint64_t>(item.stream_handle));
|
||||
}
|
||||
|
||||
struct scratch_memory_sample : cacheable_t
|
||||
{
|
||||
static constexpr type_identifier_t type_identifier =
|
||||
type_identifier_t::scratch_memory;
|
||||
|
||||
scratch_memory_sample() = default;
|
||||
scratch_memory_sample(uint64_t _start_timestamp, uint64_t _end_timestamp,
|
||||
uint64_t _thread_id, uint64_t _agent_id_handle,
|
||||
uint64_t _queue_id_handle, int32_t _kind, int32_t _operation,
|
||||
int32_t _flags, uint64_t _allocation_size,
|
||||
uint64_t _correlation_id_internal,
|
||||
uint64_t _correlation_id_ancestor, size_t _stream_handle)
|
||||
: start_timestamp(_start_timestamp)
|
||||
, end_timestamp(_end_timestamp)
|
||||
, thread_id(_thread_id)
|
||||
, agent_id_handle(_agent_id_handle)
|
||||
, queue_id_handle(_queue_id_handle)
|
||||
, kind(_kind)
|
||||
, operation(_operation)
|
||||
, flags(_flags)
|
||||
, allocation_size(_allocation_size)
|
||||
, correlation_id_internal(_correlation_id_internal)
|
||||
, correlation_id_ancestor(_correlation_id_ancestor)
|
||||
, stream_handle(_stream_handle)
|
||||
{}
|
||||
|
||||
uint64_t start_timestamp;
|
||||
uint64_t end_timestamp;
|
||||
uint64_t thread_id;
|
||||
uint64_t agent_id_handle;
|
||||
uint64_t queue_id_handle;
|
||||
int32_t kind;
|
||||
int32_t operation;
|
||||
int32_t flags;
|
||||
uint64_t allocation_size;
|
||||
uint64_t correlation_id_internal;
|
||||
uint64_t correlation_id_ancestor;
|
||||
size_t stream_handle;
|
||||
};
|
||||
|
||||
template <>
|
||||
inline void
|
||||
serialize(uint8_t* buffer, const scratch_memory_sample& item)
|
||||
{
|
||||
utility::store_value(buffer, item.start_timestamp, item.end_timestamp, item.thread_id,
|
||||
item.agent_id_handle, item.queue_id_handle, item.kind,
|
||||
item.operation, item.flags, item.allocation_size,
|
||||
item.correlation_id_internal, item.correlation_id_ancestor,
|
||||
static_cast<uint64_t>(item.stream_handle));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline scratch_memory_sample
|
||||
deserialize(uint8_t*& buffer)
|
||||
{
|
||||
scratch_memory_sample item;
|
||||
uint64_t stream_handle;
|
||||
utility::parse_value(buffer, item.start_timestamp, item.end_timestamp, item.thread_id,
|
||||
item.agent_id_handle, item.queue_id_handle, item.kind,
|
||||
item.operation, item.flags, item.allocation_size,
|
||||
item.correlation_id_internal, item.correlation_id_ancestor,
|
||||
stream_handle);
|
||||
item.stream_handle = stream_handle;
|
||||
return item;
|
||||
}
|
||||
|
||||
template <>
|
||||
inline size_t
|
||||
get_size(const scratch_memory_sample& item)
|
||||
{
|
||||
return utility::get_size(item.start_timestamp, item.end_timestamp, item.thread_id,
|
||||
item.agent_id_handle, item.queue_id_handle, item.kind,
|
||||
item.operation, item.flags, item.allocation_size,
|
||||
item.correlation_id_internal, item.correlation_id_ancestor,
|
||||
static_cast<uint64_t>(item.stream_handle));
|
||||
}
|
||||
|
||||
struct memory_copy_sample : cacheable_t
|
||||
{
|
||||
static constexpr type_identifier_t type_identifier = type_identifier_t::memory_copy;
|
||||
|
||||
@@ -567,6 +567,18 @@ get_mem_alloc_address(
|
||||
}
|
||||
#endif
|
||||
|
||||
uint64_t
|
||||
get_scratch_mem_alloc_size(
|
||||
[[maybe_unused]] const rocprofiler_buffer_tracing_scratch_memory_record_t& record)
|
||||
{
|
||||
// Scratch memory samples from SDK versions prior to 7.0.2 do not include allocation_size
|
||||
#if(ROCPROFSYS_USE_ROCM > 0 && ROCPROFSYS_ROCM_VERSION >= 70002)
|
||||
return record.allocation_size;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
cache_region(const rocprofiler_callback_tracing_record_t* record,
|
||||
const rocprofiler_timestamp_t start_timestamp,
|
||||
@@ -615,13 +627,26 @@ cache_kernel_dispatch(rocprofiler_buffer_tracing_kernel_dispatch_record_t* recor
|
||||
record->dispatch_info.grid_size.z, stream_handle });
|
||||
}
|
||||
|
||||
void
|
||||
cache_scratch_memory(rocprofiler_buffer_tracing_scratch_memory_record_t* record,
|
||||
uint64_t stream_handle)
|
||||
{
|
||||
trace_cache::get_metadata_registry().add_stream(stream_handle);
|
||||
trace_cache::get_buffer_storage().store(trace_cache::scratch_memory_sample{
|
||||
record->start_timestamp, record->end_timestamp, record->thread_id,
|
||||
record->agent_id.handle, record->queue_id.handle,
|
||||
static_cast<int32_t>(record->kind), static_cast<int32_t>(record->operation),
|
||||
static_cast<int32_t>(record->flags), get_scratch_mem_alloc_size(*record),
|
||||
record->correlation_id.internal, get_parent_stack_id(record->correlation_id),
|
||||
stream_handle });
|
||||
}
|
||||
|
||||
void
|
||||
cache_memory_copy(rocprofiler_buffer_tracing_memory_copy_record_t* record,
|
||||
uint64_t stream_handle)
|
||||
{
|
||||
trace_cache::get_metadata_registry().add_stream(stream_handle);
|
||||
trace_cache::get_buffer_storage().store(trace_cache::memory_copy_sample{
|
||||
|
||||
record->start_timestamp, record->end_timestamp, record->thread_id,
|
||||
record->dst_agent_id.handle, record->src_agent_id.handle,
|
||||
static_cast<int32_t>(record->kind), static_cast<int32_t>(record->operation),
|
||||
@@ -1759,6 +1784,120 @@ tool_tracing_buffered(rocprofiler_context_id_t /*context*/,
|
||||
}
|
||||
}
|
||||
}
|
||||
else if(header->kind == ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY)
|
||||
{
|
||||
auto* record =
|
||||
static_cast<rocprofiler_buffer_tracing_scratch_memory_record_t*>(
|
||||
header->payload);
|
||||
|
||||
bool _group_by_queue = _default_group_by_queue;
|
||||
|
||||
const auto* agent = tool_data->get_gpu_tool_agent(record->agent_id);
|
||||
auto device_id = static_cast<uint32_t>(agent->device_id);
|
||||
|
||||
const auto& t_info = thread_info::get(record->thread_id, SystemTID);
|
||||
auto thread_id_sequent = t_info->index_data->sequent_value;
|
||||
|
||||
auto _corr_id = record->correlation_id.internal;
|
||||
auto _beg_ns = record->start_timestamp;
|
||||
auto _end_ns = record->end_timestamp;
|
||||
auto _name =
|
||||
tool_data->buffered_tracing_info.at(record->kind, record->operation);
|
||||
|
||||
auto _stream_id = get_stream_id(record).handle;
|
||||
if(_stream_id == 0)
|
||||
{
|
||||
// Scratch memory event is not associated with a HIP stream
|
||||
_group_by_queue = true;
|
||||
}
|
||||
|
||||
{
|
||||
auto track_name = JOIN("", "GPU Scratch Memory [", device_id,
|
||||
"] Thread ", record->thread_id);
|
||||
cache_category<category::rocm_scratch_memory>();
|
||||
cache_add_thread_info(record->thread_id);
|
||||
cache_add_track(track_name.c_str(), record->thread_id);
|
||||
cache_scratch_memory(record, _stream_id);
|
||||
}
|
||||
|
||||
if(get_use_timemory())
|
||||
{
|
||||
auto _bundle = kernel_dispatch_bundle_t{ _name };
|
||||
|
||||
_bundle.push(thread_id_sequent).start().stop();
|
||||
_bundle.get([_beg_ns, _end_ns](tim::component::wall_clock* _wc) {
|
||||
_wc->set_value(_end_ns - _beg_ns);
|
||||
_wc->set_accum(_end_ns - _beg_ns);
|
||||
});
|
||||
_bundle.pop();
|
||||
}
|
||||
|
||||
if(get_use_perfetto())
|
||||
{
|
||||
// Scratch memory samples from SDK versions prior to 7.0.2 do not include
|
||||
// allocation_size field, so counter tracks are not needed
|
||||
#if(ROCPROFSYS_USE_ROCM > 0 && ROCPROFSYS_ROCM_VERSION >= 70002)
|
||||
using counter_track = perfetto_counter_track<
|
||||
rocprofiler_buffer_tracing_scratch_memory_record_t>;
|
||||
|
||||
if(!counter_track::exists(device_id))
|
||||
{
|
||||
auto track_name_alloc_size =
|
||||
JOIN("", "GPU Scratch Memory [", device_id, "] (S) Thread ",
|
||||
thread_id_sequent);
|
||||
counter_track::emplace(device_id, track_name_alloc_size, "bytes");
|
||||
}
|
||||
|
||||
if(record->operation == ROCPROFILER_SCRATCH_MEMORY_ALLOC)
|
||||
{
|
||||
TRACE_COUNTER("rocm_scratch_memory",
|
||||
counter_track::at(device_id, 0), _beg_ns,
|
||||
record->allocation_size);
|
||||
}
|
||||
#endif
|
||||
auto add_perfetto_annotations = [&](::perfetto::EventContext ctx) {
|
||||
if(config::get_perfetto_annotations())
|
||||
{
|
||||
tracing::add_perfetto_annotation(ctx, "begin_ns", _beg_ns);
|
||||
tracing::add_perfetto_annotation(ctx, "end_ns", _end_ns);
|
||||
tracing::add_perfetto_annotation(ctx, "corr_id", _corr_id);
|
||||
tracing::add_perfetto_annotation(ctx, "stream_id",
|
||||
_stream_id);
|
||||
}
|
||||
};
|
||||
|
||||
if(_group_by_queue)
|
||||
{
|
||||
auto track_name_events = [&]() {
|
||||
return JOIN("", "GPU Scratch Memory (S) Events Thread ",
|
||||
thread_id_sequent);
|
||||
};
|
||||
const auto _track = tracing::get_perfetto_track(
|
||||
category::rocm_scratch_memory{}, track_name_events);
|
||||
|
||||
tracing::push_perfetto(category::rocm_scratch_memory{},
|
||||
_name.data(), _track, _beg_ns,
|
||||
::perfetto::Flow::ProcessScoped(_corr_id),
|
||||
add_perfetto_annotations);
|
||||
|
||||
tracing::pop_perfetto(category::rocm_scratch_memory{}, "", _track,
|
||||
_end_ns);
|
||||
}
|
||||
else
|
||||
{
|
||||
const auto _track = tracing::get_perfetto_track(
|
||||
category::rocm_hip_stream{}, _track_desc_stream, _stream_id);
|
||||
|
||||
tracing::push_perfetto(category::rocm_hip_stream{}, _name.data(),
|
||||
_track, _beg_ns,
|
||||
::perfetto::Flow::ProcessScoped(_corr_id),
|
||||
add_perfetto_annotations);
|
||||
|
||||
tracing::pop_perfetto(category::rocm_hip_stream{}, "", _track,
|
||||
_end_ns);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if(header->kind == ROCPROFILER_BUFFER_TRACING_MEMORY_COPY)
|
||||
{
|
||||
auto* record =
|
||||
@@ -2249,6 +2388,17 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* user_data)
|
||||
_data->primary_ctx, ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, nullptr, 0,
|
||||
_data->memory_copy_buffer));
|
||||
}
|
||||
if(_buffered_domain.count(ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY) > 0)
|
||||
{
|
||||
ROCPROFILER_CALL(rocprofiler_create_buffer(
|
||||
_data->primary_ctx, buffer_size, watermark,
|
||||
ROCPROFILER_BUFFER_POLICY_LOSSLESS, tool_tracing_buffered, tool_data,
|
||||
&_data->scratch_memory_buffer));
|
||||
|
||||
ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service(
|
||||
_data->primary_ctx, ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY, nullptr, 0,
|
||||
_data->scratch_memory_buffer));
|
||||
}
|
||||
|
||||
#if(ROCPROFILER_VERSION >= 600)
|
||||
if(_buffered_domain.count(ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION) > 0)
|
||||
|
||||
@@ -122,7 +122,7 @@ using backtrace_operation_map_t =
|
||||
|
||||
struct client_data
|
||||
{
|
||||
static constexpr size_t num_buffers = 4;
|
||||
static constexpr size_t num_buffers = 5;
|
||||
static constexpr size_t num_contexts = 2;
|
||||
|
||||
using buffer_name_info_t = rocprofiler::sdk::buffer_name_info_t<std::string_view>;
|
||||
@@ -138,6 +138,7 @@ struct client_data
|
||||
rocprofiler_context_id_t primary_ctx = { 0 };
|
||||
rocprofiler_context_id_t counter_ctx = { 0 };
|
||||
rocprofiler_buffer_id_t kernel_dispatch_buffer = { 0 };
|
||||
rocprofiler_buffer_id_t scratch_memory_buffer = { 0 };
|
||||
rocprofiler_buffer_id_t memory_copy_buffer = { 0 };
|
||||
rocprofiler_buffer_id_t memory_alloc_buffer = { 0 };
|
||||
rocprofiler_buffer_id_t counter_collection_buffer = { 0 };
|
||||
@@ -179,12 +180,9 @@ client_data::get_contexts() const
|
||||
/// Returns the ids of all buffers held by this client_data instance.
///
/// The list has one entry per buffer member (kernel dispatch, scratch
/// memory, memory copy, memory allocation, counter collection). The earlier
/// four-entry return that left out scratch_memory_buffer is removed, so the
/// scratch-memory buffer is part of the result.
inline client_data::buffer_id_vec_t
client_data::get_buffers() const
{
    return buffer_id_vec_t{ kernel_dispatch_buffer, scratch_memory_buffer,
                            memory_copy_buffer, memory_alloc_buffer,
                            counter_collection_buffer };
}
|
||||
|
||||
inline const rocprofsys_agent_t*
|
||||
|
||||
새 이슈에서 참조
사용자 차단