Files
rocm-systems/plugin/perfetto/perfetto.cpp
T
Ammar ELWazir 8032adb64f Adding rocprofilerv2
Change-Id: Ic0cc280ba207d2b8f6ccae1cd4ac3184152fc1ad
2023-03-09 13:20:33 +00:00

805 строки
37 KiB
C++

/* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#include "rocprofiler.h"
#include <cassert>
#include <condition_variable>
#include <cstdint>
#include <cstdlib>
#include <experimental/filesystem>
#include <fstream>
#include <memory>
#include <optional>
#include <ostream>
#include <sstream>
#include <string>
#include <functional>
#include <iostream>
#include <cxxabi.h>
#include <fcntl.h>
#include <stdarg.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include <systemd/sd-id128.h>
#include "perfetto_sdk/sdk/perfetto.h"
#include "rocprofiler_plugin.h"
#include "../utils.h"
#define STREAM_CONSTANT 98736677
#define QUEUE_CONSTANT 18746479
namespace fs = std::experimental::filesystem;
PERFETTO_DEFINE_CATEGORIES(
perfetto::Category("GENERIC").SetDescription("GENERAL_CATEGORY"),
perfetto::Category("ROCTX_API").SetDescription("ACTIVITY_DOMAIN_ROCTX_API"),
perfetto::Category("HSA_API").SetDescription("ACTIVITY_DOMAIN_HSA_API"),
perfetto::Category("HIP_API").SetDescription("ACTIVITY_DOMAIN_HIP_API"),
perfetto::Category("External_API").SetDescription("ACTIVITY_DOMAIN_EXT_API"),
perfetto::Category("HIP_OPS").SetDescription("ACTIVITY_DOMAIN_HIP_OPS"),
perfetto::Category("HSA_OPS").SetDescription("ACTIVITY_DOMAIN_HSA_OPS"),
perfetto::Category("KERNELS").SetDescription("KERNEL_DISPATCHES"),
perfetto::Category("COUNTERS").SetDescription("PERFORMANCE_COUNTERS"));
PERFETTO_TRACK_EVENT_STATIC_STORAGE();
namespace {
std::string process_name;
static std::string output_file_name;
std::string get_kernel_name(rocprofiler_record_profiler_t& profiler_record) {
std::string kernel_name = "";
size_t name_length = 1;
CHECK_ROCMTOOLS(rocprofiler_query_kernel_info_size(ROCPROFILER_KERNEL_NAME, profiler_record.kernel_id,
&name_length));
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#pragma GCC diagnostic ignored "-Wstringop-overread"
if (name_length > 1) {
const char* kernel_name_c = static_cast<const char*>(malloc(name_length * sizeof(char)));
CHECK_ROCMTOOLS(rocprofiler_query_kernel_info(ROCPROFILER_KERNEL_NAME, profiler_record.kernel_id,
&kernel_name_c));
if (kernel_name_c && strlen(kernel_name_c) > 1)
kernel_name = rocmtools::cxx_demangle(strdup(kernel_name_c));
}
#pragma GCC diagnostic pop
return kernel_name;
}
class perfetto_plugin_t {
public:
perfetto_plugin_t() {
const char* output_dir = getenv("OUTPUT_PATH");
const char* temp_file_name = getenv("OUT_FILE_NAME");
output_file_name = temp_file_name ? std::string(temp_file_name) + "_" : "";
if (output_dir == nullptr) {
stream_.copyfmt(std::cout);
stream_.clear(std::cout.rdstate());
stream_.basic_ios<char>::rdbuf(std::cout.rdbuf());
return;
}
output_prefix_ = output_dir;
if (!fs::is_directory(fs::status(output_prefix_))) {
if (!stream_.fail()) rocmtools::warning("Cannot open output directory '%s'", output_dir);
stream_.setstate(std::ios_base::failbit);
return;
}
perfetto::TracingInitArgs args;
args.backends |= perfetto::kInProcessBackend;
perfetto::Tracing::Initialize(args);
perfetto::TrackEvent::Register();
perfetto::protos::gen::TrackEventConfig track_event_cfg;
track_event_cfg.add_disabled_categories("*");
track_event_cfg.add_enabled_categories("GENERIC");
track_event_cfg.add_enabled_categories("ROCTX_API");
track_event_cfg.add_enabled_categories("HSA_API");
track_event_cfg.add_enabled_categories("HIP_API");
track_event_cfg.add_enabled_categories("External_API");
track_event_cfg.add_enabled_categories("HIP_OPS");
track_event_cfg.add_enabled_categories("HSA_OPS");
track_event_cfg.add_enabled_categories("KERNELS");
track_event_cfg.add_enabled_categories("COUNTERS");
perfetto::TraceConfig trace_cfg;
auto buffer_cfg = trace_cfg.add_buffers();
uint32_t max_buffer_size = 10 * 1024 * 1024; // Default max buffer size is 10 GB
const char* max_buffer_size_str = getenv("rocprofiler_PERFETTO_MAX_BUFFER_SIZE_KIB");
if (max_buffer_size_str && std::atol(max_buffer_size_str) > 0)
max_buffer_size = std::atol(max_buffer_size_str);
// Record up to max buffer size determined by user or the 10 GB (default value)
buffer_cfg->set_size_kb(max_buffer_size);
auto* data_source_cfg = trace_cfg.add_data_sources()->mutable_config();
data_source_cfg->set_name("track_event");
data_source_cfg->set_track_event_config_raw(track_event_cfg.SerializeAsString());
output_prefix_.append(output_file_name + std::to_string(GetPid()) + "_output.pftrace");
file_descriptor_ = open(output_prefix_.string().c_str(), O_RDWR | O_CREAT | O_TRUNC, 0600);
if (file_descriptor_ == -1) rocmtools::warning("Can't open output file\n");
tracing_session_ = perfetto::Tracing::NewTrace();
tracing_session_->Setup(trace_cfg, file_descriptor_);
tracing_session_->StartBlocking();
hostname_[1023] = '\0';
gethostname(hostname_, 1023);
sd_id128_t ret;
char machine_id[SD_ID128_STRING_MAX];
[[maybe_unused]] int status = sd_id128_get_machine(&ret);
assert(status == 0 && "Error: Couldn't get machine id!");
if (sd_id128_to_string(ret, machine_id)) machine_id_ = std::hash<std::string>{}(machine_id);
{
std::lock_guard<std::mutex> lock(thread_tracks_lock_);
process_name =
perfetto::ProcessTrack::Current().Serialize().mutable_process()->process_name();
auto process_track_desc = perfetto::ProcessTrack::Current().Serialize();
uint64_t track_id =
track_counter_.fetch_add((1 + machine_id_) * GetPid(), std::memory_order_acquire);
for (uint64_t tid : track_ids_used_) {
while (track_id == tid) {
track_id =
track_counter_.fetch_add((1 + machine_id_) * GetPid(), std::memory_order_acquire);
}
}
std::string thread_track_str =
rocmtools::string_printf("Node: %s Process ID: %lu Thread ID:", hostname_, GetPid());
process_track_desc.mutable_process()->set_process_name(thread_track_str);
perfetto::TrackEvent::SetTrackDescriptor(perfetto::ProcessTrack::Current(),
process_track_desc);
perfetto::ProcessTrack::Current().Serialize().set_uuid(track_id);
thread_tracks_.emplace(GetPid(), perfetto::ProcessTrack::Current());
}
is_valid_ = true;
}
~perfetto_plugin_t() {
if (is_valid_) {
tracing_session_->StopBlocking();
close(file_descriptor_);
}
}
const char* GetDomainName(rocprofiler_tracer_activity_domain_t domain) {
switch (domain) {
case ACTIVITY_DOMAIN_ROCTX:
return "ROCTX_DOMAIN";
break;
case ACTIVITY_DOMAIN_HIP_API:
return "HIP_API_DOMAIN";
break;
case ACTIVITY_DOMAIN_HIP_OPS:
return "HIP_OPS_DOMAIN";
break;
case ACTIVITY_DOMAIN_HSA_API:
return "HSA_API_DOMAIN";
break;
case ACTIVITY_DOMAIN_HSA_OPS:
return "HSA_OPS_DOMAIN";
break;
case ACTIVITY_DOMAIN_HSA_EVT:
return "HSA_EVT_DOMAIN";
break;
default:
return "";
}
}
std::mutex writing_lock;
int FlushProfilerRecord(rocprofiler_record_profiler_t profiler_record,
rocprofiler_session_id_t session_id) {
std::lock_guard<std::mutex> lock(writing_lock);
// ToDO: rename this variable?
if (!tracing_session_) rocmtools::warning("Tracing session is deleted!\n");
int device_id = profiler_record.gpu_id.handle;
std::unordered_map<int, perfetto::Track>::iterator device_track_it;
{
std::lock_guard<std::mutex> lock(device_tracks_lock_);
device_track_it = device_tracks_.find(device_id);
if (device_track_it == device_tracks_.end()) {
/* Create a new perfetto::Track (Sub-Track) */
device_track_it =
device_tracks_
.emplace(device_id, perfetto::ProcessTrack::Global(((device_id + 1) * machine_id_)))
.first;
auto gpu_desc = device_track_it->second.Serialize();
gpu_desc.mutable_process()->set_pid(device_id);
std::string gpu_str = rocmtools::string_printf("Node: %s Device:", hostname_);
gpu_desc.mutable_process()->set_process_name(gpu_str);
perfetto::TrackEvent::SetTrackDescriptor(device_track_it->second, gpu_desc);
track_ids_used_.emplace_back(device_id + 1 + machine_id_);
}
}
auto& gpu_track = device_track_it->second;
std::pair<int, uint64_t> gpu_queue_id =
std::make_pair(device_id, profiler_record.queue_id.handle);
auto queue_track_it = queue_tracks_.find(gpu_queue_id.first);
{
std::lock_guard<std::mutex> lock(queue_tracks_lock_);
queue_track_it = queue_tracks_.find(gpu_queue_id.first);
if (queue_track_it == queue_tracks_.end()) {
/* Create a new perfetto::Track */
queue_track_it = queue_tracks_
.emplace(gpu_queue_id.first,
perfetto::Track((profiler_record.queue_id.handle + 1 +
profiler_record.gpu_id.handle) *
QUEUE_CONSTANT * machine_id_ * GetPid(),
gpu_track))
.first;
auto queue_desc = queue_track_it->second.Serialize();
std::string queue_str =
rocmtools::string_printf("Process ID: %lu Queue %ld", GetPid(), gpu_queue_id.second);
queue_desc.set_name(queue_str);
perfetto::TrackEvent::SetTrackDescriptor(queue_track_it->second, queue_desc);
}
track_ids_used_.emplace_back(profiler_record.queue_id.handle + machine_id_ + 1 +
profiler_record.gpu_id.handle);
}
auto& queue_track = queue_track_it->second;
// Taken from rocprofiler: The size hasn't changed in recent past
static const uint32_t lds_block_size = 128 * 4;
std::string full_kernel_name = get_kernel_name(profiler_record);
// std::string truncated_kernel_name = rocmtools::truncate_name(full_kernel_name);
// perfetto::StaticString kernel_name(truncated_kernel_name.c_str());
TRACE_EVENT_BEGIN("KERNELS", perfetto::StaticString(full_kernel_name.c_str()), queue_track,
profiler_record.timestamps.begin.value, "Full Kernel Name",
full_kernel_name.c_str(), "Agent ID", device_id, "Queue ID",
profiler_record.queue_id.handle, "GRD",
profiler_record.kernel_properties.grid_size, "WGR",
profiler_record.kernel_properties.workgroup_size, "LDS",
(((profiler_record.kernel_properties.lds_size + (lds_block_size - 1)) &
~(lds_block_size - 1))),
"SCR", profiler_record.kernel_properties.scratch_size, "Arch. VGPR",
profiler_record.kernel_properties.arch_vgpr_count, "Accumilative Vgpr",
profiler_record.kernel_properties.accum_vgpr_count, "SGPR",
profiler_record.kernel_properties.sgpr_count, "Wave Size",
profiler_record.kernel_properties.wave_size, "Signal",
profiler_record.kernel_properties.signal_handle);
TRACE_EVENT_END("KERNELS", queue_track, profiler_record.timestamps.end.value);
auto get_counter_track_fn = [&](std::string counter_name) {
std ::string counter_track_id =
std::to_string(machine_id_) + std::to_string(GetPid()) + counter_name;
std::pair<int, std::string> gpu_counter_track_id = std::make_pair(device_id, counter_name);
std::unordered_map<std::string, perfetto::CounterTrack>::iterator counters_track_it;
{
std::lock_guard<std::mutex> lock(counter_tracks_lock_);
counters_track_it = counter_tracks_.find(gpu_counter_track_id.second);
if (counters_track_it == counter_tracks_.end()) {
/* Create a new perfetto::Track */
counters_track_it =
counter_tracks_
.emplace(gpu_counter_track_id.second,
perfetto::CounterTrack(counter_track_id.c_str(), gpu_track))
.first;
auto counter_track_desc = counters_track_it->second.Serialize();
std::string counter_track_str = "Process ID " + std::to_string(GetPid()) + " - Counter " +
gpu_counter_track_id.second;
counter_track_desc.set_name(counter_track_str);
perfetto::TrackEvent::SetTrackDescriptor(counters_track_it->second, counter_track_desc);
}
}
return counters_track_it->second;
};
// For Counters
if (profiler_record.counters) {
for (uint64_t i = 0; i < profiler_record.counters_count.value; i++) {
if (profiler_record.counters[i].counter_handler.handle > 0) {
size_t name_length = 0;
CHECK_ROCMTOOLS(rocprofiler_query_counter_info_size(
session_id, ROCPROFILER_COUNTER_NAME, profiler_record.counters[i].counter_handler,
&name_length));
if (name_length > 1) {
const char* name_c = static_cast<const char*>(malloc(name_length * sizeof(char)));
CHECK_ROCMTOOLS(
rocprofiler_query_counter_info(session_id, ROCPROFILER_COUNTER_NAME,
profiler_record.counters[i].counter_handler, &name_c));
perfetto::CounterTrack counters_track = get_counter_track_fn(std::string(name_c));
TRACE_COUNTER("COUNTERS", counters_track, profiler_record.timestamps.begin.value,
profiler_record.counters[i].value.value);
// Added an extra zero event for maintaining start-end of the counter
TRACE_COUNTER("COUNTERS", counters_track, profiler_record.timestamps.end.value, 0.001);
}
}
}
}
return 0;
}
int FlushTracerRecord(rocprofiler_record_tracer_t tracer_record,
rocprofiler_session_id_t session_id) {
std::lock_guard<std::mutex> lock(writing_lock);
if (!tracing_session_) rocmtools::warning("Tracing session is deleted!\n");
std::string kernel_name;
char* function_name;
char* activity_name;
std::string roctx_message;
uint64_t roctx_id = 0;
uint64_t thread_id = tracer_record.thread_id.value;
std::unordered_map<uint64_t, perfetto::Track>::iterator thread_track_it;
std::unordered_map<int, perfetto::Track>::iterator device_track_it;
if (tracer_record.domain == ACTIVITY_DOMAIN_HIP_OPS ||
tracer_record.domain == ACTIVITY_DOMAIN_HSA_OPS) {
int device_id = tracer_record.agent_id.handle;
if (tracer_record.domain == ACTIVITY_DOMAIN_HIP_OPS && device_id > 0) device_id--;
{
std::lock_guard<std::mutex> lock(device_tracks_lock_);
device_track_it = device_tracks_.find(device_id);
if (device_track_it == device_tracks_.end()) {
/* Create a new perfetto::Track (Sub-Track) */
device_track_it =
device_tracks_
.emplace(device_id,
perfetto::ProcessTrack::Global(((device_id + 1) * machine_id_)))
.first;
auto gpu_desc = device_track_it->second.Serialize();
gpu_desc.mutable_process()->set_pid(device_id);
std::string gpu_str = rocmtools::string_printf("Node: %s Device:", hostname_);
gpu_desc.mutable_process()->set_process_name(gpu_str);
perfetto::TrackEvent::SetTrackDescriptor(device_track_it->second, gpu_desc);
track_ids_used_.emplace_back(1 + machine_id_ + device_id);
}
}
} else {
std::lock_guard<std::mutex> lock(thread_tracks_lock_);
thread_track_it = thread_tracks_.find(thread_id);
if (thread_track_it == thread_tracks_.end()) {
uint64_t track_id =
track_counter_.fetch_add((1 + machine_id_) * GetPid(), std::memory_order_acquire);
for (uint64_t tid : track_ids_used_) {
while (track_id == tid) {
track_id =
track_counter_.fetch_add((1 + machine_id_) * GetPid(), std::memory_order_acquire);
}
}
thread_track_it =
thread_tracks_.emplace(thread_id, perfetto::ProcessTrack::Global(track_id)).first;
auto thread_track_desc = thread_track_it->second.Serialize();
std::string thread_track_str =
rocmtools::string_printf("Node: %s Process ID: %lu Thread ID:", hostname_, GetPid());
thread_track_desc.mutable_process()->set_pid(thread_id);
thread_track_desc.mutable_process()->set_process_name(thread_track_str);
perfetto::TrackEvent::SetTrackDescriptor(thread_track_it->second, thread_track_desc);
}
}
auto& thread_track = thread_track_it->second;
auto& gpu_track = device_track_it->second;
switch (tracer_record.domain) {
case ACTIVITY_DOMAIN_ROCTX: {
std::unordered_map<uint64_t, perfetto::Track>::iterator roctx_track_it;
{
std::lock_guard<std::mutex> lock(roctx_tracks_lock_);
roctx_track_it = roctx_tracks_.find(thread_id);
if (roctx_track_it == roctx_tracks_.end()) {
/* Create a new perfetto::Track */
uint64_t track_id =
track_counter_.fetch_add((1 + machine_id_) * GetPid(), std::memory_order_acquire);
for (uint64_t tid : track_ids_used_) {
while (track_id == tid) {
track_id = track_counter_.fetch_add((1 + machine_id_) * GetPid(),
std::memory_order_acquire);
}
}
roctx_track_it =
roctx_tracks_.emplace(thread_id, perfetto::Track(track_id, thread_track)).first;
auto roctx_track_desc = roctx_track_it->second.Serialize();
std::string roctx_track_str = rocmtools::string_printf("ROCTX Markers");
roctx_track_desc.set_name(roctx_track_str);
perfetto::TrackEvent::SetTrackDescriptor(roctx_track_it->second, roctx_track_desc);
}
}
auto& roctx_track = roctx_track_it->second;
size_t roctx_message_size = 0;
CHECK_ROCMTOOLS(rocprofiler_query_roctx_tracer_api_data_info_size(
session_id, ROCPROFILER_ROCTX_MESSAGE, tracer_record.api_data_handle,
tracer_record.operation_id, &roctx_message_size));
if (roctx_message_size > 1) {
char* roctx_message_str = static_cast<char*>(malloc(roctx_message_size * sizeof(char)));
CHECK_ROCMTOOLS(rocprofiler_query_roctx_tracer_api_data_info(
session_id, ROCPROFILER_ROCTX_MESSAGE, tracer_record.api_data_handle,
tracer_record.operation_id, &roctx_message_str));
if (roctx_message_str)
roctx_message = rocmtools::cxx_demangle(std::string(strdup(roctx_message_str)));
}
size_t roctx_id_size = 0;
CHECK_ROCMTOOLS(rocprofiler_query_roctx_tracer_api_data_info_size(
session_id, ROCPROFILER_ROCTX_ID, tracer_record.api_data_handle,
tracer_record.operation_id, &roctx_id_size));
if (roctx_id_size > 1) {
char* roctx_id_str = static_cast<char*>(malloc(roctx_id_size * sizeof(char)));
CHECK_ROCMTOOLS(rocprofiler_query_roctx_tracer_api_data_info(
session_id, ROCPROFILER_ROCTX_ID, tracer_record.api_data_handle,
tracer_record.operation_id, &roctx_id_str));
if (roctx_id_str) {
roctx_id = std::stoll(std::string(strdup(roctx_id_str)));
free(roctx_id_str);
}
}
if (tracer_record.operation_id.id == 1) {
perfetto::StaticString roctx_message_pft(
(!roctx_message.empty() ? roctx_message.c_str() : ""));
TRACE_EVENT_BEGIN("ROCTX_API", roctx_message_pft, roctx_track,
tracer_record.timestamps.begin.value, "Timestamp(ns)",
tracer_record.timestamps.begin.value, "RocTx ID", roctx_id);
roctx_track_entries_++;
} else {
TRACE_EVENT_END("ROCTX_API", roctx_track, tracer_record.timestamps.begin.value);
roctx_track_entries_--;
}
break;
}
case ACTIVITY_DOMAIN_HSA_API: {
std::unordered_map<uint64_t, perfetto::Track>::iterator hsa_track_it;
{
std::lock_guard<std::mutex> lock(hsa_tracks_lock_);
hsa_track_it = hsa_tracks_.find(thread_id);
if (hsa_track_it == hsa_tracks_.end()) {
/* Create a new perfetto::Track */
uint64_t track_id =
track_counter_.fetch_add((1 + machine_id_) * GetPid(), std::memory_order_acquire);
for (uint64_t tid : track_ids_used_) {
while (track_id == tid) {
track_id = track_counter_.fetch_add((1 + machine_id_) * GetPid(),
std::memory_order_acquire);
}
}
hsa_track_it =
hsa_tracks_.emplace(thread_id, perfetto::Track(track_id, thread_track)).first;
auto hsa_track_desc = hsa_track_it->second.Serialize();
std::string hsa_track_str = rocmtools::string_printf("HSA API");
hsa_track_desc.set_name(hsa_track_str);
perfetto::TrackEvent::SetTrackDescriptor(hsa_track_it->second, hsa_track_desc);
}
}
auto& hsa_track = hsa_track_it->second;
size_t function_name_size = 0;
CHECK_ROCMTOOLS(rocprofiler_query_hsa_tracer_api_data_info_size(
session_id, ROCPROFILER_HSA_FUNCTION_NAME, tracer_record.api_data_handle,
tracer_record.operation_id, &function_name_size));
if (function_name_size > 1) {
function_name = static_cast<char*>(malloc(function_name_size * sizeof(char)));
CHECK_ROCMTOOLS(rocprofiler_query_hsa_tracer_api_data_info(
session_id, ROCPROFILER_HSA_FUNCTION_NAME, tracer_record.api_data_handle,
tracer_record.operation_id, &function_name));
}
TRACE_EVENT_BEGIN("HSA_API", perfetto::StaticString(function_name), hsa_track,
tracer_record.timestamps.begin.value,
perfetto::Flow::ProcessScoped(tracer_record.correlation_id.value));
TRACE_EVENT_END("HSA_API", hsa_track, tracer_record.timestamps.end.value);
break;
}
case ACTIVITY_DOMAIN_HIP_API: {
std::unordered_map<uint64_t, perfetto::Track>::iterator hip_track_it;
{
std::lock_guard<std::mutex> lock(hip_tracks_lock_);
hip_track_it = hip_tracks_.find(thread_id);
if (hip_track_it == hip_tracks_.end()) {
/* Create a new perfetto::Track */
uint64_t track_id =
track_counter_.fetch_add((1 + machine_id_) * GetPid(), std::memory_order_acquire);
for (uint64_t tid : track_ids_used_) {
while (track_id == tid) {
track_id = track_counter_.fetch_add((1 + machine_id_) * GetPid(),
std::memory_order_acquire);
}
}
hip_track_it =
hip_tracks_.emplace(thread_id, perfetto::Track(track_id, thread_track)).first;
auto hip_track_desc = hip_track_it->second.Serialize();
std::string hip_track_str = rocmtools::string_printf("HIP API");
hip_track_desc.set_name(hip_track_str);
perfetto::TrackEvent::SetTrackDescriptor(hip_track_it->second, hip_track_desc);
}
}
auto& hip_track = hip_track_it->second;
size_t function_name_size = 0;
CHECK_ROCMTOOLS(rocprofiler_query_hip_tracer_api_data_info_size(
session_id, ROCPROFILER_HIP_FUNCTION_NAME, tracer_record.api_data_handle,
tracer_record.operation_id, &function_name_size));
if (function_name_size > 1) {
function_name = static_cast<char*>(malloc(function_name_size * sizeof(char)));
CHECK_ROCMTOOLS(rocprofiler_query_hip_tracer_api_data_info(
session_id, ROCPROFILER_HIP_FUNCTION_NAME, tracer_record.api_data_handle,
tracer_record.operation_id, &function_name));
}
size_t kernel_name_size = 0;
CHECK_ROCMTOOLS(rocprofiler_query_hip_tracer_api_data_info_size(
session_id, ROCPROFILER_HIP_KERNEL_NAME, tracer_record.api_data_handle,
tracer_record.operation_id, &kernel_name_size));
char* kernel_name_str;
if (kernel_name_size > 1) {
kernel_name_str = static_cast<char*>(malloc(kernel_name_size * sizeof(char)));
CHECK_ROCMTOOLS(rocprofiler_query_hip_tracer_api_data_info(
session_id, ROCPROFILER_HIP_KERNEL_NAME, tracer_record.api_data_handle,
tracer_record.operation_id, &kernel_name_str));
if (kernel_name_str) {
kernel_name = rocmtools::cxx_demangle(std::string(kernel_name_str));
free(kernel_name_str);
}
}
if (kernel_name.size() > 0) {
TRACE_EVENT_BEGIN("HIP_API", perfetto::StaticString(function_name), hip_track,
tracer_record.timestamps.begin.value, "Kernel Name", kernel_name,
perfetto::Flow::ProcessScoped(tracer_record.correlation_id.value));
} else {
TRACE_EVENT_BEGIN("HIP_API", perfetto::StaticString(function_name), hip_track,
tracer_record.timestamps.begin.value,
perfetto::Flow::ProcessScoped(tracer_record.correlation_id.value));
}
TRACE_EVENT_END("HIP_API", hip_track, tracer_record.timestamps.end.value);
break;
}
case ACTIVITY_DOMAIN_EXT_API: {
printf("Warning: External API is not supported!\n");
break;
}
case ACTIVITY_DOMAIN_HIP_OPS: {
uint64_t stream_id = 0;
size_t stream_id_str_size = 0;
char* stream_id_str;
CHECK_ROCMTOOLS(rocprofiler_query_hip_tracer_api_data_info_size(
session_id, ROCPROFILER_HIP_STREAM_ID, rocprofiler_tracer_api_data_handle_t{nullptr, 0},
rocprofiler_tracer_operation_id_t{(uint32_t)tracer_record.correlation_id.value},
&stream_id_str_size));
if (stream_id_str_size > 1) {
stream_id_str = static_cast<char*>(malloc(stream_id_str_size * sizeof(char)));
CHECK_ROCMTOOLS(rocprofiler_query_hip_tracer_api_data_info(
session_id, ROCPROFILER_HIP_STREAM_ID, rocprofiler_tracer_api_data_handle_t{nullptr, 0},
rocprofiler_tracer_operation_id_t{(uint32_t)tracer_record.correlation_id.value},
&stream_id_str));
if (stream_id_str != nullptr) stream_id = std::stoll(stream_id_str);
}
std::unordered_map<int, perfetto::Track>::iterator stream_track_it;
{
std::lock_guard<std::mutex> lock(stream_tracks_lock_);
stream_track_it = stream_tracks_.find(stream_id);
if (stream_track_it == stream_tracks_.end()) {
/* Create a new perfetto::Track */
uint64_t track_id = ((1 + stream_id + tracer_record.agent_id.handle) * machine_id_ *
STREAM_CONSTANT * GetPid());
stream_track_it =
stream_tracks_.emplace(stream_id, perfetto::Track(track_id, gpu_track)).first;
auto stream_desc = stream_track_it->second.Serialize();
std::string stream_str =
rocmtools::string_printf("Process ID: %lu Stream %d", GetPid(), stream_id);
stream_desc.set_name(stream_str);
perfetto::TrackEvent::SetTrackDescriptor(stream_track_it->second, stream_desc);
track_ids_used_.emplace_back(1 + machine_id_ + tracer_record.agent_id.handle);
}
}
auto& stream_track = stream_track_it->second;
if (tracer_record.api_data_handle.handle && tracer_record.api_data_handle.size > 1) {
kernel_name = rocmtools::cxx_demangle(
strdup(reinterpret_cast<const char*>(tracer_record.api_data_handle.handle)));
TRACE_EVENT_BEGIN(
"HIP_OPS",
perfetto::StaticString(strdup(rocmtools::truncate_name(kernel_name).c_str())),
stream_track, tracer_record.timestamps.begin.value, "Agent ID",
tracer_record.agent_id.handle, "Process ID", GetPid(), "Kernel Name", kernel_name,
perfetto::Flow::ProcessScoped(tracer_record.correlation_id.value));
} else {
size_t activity_name_size = 0;
CHECK_ROCMTOOLS(rocprofiler_query_hip_tracer_api_data_info_size(
session_id, ROCPROFILER_HIP_ACTIVITY_NAME, tracer_record.api_data_handle,
tracer_record.operation_id, &activity_name_size));
if (activity_name_size > 1) {
activity_name = static_cast<char*>(malloc(activity_name_size * sizeof(char)));
CHECK_ROCMTOOLS(rocprofiler_query_hip_tracer_api_data_info(
session_id, ROCPROFILER_HIP_ACTIVITY_NAME, tracer_record.api_data_handle,
tracer_record.operation_id, &activity_name));
} else {
activity_name = const_cast<char*>(std::string("N/A").c_str());
}
TRACE_EVENT_BEGIN("HIP_OPS", perfetto::StaticString(activity_name), stream_track,
tracer_record.timestamps.begin.value, "Agent ID",
tracer_record.agent_id.handle, "Process ID", GetPid(),
perfetto::Flow::ProcessScoped(tracer_record.correlation_id.value));
}
TRACE_EVENT_END("HIP_OPS", stream_track, tracer_record.timestamps.end.value);
break;
}
case ACTIVITY_DOMAIN_HSA_OPS: {
std::pair<int, uint64_t> gpu_queue_id =
std::make_pair(tracer_record.agent_id.handle, tracer_record.queue_id.handle);
std::unordered_map<int, perfetto::Track>::iterator queue_track_it;
{
std::lock_guard<std::mutex> lock(queue_tracks_lock_);
queue_track_it = queue_tracks_.find(gpu_queue_id.first);
if (queue_track_it == queue_tracks_.end()) {
/* Create a new perfetto::Track */
uint64_t track_id =
((1 + tracer_record.queue_id.handle + tracer_record.agent_id.handle) * machine_id_ *
QUEUE_CONSTANT * GetPid());
queue_track_it =
queue_tracks_.emplace(gpu_queue_id.first, perfetto::Track(track_id, gpu_track))
.first;
auto queue_desc = queue_track_it->second.Serialize();
std::string queue_str = rocmtools::string_printf("Process ID: %lu Queue %ld", GetPid(),
gpu_queue_id.second);
queue_desc.set_name(queue_str);
perfetto::TrackEvent::SetTrackDescriptor(queue_track_it->second, queue_desc);
}
track_ids_used_.emplace_back(tracer_record.queue_id.handle + machine_id_ + 1 +
tracer_record.agent_id.handle);
}
auto& queue_track = queue_track_it->second;
size_t activity_name_size = 0;
CHECK_ROCMTOOLS(rocprofiler_query_hsa_tracer_api_data_info_size(
session_id, ROCPROFILER_HSA_ACTIVITY_NAME, tracer_record.api_data_handle,
tracer_record.operation_id, &activity_name_size));
if (activity_name_size > 1) {
activity_name = static_cast<char*>(malloc(activity_name_size * sizeof(char)));
CHECK_ROCMTOOLS(rocprofiler_query_hsa_tracer_api_data_info(
session_id, ROCPROFILER_HSA_ACTIVITY_NAME, tracer_record.api_data_handle,
tracer_record.operation_id, &activity_name));
}
TRACE_EVENT_BEGIN("HSA_OPS", perfetto::StaticString(activity_name), queue_track,
tracer_record.timestamps.begin.value, "Agent ID",
tracer_record.agent_id.handle, "Queue ID", tracer_record.queue_id.handle,
"Process ID", GetPid(),
perfetto::Flow::ProcessScoped(tracer_record.correlation_id.value));
TRACE_EVENT_END("HSA_OPS", queue_track, tracer_record.timestamps.end.value);
break;
}
default: {
rocmtools::warning("ignored record for domain %d", tracer_record.domain);
break;
}
}
return 0;
}
int WriteBufferRecords(const rocprofiler_record_header_t* begin,
const rocprofiler_record_header_t* end, rocprofiler_session_id_t session_id,
rocprofiler_buffer_id_t buffer_id) {
if (!tracing_session_) rocmtools::warning("Tracing session is deleted!\n");
while (begin < end) {
if (!begin) return 0;
switch (begin->kind) {
case ROCPROFILER_PROFILER_RECORD: {
rocprofiler_record_profiler_t* profiler_record = const_cast<rocprofiler_record_profiler_t*>(
reinterpret_cast<const rocprofiler_record_profiler_t*>(begin));
FlushProfilerRecord(*profiler_record, session_id);
break;
}
case ROCPROFILER_TRACER_RECORD: {
rocprofiler_record_tracer_t* tracer_record = const_cast<rocprofiler_record_tracer_t*>(
reinterpret_cast<const rocprofiler_record_tracer_t*>(begin));
FlushTracerRecord(*tracer_record, session_id);
break;
}
default:
break;
}
rocprofiler_next_record(begin, &begin, session_id, buffer_id);
}
return 0;
}
bool IsValid() const { return is_valid_; }
private:
fs::path output_prefix_;
std::unique_ptr<perfetto::TracingSession> tracing_session_;
int file_descriptor_;
bool is_valid_{false};
size_t roctx_track_entries_{0};
// Correlate stream id(s) with correlation id(s) to identify the stream id of every HIP activity
std::unordered_map<uint64_t, uint64_t> stream_ids_;
// Callback Tracks
std::unordered_map<uint64_t, perfetto::Track> thread_tracks_;
std::unordered_map<uint64_t, perfetto::Track> roctx_tracks_, hsa_tracks_, hip_tracks_,
hip_ext_tracks_;
// Activity Tracks
std::unordered_map<int, perfetto::Track> device_tracks_;
std::unordered_map<int, perfetto::Track> queue_tracks_, stream_tracks_;
std::unordered_map<std::string, perfetto::CounterTrack> counter_tracks_;
std::atomic<uint64_t> track_counter_{GetPid()};
std::vector<uint64_t> track_ids_used_;
std::mutex stream_ids_lock_, thread_tracks_lock_, roctx_tracks_lock_, hsa_tracks_lock_,
hip_tracks_lock_, hip_ext_tracks_lock_, device_tracks_lock_, queue_tracks_lock_,
stream_tracks_lock_, counter_tracks_lock_;
char hostname_[1024];
uint64_t machine_id_;
std::ofstream stream_;
};
perfetto_plugin_t* perfetto_plugin = nullptr;
} // namespace
int rocprofiler_plugin_initialize(uint32_t rocprofiler_major_version,
uint32_t rocprofiler_minor_version) {
if (rocprofiler_major_version != ROCPROFILER_VERSION_MAJOR ||
rocprofiler_minor_version > ROCPROFILER_VERSION_MINOR)
return -1;
if (perfetto_plugin != nullptr) return -1;
perfetto_plugin = new perfetto_plugin_t();
if (perfetto_plugin->IsValid()) return 0;
delete perfetto_plugin;
perfetto_plugin = nullptr;
return -1;
}
void rocprofiler_plugin_finalize() {
if (!perfetto_plugin) return;
delete perfetto_plugin;
perfetto_plugin = nullptr;
}
ROCPROFILER_EXPORT int rocprofiler_plugin_write_buffer_records(const rocprofiler_record_header_t* begin,
const rocprofiler_record_header_t* end,
rocprofiler_session_id_t session_id,
rocprofiler_buffer_id_t buffer_id) {
if (!perfetto_plugin || !perfetto_plugin->IsValid()) return -1;
return perfetto_plugin->WriteBufferRecords(begin, end, session_id, buffer_id);
}
ROCPROFILER_EXPORT int rocprofiler_plugin_write_record(rocprofiler_record_tracer_t record,
rocprofiler_session_id_t session_id) {
if (!perfetto_plugin || !perfetto_plugin->IsValid()) return -1;
if (record.header.id.handle == 0) return 0;
perfetto_plugin->FlushTracerRecord(record, session_id);
return 0;
}