Files
rocm-systems/projects/rocprofiler/src/core/rocprofiler.cpp
T
Xianwei Zhang bc27ae1adf concurrent: improve concurrent profiling
This patch adds barrier packets, together with extra signals,
to enforce the completion order of read packets w.r.t dispatch.
And, PmcStopper is added to stop the profiling finally.

Change-Id: I8e8d3a41d86e42be1d9e5afd44c247be876cf1a5


[ROCm/rocprofiler commit: e26210d9d9]
2020-08-05 18:20:14 -04:00

986 строки
38 KiB
C++

/******************************************************************************
Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "inc/rocprofiler.h"
#include <hsa.h>
#include <string.h>
#include <sstream>
#include <vector>
#include "core/context.h"
#include "core/context_pool.h"
#include "core/hsa_queue.h"
#include "core/hsa_interceptor.h"
#include "core/intercept_queue.h"
#include "core/proxy_queue.h"
#include "core/simple_proxy_queue.h"
#include "util/exception.h"
#include "util/hsa_rsrc_factory.h"
#include "util/logger.h"
// Compiler attributes (GCC/Clang syntax) used for exported symbols and for
// the library load/unload hooks defined further down in this file.
#define PUBLIC_API __attribute__((visibility("default")))
#define CONSTRUCTOR_API __attribute__((constructor))
#define DESTRUCTOR_API __attribute__((destructor))
// Opens the body of a public API method: declares the local 'status' result
// (initially HSA_STATUS_SUCCESS) and starts a try block.
// Must be closed with API_METHOD_SUFFIX.
#define API_METHOD_PREFIX \
hsa_status_t status = HSA_STATUS_SUCCESS; \
try {
// Closes the try block opened by API_METHOD_PREFIX: a std::exception is
// logged, converted to a status code with rocprofiler::GetExcStatus(), and
// 'status' is returned to the caller.
#define API_METHOD_SUFFIX \
} \
catch (std::exception & e) { \
ERR_LOGGING(__FUNCTION__ << "(), " << e.what()); \
status = rocprofiler::GetExcStatus(e); \
} \
return status;
// Prints a load/unload trace line to stdout when the ROCP_ONLOAD_TRACE
// environment variable is set. 'str' may be any expression chain that can be
// streamed with operator<<.
// The whole body is wrapped in do { ... } while (0) without a trailing
// semicolon so that the macro behaves like a single statement: it can be used
// as the body of an if/else without capturing the 'else' and without leaving
// an empty statement behind.
#define ONLOAD_TRACE(str)                                                              \
  do {                                                                                 \
    if (getenv("ROCP_ONLOAD_TRACE")) {                                                 \
      std::cout << "PID(" << GetPid() << "): PROF_LIB::" << __FUNCTION__ << " " << str \
                << std::endl << std::flush;                                            \
    }                                                                                  \
  } while (0)
#define ONLOAD_TRACE_BEG() ONLOAD_TRACE("begin")
#define ONLOAD_TRACE_END() ONLOAD_TRACE("end")
// Returns the ID of the calling process, obtained with the raw getpid system call
static inline uint32_t GetPid() { return syscall(__NR_getpid); }
///////////////////////////////////////////////////////////////////////////////////////////////////
// Internal library methods
//
namespace rocprofiler {
// Forward declaration: queue-create replacement installed by StandaloneIntercept()
hsa_status_t CreateQueuePro(
hsa_agent_t agent,
uint32_t size,
hsa_queue_type32_t type,
void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data),
void *data,
uint32_t private_segment_size,
uint32_t group_segment_size,
hsa_queue_t **queue);
// Saved HSA API entry points. The queue/signal/intercept ones are filled by
// SaveHsaApi() and written back by RestoreHsaApi(); the async-copy ones are
// filled by OnLoad() when memcopy interception is enabled. The remaining
// pointers are not assigned anywhere in this file.
decltype(hsa_queue_create)* hsa_queue_create_fn;
decltype(hsa_queue_destroy)* hsa_queue_destroy_fn;
decltype(hsa_signal_store_relaxed)* hsa_signal_store_relaxed_fn;
// NOTE(review): declared with the type of the relaxed variant; presumably the
// screlease variant has the same signature - confirm against hsa.h
decltype(hsa_signal_store_relaxed)* hsa_signal_store_screlease_fn;
decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed_fn;
decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed_fn;
decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed_fn;
decltype(hsa_queue_load_write_index_scacquire)* hsa_queue_load_write_index_scacquire_fn;
decltype(hsa_queue_store_write_index_screlease)* hsa_queue_store_write_index_screlease_fn;
decltype(hsa_queue_load_read_index_scacquire)* hsa_queue_load_read_index_scacquire_fn;
decltype(hsa_amd_queue_intercept_create)* hsa_amd_queue_intercept_create_fn;
decltype(hsa_amd_queue_intercept_register)* hsa_amd_queue_intercept_register_fn;
decltype(hsa_memory_allocate)* hsa_memory_allocate_fn;
decltype(hsa_memory_assign_agent)* hsa_memory_assign_agent_fn;
decltype(hsa_memory_copy)* hsa_memory_copy_fn;
decltype(hsa_amd_memory_pool_allocate)* hsa_amd_memory_pool_allocate_fn;
decltype(hsa_amd_memory_pool_free)* hsa_amd_memory_pool_free_fn;
decltype(hsa_amd_agents_allow_access)* hsa_amd_agents_allow_access_fn;
decltype(hsa_amd_memory_async_copy)* hsa_amd_memory_async_copy_fn;
decltype(hsa_amd_memory_async_copy_rect)* hsa_amd_memory_async_copy_rect_fn;
decltype(hsa_executable_freeze)* hsa_executable_freeze_fn;
decltype(hsa_executable_destroy)* hsa_executable_destroy_fn;
// HSA API table received in OnLoad(); kept so the table can be patched and restored later
::HsaApiTable* kHsaApiTable;
// Remembers the HSA API table passed to the tool and caches the original
// queue/signal/queue-intercept entry points, so that RestoreHsaApi() can put
// them back after the table has been patched.
void SaveHsaApi(::HsaApiTable* table) {
  util::HsaRsrcFactory::InitHsaApiTable(table);
  kHsaApiTable = table;

  // Core API entry points
  const auto* core_api = table->core_;
  hsa_queue_create_fn = core_api->hsa_queue_create_fn;
  hsa_queue_destroy_fn = core_api->hsa_queue_destroy_fn;
  hsa_signal_store_relaxed_fn = core_api->hsa_signal_store_relaxed_fn;
  hsa_signal_store_screlease_fn = core_api->hsa_signal_store_screlease_fn;
  hsa_queue_load_write_index_relaxed_fn = core_api->hsa_queue_load_write_index_relaxed_fn;
  hsa_queue_store_write_index_relaxed_fn = core_api->hsa_queue_store_write_index_relaxed_fn;
  hsa_queue_load_read_index_relaxed_fn = core_api->hsa_queue_load_read_index_relaxed_fn;
  hsa_queue_load_write_index_scacquire_fn = core_api->hsa_queue_load_write_index_scacquire_fn;
  hsa_queue_store_write_index_screlease_fn = core_api->hsa_queue_store_write_index_screlease_fn;
  hsa_queue_load_read_index_scacquire_fn = core_api->hsa_queue_load_read_index_scacquire_fn;

  // AMD extension entry points
  const auto* ext_api = table->amd_ext_;
  hsa_amd_queue_intercept_create_fn = ext_api->hsa_amd_queue_intercept_create_fn;
  hsa_amd_queue_intercept_register_fn = ext_api->hsa_amd_queue_intercept_register_fn;
}
void RestoreHsaApi() {
::HsaApiTable* table = kHsaApiTable;
table->core_->hsa_queue_create_fn = hsa_queue_create_fn;
table->core_->hsa_queue_destroy_fn = hsa_queue_destroy_fn;
table->core_->hsa_signal_store_relaxed_fn = hsa_signal_store_relaxed_fn;
table->core_->hsa_signal_store_screlease_fn = hsa_signal_store_screlease_fn;
table->core_->hsa_queue_load_write_index_relaxed_fn = hsa_queue_load_write_index_relaxed_fn;
table->core_->hsa_queue_store_write_index_relaxed_fn = hsa_queue_store_write_index_relaxed_fn;
table->core_->hsa_queue_load_read_index_relaxed_fn = hsa_queue_load_read_index_relaxed_fn;
table->core_->hsa_queue_load_write_index_scacquire_fn = hsa_queue_load_write_index_scacquire_fn;
table->core_->hsa_queue_store_write_index_screlease_fn = hsa_queue_store_write_index_screlease_fn;
table->core_->hsa_queue_load_read_index_scacquire_fn = hsa_queue_load_read_index_scacquire_fn;
table->amd_ext_->hsa_amd_queue_intercept_create_fn = hsa_amd_queue_intercept_create_fn;
table->amd_ext_->hsa_amd_queue_intercept_register_fn = hsa_amd_queue_intercept_register_fn;
}
// Standalone mode: only queue creation is redirected, to CreateQueuePro()
void StandaloneIntercept() {
  kHsaApiTable->core_->hsa_queue_create_fn = rocprofiler::CreateQueuePro;
}
// Signatures of the entry points looked up in the tool library:
// OnLoadTool()/OnUnloadTool() take no arguments, OnLoadToolProp() receives
// the settings structure and may modify it.
typedef void (*tool_handler_t)();
typedef void (*tool_handler_prop_t)(rocprofiler_settings_t*);
// dlopen() handle of the loaded tool library, NULL if no tool is loaded
void * tool_handle = NULL;
// Bits of the mode mask returned by LoadTool() and evaluated in OnLoad()
enum {
DISPATCH_INTERCEPT_MODE = 0x1,
CODE_OBJ_TRACKING_MODE = 0x2,
MEMCOPY_INTERCEPT_MODE = 0x4,
HSA_INTERCEPT_MODE = 0x8,
};
uint32_t LoadTool() {
uint32_t intercept_mode = 0;
const char* tool_lib = getenv("ROCP_TOOL_LIB");
std::ostringstream oss;
if (tool_lib) oss << "load tool library(" << tool_lib << ")";
ONLOAD_TRACE(oss.str());
if (tool_lib) {
intercept_mode = DISPATCH_INTERCEPT_MODE;
tool_handle = dlopen(tool_lib, RTLD_NOW);
if (tool_handle == NULL) {
fprintf(stderr, "ROCProfiler: can't load tool library \"%s\"\n", tool_lib);
fprintf(stderr, "%s\n", dlerror());
abort();
}
tool_handler_t handler = reinterpret_cast<tool_handler_t>(dlsym(tool_handle, "OnLoadTool"));
tool_handler_prop_t handler_prop = reinterpret_cast<tool_handler_prop_t>(dlsym(tool_handle, "OnLoadToolProp"));
if ((handler == NULL) && (handler_prop == NULL)) {
fprintf(stderr, "ROCProfiler: tool library corrupted, OnLoadTool()/OnLoadToolProp() method is expected\n");
fprintf(stderr, "%s\n", dlerror());
abort();
}
tool_handler_t on_unload_handler = reinterpret_cast<tool_handler_t>(dlsym(tool_handle, "OnUnloadTool"));
if (on_unload_handler == NULL) {
fprintf(stderr, "ROCProfiler: tool library corrupted, OnUnloadTool() method is expected\n");
fprintf(stderr, "%s\n", dlerror());
abort();
}
rocprofiler_settings_t settings{};
settings.intercept_mode = (intercept_mode != 0) ? 1 : 0;
settings.trace_size = TraceProfile::GetSize();
settings.trace_local = TraceProfile::IsLocal() ? 1: 0;
settings.timeout = util::HsaRsrcFactory::GetTimeoutNs();
settings.timestamp_on = InterceptQueue::IsTrackerOn() ? 1 : 0;
settings.code_obj_tracking = 1;
if (handler) handler();
else if (handler_prop) handler_prop(&settings);
TraceProfile::SetSize(settings.trace_size);
TraceProfile::SetLocal(settings.trace_local != 0);
util::HsaRsrcFactory::SetTimeoutNs(settings.timeout);
InterceptQueue::TrackerOn(settings.timestamp_on != 0);
if (settings.intercept_mode != 0) intercept_mode = DISPATCH_INTERCEPT_MODE;
if (settings.code_obj_tracking) intercept_mode |= CODE_OBJ_TRACKING_MODE;
if (settings.memcopy_tracking) intercept_mode |= MEMCOPY_INTERCEPT_MODE;
if (settings.hsa_intercepting) intercept_mode |= HSA_INTERCEPT_MODE;
if (settings.k_concurrent) {
Context::k_concurrent_ = settings.k_concurrent;
InterceptQueue::k_concurrent_ = settings.k_concurrent;
}
if (settings.opt_mode) InterceptQueue::opt_mode_ = true;
}
ONLOAD_TRACE("end intercept_mode(" << intercept_mode << ")");
return intercept_mode;
}
void PmcStopper() {
rocprofiler::util::HsaRsrcFactory* rsrc = &rocprofiler::util::HsaRsrcFactory::Instance();
const uint32_t gpu_count = rsrc->GetCountOfGpuAgents();
for (uint32_t gpu_id = 0; gpu_id < gpu_count; gpu_id++) {
// Get agent info
const rocprofiler::util::AgentInfo* agent_info;
if (rsrc->GetGpuAgentInfo(gpu_id, &agent_info) == false) {
fprintf(stderr, "Error: GetGpuAgentInfo(%u) \n", gpu_id);
abort();
}
// Create queue
hsa_queue_t* queue;
hsa_status_t status = rocprofiler::CreateQueuePro(agent_info->dev_id, 1,
HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT32_MAX, UINT32_MAX, &queue);
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "CreateQueuePro ("
<< gpu_id << ") " << std::hex << status);
// Submit packets
for (auto& pkt: Context::stop_packets_) {
rsrc->Submit(queue, &pkt);
// Wait for stop packet to complete
rsrc->SignalWaitRestore(pkt.completion_signal, 1);
}
hsa_queue_destroy(queue);
}
}
// Unload profiling tool library
// Calls the tool's OnUnloadTool() entry point and closes the library.
// Aborts the process if the entry point is missing.
void UnloadTool() {
  ONLOAD_TRACE("tool handle(" << tool_handle << ")");
  //if (Context::k_concurrent_) PmcStopper();
  if (tool_handle) {
    tool_handler_t handler = reinterpret_cast<tool_handler_t>(dlsym(tool_handle, "OnUnloadTool"));
    if (handler == NULL) {
      fprintf(stderr, "ROCProfiler error: tool library corrupted, OnUnloadTool() method is expected\n");
      fprintf(stderr, "%s\n", dlerror());
      abort();
    }
    handler();
    dlclose(tool_handle);
    // Drop the stale handle so a repeated unload does not touch a closed library
    tool_handle = NULL;
  }
  ONLOAD_TRACE_END();
}
// Library load hook (constructor attribute): creates the logger instance
CONSTRUCTOR_API void constructor() {
ONLOAD_TRACE_BEG();
util::Logger::Create();
ONLOAD_TRACE_END();
}
// Library unload hook (destructor attribute): destroys the metrics
// dictionary, the HSA resources factory and the logger, in that order
DESTRUCTOR_API void destructor() {
ONLOAD_TRACE_BEG();
rocprofiler::MetricsDict::Destroy();
util::HsaRsrcFactory::Destroy();
util::Logger::Destroy();
ONLOAD_TRACE_END();
}
// Returns the metrics dictionary for the given agent.
// Raises an HSA_STATUS_ERROR exception if the agent is unknown to the
// resources factory or the dictionary cannot be created.
const MetricsDict* GetMetrics(const hsa_agent_t& agent) {
  const rocprofiler::util::AgentInfo* info =
      rocprofiler::util::HsaRsrcFactory::Instance().GetAgentInfo(agent);
  if (!info) EXC_RAISING(HSA_STATUS_ERROR, "agent is not found");

  const MetricsDict* dict = MetricsDict::Create(info);
  if (!dict) EXC_RAISING(HSA_STATUS_ERROR, "MetricsDict create failed");
  return dict;
}
// Maps an exception to an HSA status code: a util::exception carries its own
// status, any other std::exception is reported as HSA_STATUS_ERROR.
hsa_status_t GetExcStatus(const std::exception& e) {
  if (const util::exception* prof_exc = dynamic_cast<const util::exception*>(&e)) {
    return static_cast<hsa_status_t>(prof_exc->status());
  }
  return HSA_STATUS_ERROR;
}
// Builds the 'Enable' command packet(s) for the given agent into 'command'.
//   agent_info - agent the command is generated for
//   command    - [out] buffer receiving the packet(s)
//   slot_count - capacity of 'command' in packets
// Returns the size in bytes of the generated packet data.
// Raises an exception if the buffer is too small, the command buffer cannot
// be sized/allocated/aligned, or an aqlprofile call fails.
inline size_t CreateEnableCmd(const rocprofiler::util::AgentInfo* agent_info, packet_t* command, const size_t& slot_count) {
  // Agents whose name starts with "gfx8" take the legacy (PM4) path
  const bool is_legacy = (strncmp(agent_info->name, "gfx8", 4) == 0);
  const size_t packet_count = (is_legacy) ? Profile::LEGACY_SLOT_SIZE_PKT : 1;
  rocprofiler::util::HsaRsrcFactory* hsa_rsrc = &rocprofiler::util::HsaRsrcFactory::Instance();

  if (packet_count > slot_count) EXC_RAISING(HSA_STATUS_ERROR, "packet_count > slot_count");

  // AQLprofile object
  hsa_ven_amd_aqlprofile_profile_t profile{};
  profile.agent = agent_info->dev_id;

  // Query for cmd buffer size
  hsa_status_t status = hsa_rsrc->AqlProfileApi()->hsa_ven_amd_aqlprofile_get_info(
      &profile, HSA_VEN_AMD_AQLPROFILE_INFO_ENABLE_CMD, NULL);
  if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "get_info(ENABLE_CMD).size exc");
  // 'status' is HSA_STATUS_SUCCESS from here on, so the checks below have to
  // raise an explicit error code to be reported as failures
  if (profile.command_buffer.size == 0) {
    EXC_RAISING(HSA_STATUS_ERROR, "get_info(ENABLE_CMD).size == 0");
  }

  // Allocate cmd buffer
  const size_t alignment_mask = 0x100 - 1;
  profile.command_buffer.ptr =
      hsa_rsrc->AllocateSysMemory(agent_info, profile.command_buffer.size);
  if (profile.command_buffer.ptr == NULL) {
    EXC_RAISING(HSA_STATUS_ERROR, "profile.command_buffer.ptr allocation failed");
  }
  if ((reinterpret_cast<uintptr_t>(profile.command_buffer.ptr) & alignment_mask) != 0) {
    EXC_RAISING(HSA_STATUS_ERROR, "profile.command_buffer.ptr bad alignment");
  }

  // Generating cmd packet
  if (is_legacy) {
    packet_t packet{};

    // Query for cmd buffer data
    status = hsa_rsrc->AqlProfileApi()->hsa_ven_amd_aqlprofile_get_info(
        &profile, HSA_VEN_AMD_AQLPROFILE_INFO_ENABLE_CMD, &packet);
    if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "get_info(ENABLE_CMD).data exc");

    // Convert the packet for legacy GFXIP
    status = hsa_rsrc->AqlProfileApi()->hsa_ven_amd_aqlprofile_legacy_get_pm4(&packet, command);
    if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "hsa_ven_amd_aqlprofile_legacy_get_pm4");
  } else {
    // Query for cmd buffer data
    status = hsa_rsrc->AqlProfileApi()->hsa_ven_amd_aqlprofile_get_info(
        &profile, HSA_VEN_AMD_AQLPROFILE_INFO_ENABLE_CMD, command);
    if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "get_info(ENABLE_CMD).data exc");
  }

  // Return cmd packet data size
  return (packet_count * sizeof(packet_t));
}
// Queue-create replacement used in standalone mode: creates the HSA queue
// with the original hsa_queue_create and submits the per-device 'Enable'
// command to it. The command is generated once per device and cached.
// Parameters and return value are those of hsa_queue_create. Raises an
// exception if the agent is unknown, the device index does not fit the
// 32-bit device mask, or the command cannot be generated.
hsa_status_t CreateQueuePro(
    hsa_agent_t agent,
    uint32_t size,
    hsa_queue_type32_t type,
    void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data),
    void *data,
    uint32_t private_segment_size,
    uint32_t group_segment_size,
    hsa_queue_t **queue)
{
  typedef std::pair<packet_t[Profile::LEGACY_SLOT_SIZE_PKT], uint32_t> cmd_entry_t;
  typedef std::vector<cmd_entry_t> cmd_vec_t;
  // Per-device command cache, indexed by device index; bit N of cmd_mask is
  // set once the entry for device N is valid. All three are guarded by cmd_mutex.
  static cmd_vec_t cmd_vec;
  static uint32_t cmd_mask = 0;
  static std::mutex cmd_mutex;
  const uint32_t max_devices = 32;  // number of bits in cmd_mask

  rocprofiler::util::HsaRsrcFactory* hsa_rsrc = &rocprofiler::util::HsaRsrcFactory::Instance();

  // Create HSA queue
  hsa_status_t status = hsa_queue_create_fn(
      agent,
      size,
      type,
      callback,
      data,
      private_segment_size,
      group_segment_size,
      queue);
  if (status != HSA_STATUS_SUCCESS) return status;

  // Resolve the device the queue belongs to
  const rocprofiler::util::AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(agent);
  if (agent_info == NULL) EXC_RAISING(HSA_STATUS_ERROR, "agent is not found");
  const uint32_t dev_index = agent_info->dev_index;
  if (dev_index >= max_devices) {
    EXC_RAISING(HSA_STATUS_ERROR, "bad device index (" << dev_index << ")");
  }
  const uint32_t dev_mask = 1u << dev_index;

  {
    // The lock covers lookup, lazy creation and submission: growing the
    // vector may reallocate it, so no entry may be read concurrently
    std::lock_guard<std::mutex> lck(cmd_mutex);

    // Create 'Enable' cmd packet on first use of the device
    if ((cmd_mask & dev_mask) == 0) {
      while (cmd_vec.size() <= dev_index) cmd_vec.push_back({});
      cmd_vec[dev_index].second =
          CreateEnableCmd(agent_info, cmd_vec[dev_index].first, Profile::LEGACY_SLOT_SIZE_PKT);
      // Mark the entry valid only after the command was generated
      cmd_mask |= dev_mask;
    }

    // Enable counters for the queue
    rocprofiler::util::HsaRsrcFactory::Instance().Submit(
        *queue, cmd_vec[dev_index].first, cmd_vec[dev_index].second);
  }

  return HSA_STATUS_SUCCESS;
}
// Handler registered for tracked async copies: prints the tracker entry index
// and the begin/end values of its record. 'arg' is the tracker entry.
// Always returns false.
bool async_copy_handler(hsa_signal_value_t value, void* arg) {
  const Tracker::entry_t* copy_entry = static_cast<Tracker::entry_t*>(arg);
  printf("%lu: async-copy time(%lu,%lu)\n",
         copy_entry->index, copy_entry->record->begin, copy_entry->record->end);
  return false;
}
// Replacement for hsa_amd_memory_async_copy: allocates a tracker entry for
// the caller's completion signal, forwards the copy to the original function
// with the tracker's signal, and either enables memcopy tracking for the
// entry (on success) or deletes it (on failure).
// Returns the status of the original call.
hsa_status_t hsa_amd_memory_async_copy_interceptor(
    void* dst, hsa_agent_t dst_agent, const void* src,
    hsa_agent_t src_agent, size_t size, uint32_t num_dep_signals,
    const hsa_signal_t* dep_signals, hsa_signal_t completion_signal)
{
  Tracker& tracker = Tracker::Instance();
  Tracker::entry_t* const entry = tracker.Alloc(hsa_agent_t{}, completion_signal);

  const hsa_status_t copy_status = hsa_amd_memory_async_copy_fn(
      dst, dst_agent, src, src_agent, size, num_dep_signals, dep_signals, entry->signal);

  if (copy_status != HSA_STATUS_SUCCESS) {
    tracker.Delete(entry);
  } else {
    tracker.EnableMemcopy(entry, async_copy_handler, reinterpret_cast<void*>(entry));
  }
  return copy_status;
}
// Replacement for hsa_amd_memory_async_copy_rect: same scheme as the plain
// async-copy interceptor - the copy is forwarded with the tracker's signal
// and the tracker entry is enabled on success or deleted on failure.
// Returns the status of the original call.
hsa_status_t hsa_amd_memory_async_copy_rect_interceptor(
    const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src,
    const hsa_dim3_t* src_offset, const hsa_dim3_t* range, hsa_agent_t copy_agent,
    hsa_amd_copy_direction_t dir, uint32_t num_dep_signals, const hsa_signal_t* dep_signals,
    hsa_signal_t completion_signal)
{
  Tracker& tracker = Tracker::Instance();
  Tracker::entry_t* const entry = tracker.Alloc(hsa_agent_t{}, completion_signal);

  const hsa_status_t copy_status = hsa_amd_memory_async_copy_rect_fn(
      dst, dst_offset, src, src_offset, range, copy_agent,
      dir, num_dep_signals, dep_signals, entry->signal);

  if (copy_status != HSA_STATUS_SUCCESS) {
    tracker.Delete(entry);
  } else {
    tracker.EnableMemcopy(entry, async_copy_handler, reinterpret_cast<void*>(entry));
  }
  return copy_status;
}
// Definitions of library-wide objects and static class members
rocprofiler_properties_t rocprofiler_properties;
// Default trace output buffer settings; LoadTool() may override them with the tool's settings
uint32_t TraceProfile::output_buffer_size_ = 0x2000000; // 32M
bool TraceProfile::output_buffer_local_ = true;
// Tracker static state
std::atomic<Tracker*> Tracker::instance_{};
Tracker::mutex_t Tracker::glob_mutex_;
Tracker::counter_t Tracker::counter_ = 0;
// Logger static state
util::Logger::mutex_t util::Logger::mutex_;
std::atomic<util::Logger*> util::Logger::instance_{};
}
// NOTE(review): macro defined outside this file; presumably provides the
// definitions of the Context static members - confirm in core/context.h
CONTEXT_INSTANTIATE();
///////////////////////////////////////////////////////////////////////////////////////////////////
// Public library methods
//
extern "C" {
// HSA-runtime tool on-load method
// Saves the HSA API table, evaluates ROCP_HSA_INTERCEPT, loads the tool
// library and installs the interceptors selected by the resulting mode mask.
// ROCP_HSA_INTERCEPT values: 0 - intercepting disabled, 1 - enabled without
// timestamping, 2 - enabled with timestamping.
// Returns false on a bad ROCP_HSA_INTERCEPT value, true otherwise; aborts on
// inconsistent mode combinations.
PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t failed_tool_count,
                       const char* const* failed_tool_names) {
  ONLOAD_TRACE_BEG();
  rocprofiler::SaveHsaApi(table);
  rocprofiler::ProxyQueue::InitFactory();
  bool intercept_mode = false;

  // Checking environment to enable intercept mode
  const char* intercept_env = getenv("ROCP_HSA_INTERCEPT");
  if (intercept_env != NULL) {
    switch (atoi(intercept_env)) {
      // Intercepting disabled
      case 0:
        intercept_mode = false;
        rocprofiler::InterceptQueue::TrackerOn(false);
        break;
      // Intercepting enabled without timestamping
      case 1:
        intercept_mode = true;
        rocprofiler::InterceptQueue::TrackerOn(false);
        break;
      // Intercepting enabled with timestamping
      case 2:
        intercept_mode = true;
        rocprofiler::InterceptQueue::TrackerOn(true);
        break;
      default:
        ERR_LOGGING("Bad ROCP_HSA_INTERCEPT env var value (" << intercept_env << ")");
        return false;
    }
  }

  // Loading a tool lib and setting of intercept mode
  const uint32_t intercept_mode_mask = rocprofiler::LoadTool();
  if (intercept_mode_mask & rocprofiler::DISPATCH_INTERCEPT_MODE) {
    intercept_mode = true;
  }

  if (intercept_mode_mask & rocprofiler::CODE_OBJ_TRACKING_MODE) {
    // This entry point has no try/catch around it, so the error is reported
    // with EXC_ABORT like the other failures below instead of being thrown
    if (intercept_mode == false) {
      EXC_ABORT(HSA_STATUS_ERROR, "code objects tracking without intercept mode enabled");
    }
    rocprofiler::util::HsaRsrcFactory::EnableExecutableTracking(table);
  }

  if (intercept_mode_mask & rocprofiler::MEMCOPY_INTERCEPT_MODE) {
    hsa_status_t status = hsa_amd_profiling_async_copy_enable(true);
    if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "hsa_amd_profiling_async_copy_enable");
    // Save the original async-copy entry points and install the interceptors
    rocprofiler::hsa_amd_memory_async_copy_fn = table->amd_ext_->hsa_amd_memory_async_copy_fn;
    rocprofiler::hsa_amd_memory_async_copy_rect_fn = table->amd_ext_->hsa_amd_memory_async_copy_rect_fn;
    table->amd_ext_->hsa_amd_memory_async_copy_fn = rocprofiler::hsa_amd_memory_async_copy_interceptor;
    table->amd_ext_->hsa_amd_memory_async_copy_rect_fn = rocprofiler::hsa_amd_memory_async_copy_rect_interceptor;
  }

  if (intercept_mode_mask & rocprofiler::HSA_INTERCEPT_MODE) {
    if (intercept_mode_mask & rocprofiler::MEMCOPY_INTERCEPT_MODE) {
      EXC_ABORT(HSA_STATUS_ERROR, "HSA_INTERCEPT and MEMCOPY_INTERCEPT conflict");
    }
    rocprofiler::HsaInterceptor::Enable(true);
    rocprofiler::HsaInterceptor::HsaIntercept(table);
  }

  // HSA intercepting
  if (intercept_mode) {
    rocprofiler::ProxyQueue::HsaIntercept(table);
    rocprofiler::InterceptQueue::HsaIntercept(table);
  } else {
    rocprofiler::StandaloneIntercept();
  }

  ONLOAD_TRACE_END();
  return true;
}
// HSA-runtime tool on-unload method
// Unloads the tool library first, then restores the saved HSA API entry points
PUBLIC_API void OnUnload() {
ONLOAD_TRACE_BEG();
rocprofiler::UnloadTool();
rocprofiler::RestoreHsaApi();
ONLOAD_TRACE_END();
}
// Returns library version (major and minor parts)
PUBLIC_API uint32_t rocprofiler_version_major() { return ROCPROFILER_VERSION_MAJOR; }
PUBLIC_API uint32_t rocprofiler_version_minor() { return ROCPROFILER_VERSION_MINOR; }
// Returns the last error message
//   str - [out] receives a pointer to the message text
// NOTE(review): the pointer comes from c_str() on the logger's last message;
// how long it stays valid depends on Logger::LastMessage() - confirm
PUBLIC_API hsa_status_t rocprofiler_error_string(const char** str) {
API_METHOD_PREFIX
*str = rocprofiler::util::Logger::LastMessage().c_str();
API_METHOD_SUFFIX
}
// Create new profiling context
//   agent         - GPU agent to profile
//   features      - profiling features array
//   feature_count - number of entries in 'features'
//   handle        - [out] created context
//   mode          - 0, or ROCPROFILER_MODE_STANDALONE optionally combined
//                   with ROCPROFILER_MODE_CREATEQUEUE
//   properties    - context properties (queue, queue depth, handler)
// Returns HSA_STATUS_SUCCESS, or an error status if an argument is NULL, the
// agent is unknown, the mode is invalid or the queue cannot be created.
PUBLIC_API hsa_status_t rocprofiler_open(hsa_agent_t agent, rocprofiler_feature_t* features,
                                         uint32_t feature_count, rocprofiler_t** handle, uint32_t mode,
                                         rocprofiler_properties_t* properties) {
  API_METHOD_PREFIX
  // Both pointers are dereferenced unconditionally below
  if (handle == NULL) EXC_RAISING(HSA_STATUS_ERROR, "NULL handle");
  if (properties == NULL) EXC_RAISING(HSA_STATUS_ERROR, "NULL properties");

  rocprofiler::util::HsaRsrcFactory* hsa_rsrc = &rocprofiler::util::HsaRsrcFactory::Instance();
  const rocprofiler::util::AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(agent);
  if (agent_info == NULL) {
    EXC_RAISING(HSA_STATUS_ERROR, "agent is not found");
  }

  rocprofiler::Queue* queue = NULL;
  if (mode != 0) {
    if (mode & ROCPROFILER_MODE_STANDALONE) {
      // Optionally create the queue; otherwise the one given in 'properties' is used
      if (mode & ROCPROFILER_MODE_CREATEQUEUE) {
        if (hsa_rsrc->CreateQueue(agent_info, properties->queue_depth, &(properties->queue)) == false) {
          EXC_RAISING(HSA_STATUS_ERROR, "CreateQueue() failed");
        }
      }
      queue = new rocprofiler::HsaQueue(agent_info, properties->queue);
    } else {
      EXC_RAISING(HSA_STATUS_ERROR, "invalid mode (" << mode << ")");
    }
  }

  rocprofiler::Context** context_ret = reinterpret_cast<rocprofiler::Context**>(handle);
  *context_ret = rocprofiler::Context::Create(agent_info, queue, features, feature_count,
                                              properties->handler, properties->handler_arg);
  API_METHOD_SUFFIX
}
// Destroys a profiling context; a NULL handle is accepted and ignored
PUBLIC_API hsa_status_t rocprofiler_close(rocprofiler_t* handle) {
  API_METHOD_PREFIX
  if (handle != NULL) {
    rocprofiler::Context::Destroy(reinterpret_cast<rocprofiler::Context*>(handle));
  }
  API_METHOD_SUFFIX
}
// Resets the given group of a profiling context
PUBLIC_API hsa_status_t rocprofiler_reset(rocprofiler_t* handle, uint32_t group_index) {
  API_METHOD_PREFIX
  reinterpret_cast<rocprofiler::Context*>(handle)->Reset(group_index);
  API_METHOD_SUFFIX
}
// Stores the agent of a profiling context into 'agent'
PUBLIC_API hsa_status_t rocprofiler_get_agent(rocprofiler_t* handle, hsa_agent_t* agent) {
  API_METHOD_PREFIX
  *agent = reinterpret_cast<rocprofiler::Context*>(handle)->GetAgent();
  API_METHOD_SUFFIX
}
// Stores the number of profiling groups of a context into 'group_count'
PUBLIC_API hsa_status_t rocprofiler_group_count(const rocprofiler_t* handle,
                                                uint32_t* group_count) {
  API_METHOD_PREFIX
  *group_count = reinterpret_cast<const rocprofiler::Context*>(handle)->GetGroupCount();
  API_METHOD_SUFFIX
}
// Stores the descriptor of the group with the given index into 'group'
PUBLIC_API hsa_status_t rocprofiler_get_group(rocprofiler_t* handle, uint32_t group_index,
                                              rocprofiler_group_t* group) {
  API_METHOD_PREFIX
  *group = reinterpret_cast<rocprofiler::Context*>(handle)->GetGroupDescr(group_index);
  API_METHOD_SUFFIX
}
// Starts profiling for the given group of a context
PUBLIC_API hsa_status_t rocprofiler_start(rocprofiler_t* handle, uint32_t group_index) {
  API_METHOD_PREFIX
  reinterpret_cast<rocprofiler::Context*>(handle)->Start(group_index);
  API_METHOD_SUFFIX
}
// Stops profiling for the given group of a context
PUBLIC_API hsa_status_t rocprofiler_stop(rocprofiler_t* handle, uint32_t group_index) {
  API_METHOD_PREFIX
  reinterpret_cast<rocprofiler::Context*>(handle)->Stop(group_index);
  API_METHOD_SUFFIX
}
// Issues a read for the given group of a context
PUBLIC_API hsa_status_t rocprofiler_read(rocprofiler_t* handle, uint32_t group_index) {
  API_METHOD_PREFIX
  reinterpret_cast<rocprofiler::Context*>(handle)->Read(group_index);
  API_METHOD_SUFFIX
}
// Retrieves the profiling data of the given group of a context
PUBLIC_API hsa_status_t rocprofiler_get_data(rocprofiler_t* handle, uint32_t group_index) {
  API_METHOD_PREFIX
  reinterpret_cast<rocprofiler::Context*>(handle)->GetData(group_index);
  API_METHOD_SUFFIX
}
// Start profiling for the group described by 'group'
PUBLIC_API hsa_status_t rocprofiler_group_start(rocprofiler_group_t* group) {
  API_METHOD_PREFIX
  // rocprofiler_start() handles its own exceptions and reports failures only
  // through its return value, so the value has to be propagated
  status = rocprofiler_start(group->context, group->index);
  API_METHOD_SUFFIX
}
// Stop profiling for the group described by 'group'
PUBLIC_API hsa_status_t rocprofiler_group_stop(rocprofiler_group_t* group) {
  API_METHOD_PREFIX
  // rocprofiler_stop() handles its own exceptions and reports failures only
  // through its return value, so the value has to be propagated
  status = rocprofiler_stop(group->context, group->index);
  API_METHOD_SUFFIX
}
// Read profiling for the group described by 'group'
PUBLIC_API hsa_status_t rocprofiler_group_read(rocprofiler_group_t* group) {
  API_METHOD_PREFIX
  // rocprofiler_read() handles its own exceptions and reports failures only
  // through its return value, so the value has to be propagated
  status = rocprofiler_read(group->context, group->index);
  API_METHOD_SUFFIX
}
// Retrieves the profiling data for the group described by 'group'
PUBLIC_API hsa_status_t rocprofiler_group_get_data(rocprofiler_group_t* group) {
  API_METHOD_PREFIX
  reinterpret_cast<rocprofiler::Context*>(group->context)->GetData(group->index);
  API_METHOD_SUFFIX
}
// Retrieves the metrics data of a context
PUBLIC_API hsa_status_t rocprofiler_get_metrics(const rocprofiler_t* handle) {
  API_METHOD_PREFIX
  reinterpret_cast<const rocprofiler::Context*>(handle)->GetMetricsData();
  API_METHOD_SUFFIX
}
// Set queue callbacks
//   callbacks - callbacks structure passed to the intercept queue
//   data      - user argument passed along with the callbacks
PUBLIC_API hsa_status_t rocprofiler_set_queue_callbacks(rocprofiler_queue_callbacks_t callbacks, void* data) {
API_METHOD_PREFIX
rocprofiler::InterceptQueue::SetCallbacks(callbacks, data);
API_METHOD_SUFFIX
}
// Remove queue callbacks previously registered on the intercept queue
PUBLIC_API hsa_status_t rocprofiler_remove_queue_callbacks() {
API_METHOD_PREFIX
rocprofiler::InterceptQueue::RemoveCallbacks();
API_METHOD_SUFFIX
}
// Start queue callbacks (forwards to InterceptQueue::Start())
PUBLIC_API hsa_status_t rocprofiler_start_queue_callbacks() {
API_METHOD_PREFIX
rocprofiler::InterceptQueue::Start();
API_METHOD_SUFFIX
}
// Stop queue callbacks (forwards to InterceptQueue::Stop())
PUBLIC_API hsa_status_t rocprofiler_stop_queue_callbacks() {
API_METHOD_PREFIX
rocprofiler::InterceptQueue::Stop();
API_METHOD_SUFFIX
}
// Iterates the trace output data of a context, passing 'callback' and 'data'
// on to the context's trace data iteration
PUBLIC_API hsa_status_t rocprofiler_iterate_trace_data(
    rocprofiler_t* handle, hsa_ven_amd_aqlprofile_data_callback_t callback, void* data) {
  API_METHOD_PREFIX
  reinterpret_cast<rocprofiler::Context*>(handle)->IterateTraceData(callback, data);
  API_METHOD_SUFFIX
}
////////////////////////////////////////////////////////////////////////////////
// Opens a profiling pool for the given agent and stores its handle in 'pool'.
// Returns an error status if the agent is unknown.
PUBLIC_API hsa_status_t rocprofiler_pool_open(hsa_agent_t agent, // GPU handle
                                              rocprofiler_feature_t* features, // [in] profiling features array
                                              uint32_t feature_count, // profiling info count
                                              rocprofiler_pool_t** pool, // [out] context object
                                              uint32_t mode, // profiling mode mask
                                              rocprofiler_pool_properties_t* properties) // pool properties
{
  API_METHOD_PREFIX
  const rocprofiler::util::AgentInfo* pool_agent =
      rocprofiler::util::HsaRsrcFactory::Instance().GetAgentInfo(agent);
  if (!pool_agent) {
    EXC_RAISING(HSA_STATUS_ERROR, "agent is not found");
  }

  *pool = reinterpret_cast<rocprofiler_pool_t*>(rocprofiler::ContextPool::Create(
      properties->num_entries, properties->payload_bytes, pool_agent, features, feature_count,
      properties->handler, properties->handler_arg));
  API_METHOD_SUFFIX
}
// Destroys a profiling pool
PUBLIC_API hsa_status_t rocprofiler_pool_close(rocprofiler_pool_t* pool) // profiling pool handle
{
  API_METHOD_PREFIX
  rocprofiler::ContextPool::Destroy(reinterpret_cast<rocprofiler::ContextPool*>(pool));
  API_METHOD_SUFFIX
}
// Fetches an entry from a profiling pool into 'entry'
PUBLIC_API hsa_status_t rocprofiler_pool_fetch(rocprofiler_pool_t* pool, // profiling pool handle
                                               rocprofiler_pool_entry_t* entry) // [out] empty profiling pool entry
{
  API_METHOD_PREFIX
  reinterpret_cast<rocprofiler::ContextPool*>(pool)->Fetch(entry);
  API_METHOD_SUFFIX
}
// Flushes a profiling pool
PUBLIC_API hsa_status_t rocprofiler_pool_flush(rocprofiler_pool_t* pool) // profiling pool handle
{
  API_METHOD_PREFIX
  reinterpret_cast<rocprofiler::ContextPool*>(pool)->Flush();
  API_METHOD_SUFFIX
}
////////////////////////////////////////////////////////////////////////////////
// Returns the info for a given info kind.
// 'data' receives a 32-bit value: the number of metrics of the agent for
// ROCPROFILER_INFO_KIND_METRIC_COUNT, or 1 for ROCPROFILER_INFO_KIND_TRACE_COUNT.
// A NULL agent or any other kind is reported as an error.
PUBLIC_API hsa_status_t rocprofiler_get_info(
    const hsa_agent_t *agent,
    rocprofiler_info_kind_t kind,
    void *data)
{
  API_METHOD_PREFIX
  if (!agent) EXC_RAISING(HSA_STATUS_ERROR, "NULL agent");

  uint32_t* const out_u32 = reinterpret_cast<uint32_t*>(data);
  if (kind == ROCPROFILER_INFO_KIND_METRIC_COUNT) {
    *out_u32 = rocprofiler::GetMetrics(*agent)->Size();
  } else if (kind == ROCPROFILER_INFO_KIND_TRACE_COUNT) {
    *out_u32 = 1;
  } else {
    EXC_RAISING(HSA_STATUS_ERROR, "unknown info kind(" << kind << ")");
  }
  API_METHOD_SUFFIX
}
// Iterate over the info for a given info kind, and invoke an application-defined callback on every iteration
//   agent    - agent to iterate; NULL iterates all GPU agents
//   kind     - ROCPROFILER_INFO_KIND_METRIC or ROCPROFILER_INFO_KIND_TRACE
//   callback - invoked for every info record; a return value other than
//              HSA_STATUS_SUCCESS stops the iteration, HSA_STATUS_INFO_BREAK
//              being treated as a successful stop
//   data     - user argument passed to the callback
// The strings placed in the info record are heap copies made with strdup()
// and are not freed by this function.
PUBLIC_API hsa_status_t rocprofiler_iterate_info(
    const hsa_agent_t* agent,
    rocprofiler_info_kind_t kind,
    hsa_status_t (*callback)(const rocprofiler_info_data_t info, void* data),
    void* data)
{
  API_METHOD_PREFIX
  if (callback == NULL) EXC_RAISING(HSA_STATUS_ERROR, "NULL callback");

  rocprofiler::util::HsaRsrcFactory* hsa_rsrc = &rocprofiler::util::HsaRsrcFactory::Instance();
  rocprofiler_info_data_t info{};
  info.kind = kind;
  uint32_t agent_idx = 0;
  // 0 means "no upper bound": the loop ends when GetGpuAgentInfo() fails
  uint32_t agent_max = 0;
  const rocprofiler::util::AgentInfo* agent_info = NULL;

  if (agent != NULL) {
    agent_info = hsa_rsrc->GetAgentInfo(*agent);
    if (agent_info == NULL) EXC_RAISING(HSA_STATUS_ERROR, "agent is not found");
    agent_idx = agent_info->dev_index;
    agent_max = agent_idx + 1;
  }

  while (hsa_rsrc->GetGpuAgentInfo(agent_idx, &agent_info)) {
    info.agent_index = agent_idx;

    switch (kind) {
      case ROCPROFILER_INFO_KIND_METRIC:
      {
        const rocprofiler::MetricsDict* dict = rocprofiler::GetMetrics(agent_info->dev_id);
        auto nodes_vec = dict->GetNodes();

        for (auto* node : nodes_vec) {
          const std::string& name = node->opts["name"];
          const std::string& descr = node->opts["descr"];
          const std::string& expr = node->opts["expr"];

          // Block related values; only filled for basic metrics (empty expression)
          uint32_t instances = 1;
          uint32_t block_counters = 0;
          std::string block_name;

          if (expr.empty()) {
            // Getting the block name
            block_name = node->opts["block"];

            // Querying profile
            rocprofiler::profile_t profile = {};
            profile.agent = agent_info->dev_id;
            profile.type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC;

            // Query block id info
            hsa_ven_amd_aqlprofile_id_query_t query = {block_name.c_str(), 0, 0};
            hsa_status_t query_status = rocprofiler::util::HsaRsrcFactory::Instance().AqlProfileApi()->hsa_ven_amd_aqlprofile_get_info(
                &profile, HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_ID, &query);
            if (query_status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(HSA_STATUS_ERROR, "get block id info: '" << block_name << "'");

            // Metric object
            const std::string metric_name = (query.instance_count > 1) ? name + "[0]" : name;
            const rocprofiler::Metric* metric = dict->Get(metric_name);
            if (metric == NULL) EXC_RAISING(HSA_STATUS_ERROR, "metric '" << name << "' is not found");

            // Process metrics counters
            const rocprofiler::counters_vec_t& counters_vec = metric->GetCounters();
            if (counters_vec.size() != 1) EXC_RAISING(HSA_STATUS_ERROR, "error: '" << metric->GetName() << "' is not basic");

            // Query block counters number
            profile.events = &(counters_vec[0]->event);
            query_status = rocprofiler::util::HsaRsrcFactory::Instance().AqlProfileApi()->hsa_ven_amd_aqlprofile_get_info(
                &profile, HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_COUNTERS, &block_counters);
            // The metric is skipped (not reported) if the query fails; this
            // does not affect the status returned to the caller
            if (query_status != HSA_STATUS_SUCCESS) continue;

            instances = query.instance_count;
          }

          // Fill the record only for metrics that are actually reported, so
          // skipped metrics do not allocate strings
          info.metric.name = strdup(name.c_str());
          info.metric.description = strdup(descr.c_str());
          info.metric.expr = expr.empty() ? NULL : strdup(expr.c_str());
          info.metric.instances = instances;
          // The block name is copied: the local string does not outlive this
          // iteration while the record is handed to the callback. Derived
          // metrics get a NULL block name instead of a leftover value.
          info.metric.block_name = expr.empty() ? strdup(block_name.c_str()) : NULL;
          info.metric.block_counters = block_counters;

          status = callback(info, data);
          if (status != HSA_STATUS_SUCCESS) break;
        }
        break;
      }
      case ROCPROFILER_INFO_KIND_TRACE:
      {
        info.trace.name = strdup("TT");
        info.trace.description = strdup("Thread Trace");
        info.trace.parameter_count = 5;
        status = callback(info, data);
        break;
      }
      default:
        EXC_RAISING(HSA_STATUS_ERROR, "unknown info kind(" << kind << ")");
    }

    // A stop request or error from the callback ends the whole iteration,
    // not only the iteration over the current agent
    if (status != HSA_STATUS_SUCCESS) break;

    ++agent_idx;
    if (agent_idx == agent_max) break;
  }

  if (status == HSA_STATUS_INFO_BREAK) status = HSA_STATUS_SUCCESS;
  if (status != HSA_STATUS_SUCCESS) ERR_LOGGING("iterate_info error, info kind(" << kind << ")");
  API_METHOD_SUFFIX
}
// Iterate over the info for a given info query, and invoke an application-defined callback on every iteration
// Not implemented: always raises an error, which is reported as the returned status
PUBLIC_API hsa_status_t rocprofiler_query_info(
const hsa_agent_t *agent,
rocprofiler_info_query_t query,
hsa_status_t (*callback)(const rocprofiler_info_data_t info, void *data),
void *data)
{
API_METHOD_PREFIX
EXC_RAISING(HSA_STATUS_ERROR, "Not implemented");
API_METHOD_SUFFIX
}
// Creates a profiled queue. All dispatches on this queue will be profiled
// The parameters are passed unchanged to InterceptQueue::QueueCreateTracked(),
// whose status is returned
PUBLIC_API hsa_status_t rocprofiler_queue_create_profiled(
hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type,
void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data),
void* data, uint32_t private_segment_size, uint32_t group_segment_size,
hsa_queue_t** queue)
{
API_METHOD_PREFIX
status = rocprofiler::InterceptQueue::QueueCreateTracked(
agent, size, type, callback, data, private_segment_size, group_segment_size, queue);
API_METHOD_SUFFIX
}
// Return time value for a given time ID and profiling timestamp
//   time_id   - time ID
//   timestamp - profiling timestamp to convert
//   value_ns  - [out] time value, optional (may be NULL)
//   error_ns  - [out] time error, optional (may be NULL)
// The error is queried first; the value is only queried if that succeeded.
PUBLIC_API hsa_status_t rocprofiler_get_time(
    rocprofiler_time_id_t time_id,
    uint64_t timestamp,
    uint64_t* value_ns,
    uint64_t* error_ns)
{
  API_METHOD_PREFIX
  const bool want_error = (error_ns != NULL);
  const bool want_value = (value_ns != NULL);

  if (want_error) {
    *error_ns = 0;
    status = rocprofiler::util::HsaRsrcFactory::Instance().GetTimeErr(time_id, error_ns);
  }
  if (want_value && (status == HSA_STATUS_SUCCESS)) {
    *value_ns = 0;
    status = rocprofiler::util::HsaRsrcFactory::Instance().GetTimeVal(time_id, timestamp, value_ns);
  }
  API_METHOD_SUFFIX
}
} // extern "C"
///////////////////////////////////////////////////////////////////////////////////////////////////
// HSA API callbacks routines
//
// Definitions of the HsaInterceptor static members
// Interception is off by default; OnLoad() calls HsaInterceptor::Enable(true)
// when HSA intercepting is requested
bool rocprofiler::HsaInterceptor::enable_ = false;
// Per-thread flag (thread_local)
thread_local bool rocprofiler::HsaInterceptor::recursion_ = false;;
rocprofiler_hsa_callbacks_t rocprofiler::HsaInterceptor::callbacks_{};
rocprofiler::HsaInterceptor::arg_t rocprofiler::HsaInterceptor::arg_{};
hsa_ven_amd_loader_1_01_pfn_t rocprofiler::HsaInterceptor::LoaderApiTable{};
rocprofiler::HsaInterceptor::mutex_t rocprofiler::HsaInterceptor::mutex_;
// Set HSA callbacks. If a callback is NULL then it is disabled
//   callbacks - HSA callbacks structure; its 'submit' member is additionally
//               registered as the intercept queue submit callback
//   arg       - user argument passed along with the callbacks
extern "C" PUBLIC_API hsa_status_t rocprofiler_set_hsa_callbacks(const rocprofiler_hsa_callbacks_t callbacks, void* arg) {
API_METHOD_PREFIX
rocprofiler::HsaInterceptor::SetCallbacks(callbacks, arg);
rocprofiler::InterceptQueue::SetSubmitCallback(callbacks.submit, arg);
API_METHOD_SUFFIX
}