[ROCm/roctracer commit: c1cca98a71]
Этот коммит содержится в:
Evgeny
2019-06-24 21:05:12 -05:00
родитель 775e64f392
Коммит 7a00dcb37e
19 изменённых файлов: 1420 добавлений и 308 удалений
+15 -5
Просмотреть файл
@@ -90,11 +90,6 @@ roctracer_status_t roctracer_op_code(
uint32_t* op, // [out] op code
uint32_t* kind = NULL); // [out] op kind code
// Set properties
roctracer_status_t roctracer_set_properties(
roctracer_domain_t domain, // tracing domain
void* propertes); // tracing properties
////////////////////////////////////////////////////////////////////////////////
// Callback API
//
@@ -214,6 +209,21 @@ roctracer_status_t roctracer_disable_activity();
roctracer_status_t roctracer_flush_activity(
roctracer_pool_t* pool = NULL); // memory pool, NULL is a default one
// Load/Un;oad methods
// Set properties
roctracer_status_t roctracer_set_properties(
roctracer_domain_t domain, // tracing domain
void* propertes); // tracing properties
struct HsaApiTable;
bool roctracer_load(
HsaApiTable* table,
uint64_t runtime_version,
uint64_t failed_tool_count,
const char* const* failed_tool_names);
void roctracer_unload();
#ifdef __cplusplus
} // extern "C" block
#endif // __cplusplus
+2
Просмотреть файл
@@ -74,8 +74,10 @@ extern AmdExtTable AmdExtTable_saved;
extern ImageExtTable ImageExtTable_saved;
struct ops_properties_t {
void* table;
activity_async_callback_t async_copy_callback_fun;
void* async_copy_callback_arg;
const char* output_prefix;
};
}; // namespace hsa_support
+3
Просмотреть файл
@@ -4,6 +4,9 @@
set ( TARGET_LIB "${TARGET_NAME}" )
set ( LIB_SRC
${LIB_DIR}/core/roctracer.cpp
${LIB_DIR}/proxy/proxy_queue.cpp
${LIB_DIR}/proxy/simple_proxy_queue.cpp
${LIB_DIR}/proxy/intercept_queue.cpp
${LIB_DIR}/util/hsa_rsrc_factory.cpp
)
add_library ( ${TARGET_LIB} SHARED ${LIB_SRC} )
+200 -46
Просмотреть файл
@@ -35,12 +35,18 @@ THE SOFTWARE.
#include <sys/syscall.h>
#include "core/loader.h"
#include "core/trace_buffer.h"
#include "proxy/tracker.h"
#include "ext/hsa_rt_utils.hpp"
#include "util/exception.h"
#include "util/hsa_rsrc_factory.h"
#include "util/logger.h"
#include "proxy/hsa_queue.h"
#include "proxy/intercept_queue.h"
#include "proxy/proxy_queue.h"
#include "proxy/simple_proxy_queue.h"
#define PUBLIC_API __attribute__((visibility("default")))
#define CONSTRUCTOR_API __attribute__((constructor))
#define DESTRUCTOR_API __attribute__((destructor))
@@ -85,17 +91,86 @@ THE SOFTWARE.
///////////////////////////////////////////////////////////////////////////////////////////////////
// Internal library methods
//
namespace rocprofiler {
decltype(hsa_queue_create)* hsa_queue_create_fn;
decltype(hsa_queue_destroy)* hsa_queue_destroy_fn;
decltype(hsa_signal_store_relaxed)* hsa_signal_store_relaxed_fn;
decltype(hsa_signal_store_relaxed)* hsa_signal_store_screlease_fn;
decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed_fn;
decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed_fn;
decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed_fn;
decltype(hsa_queue_load_write_index_scacquire)* hsa_queue_load_write_index_scacquire_fn;
decltype(hsa_queue_store_write_index_screlease)* hsa_queue_store_write_index_screlease_fn;
decltype(hsa_queue_load_read_index_scacquire)* hsa_queue_load_read_index_scacquire_fn;
decltype(hsa_amd_queue_intercept_create)* hsa_amd_queue_intercept_create_fn;
decltype(hsa_amd_queue_intercept_register)* hsa_amd_queue_intercept_register_fn;
decltype(hsa_amd_memory_async_copy)* hsa_amd_memory_async_copy_fn;
decltype(hsa_amd_memory_async_copy_rect)* hsa_amd_memory_async_copy_rect_fn;
::HsaApiTable* kHsaApiTable;
void SaveHsaApi(::HsaApiTable* table) {
util::HsaRsrcFactory::InitHsaApiTable(table);
kHsaApiTable = table;
hsa_queue_create_fn = table->core_->hsa_queue_create_fn;
hsa_queue_destroy_fn = table->core_->hsa_queue_destroy_fn;
hsa_signal_store_relaxed_fn = table->core_->hsa_signal_store_relaxed_fn;
hsa_signal_store_screlease_fn = table->core_->hsa_signal_store_screlease_fn;
hsa_queue_load_write_index_relaxed_fn = table->core_->hsa_queue_load_write_index_relaxed_fn;
hsa_queue_store_write_index_relaxed_fn = table->core_->hsa_queue_store_write_index_relaxed_fn;
hsa_queue_load_read_index_relaxed_fn = table->core_->hsa_queue_load_read_index_relaxed_fn;
hsa_queue_load_write_index_scacquire_fn = table->core_->hsa_queue_load_write_index_scacquire_fn;
hsa_queue_store_write_index_screlease_fn = table->core_->hsa_queue_store_write_index_screlease_fn;
hsa_queue_load_read_index_scacquire_fn = table->core_->hsa_queue_load_read_index_scacquire_fn;
hsa_amd_queue_intercept_create_fn = table->amd_ext_->hsa_amd_queue_intercept_create_fn;
hsa_amd_queue_intercept_register_fn = table->amd_ext_->hsa_amd_queue_intercept_register_fn;
}
void RestoreHsaApi() {
::HsaApiTable* table = kHsaApiTable;
table->core_->hsa_queue_create_fn = hsa_queue_create_fn;
table->core_->hsa_queue_destroy_fn = hsa_queue_destroy_fn;
table->core_->hsa_signal_store_relaxed_fn = hsa_signal_store_relaxed_fn;
table->core_->hsa_signal_store_screlease_fn = hsa_signal_store_screlease_fn;
table->core_->hsa_queue_load_write_index_relaxed_fn = hsa_queue_load_write_index_relaxed_fn;
table->core_->hsa_queue_store_write_index_relaxed_fn = hsa_queue_store_write_index_relaxed_fn;
table->core_->hsa_queue_load_read_index_relaxed_fn = hsa_queue_load_read_index_relaxed_fn;
table->core_->hsa_queue_load_write_index_scacquire_fn = hsa_queue_load_write_index_scacquire_fn;
table->core_->hsa_queue_store_write_index_screlease_fn = hsa_queue_store_write_index_screlease_fn;
table->core_->hsa_queue_load_read_index_scacquire_fn = hsa_queue_load_read_index_scacquire_fn;
table->amd_ext_->hsa_amd_queue_intercept_create_fn = hsa_amd_queue_intercept_create_fn;
table->amd_ext_->hsa_amd_queue_intercept_register_fn = hsa_amd_queue_intercept_register_fn;
}
}
namespace roctracer {
decltype(hsa_amd_memory_async_copy)* hsa_amd_memory_async_copy_fn;
decltype(hsa_amd_memory_async_copy_rect)* hsa_amd_memory_async_copy_rect_fn;
TraceBuffer<trace_entry_t> trace_buffer(0x200000);
namespace hsa_support {
// callbacks table
cb_table_t cb_table;
// asyc copy activity callback
bool async_copy_callback_enabled = false;
activity_async_callback_t async_copy_callback_fun = NULL;
void* async_copy_callback_arg = NULL;
bool async_copy_callback_enabled = false;
const char* output_prefix = NULL;
// Table of function pointers to HSA Core Runtime
CoreApiTable CoreApiTable_saved{};
// Table of function pointers to AMD extensions
@@ -288,15 +363,6 @@ class MemoryPool {
pthread_cond_t read_cond_;
};
CONSTRUCTOR_API void constructor() {
util::Logger::Create();
}
DESTRUCTOR_API void destructor() {
::util::HsaRsrcFactory::Destroy();
util::Logger::Destroy();
}
// Correlation id storage
static thread_local activity_correlation_id_t correlation_id_tls = 0;
typedef std::map<activity_correlation_id_t, activity_correlation_id_t> correlation_id_map_t;
@@ -367,18 +433,73 @@ void HCC_AsyncActivityCallback(uint32_t op_id, void* record, void* arg) {
pool->Write(*record_ptr);
}
bool hsa_async_copy_handler(hsa_signal_value_t value, void* arg) {
::proxy::Tracker::entry_t* entry = reinterpret_cast<::proxy::Tracker::entry_t*>(arg);
// Open output file
FILE* open_output_file(const char* prefix, const char* name) {
FILE* file_handle = NULL;
if (prefix != NULL) {
std::ostringstream oss;
oss << prefix << "/" << name;
file_handle = fopen(oss.str().c_str(), "w");
if (file_handle == NULL) {
std::ostringstream errmsg;
errmsg << "ROCTracer: fopen error, file '" << oss.str().c_str() << "'";
perror(errmsg.str().c_str());
abort();
}
} else file_handle = stdout;
return file_handle;
}
FILE* kernel_file_handle = NULL;
void hsa_kernel_handler(::proxy::Tracker::entry_t* entry) {
static uint64_t index = 0;
if (index == 0) {
kernel_file_handle = open_output_file(hsa_support::output_prefix, "results.txt");
}
fprintf(kernel_file_handle, "dispatch[%lu], gpu-id(%u), tid(%u), kernel-name(\"%s\"), time(%lu,%lu,%lu,%lu)\n",
index,
::util::HsaRsrcFactory::Instance().GetAgentInfo(entry->agent)->dev_index,
entry->kernel.tid,
entry->kernel.name,
entry->dispatch,
entry->begin,
entry->end,
entry->complete);
#if 0
fprintf(file_handle, "dispatch[%u], gpu-id(%u), queue-id(%u), queue-index(%lu), tid(%lu), grd(%u), wgr(%u), lds(%u), scr(%u), vgpr(%u), sgpr(%u), fbar(%u), sig(0x%lx), kernel-name(\"%s\")",
index,
HsaRsrcFactory::Instance().GetAgentInfo(entry->agent)->dev_index,
entry->data.queue_id,
entry->data.queue_index,
entry->data.thread_id,
entry->kernel_properties.grid_size,
entry->kernel_properties.workgroup_size,
entry->kernel_properties.lds_size,
entry->kernel_properties.scratch_size,
entry->kernel_properties.vgpr_count,
entry->kernel_properties.sgpr_count,
entry->kernel_properties.fbarrier_count,
entry->kernel_properties.signal.handle,
nik_name.c_str());
if (record) fprintf(file_handle, ", time(%lu,%lu,%lu,%lu)",
record->dispatch,
record->begin,
record->end,
record->complete);
fprintf(file_handle, "\n");
fflush(file_handle);
#endif
index++;
}
void hsa_async_copy_handler(::proxy::Tracker::entry_t* entry) {
activity_record_t record{};
record.domain = ACTIVITY_DOMAIN_HSA_OPS; // activity domain id
record.correlation_id = entry->index; // activity ID
record.begin_ns = entry->record->begin; // host begin timestamp
record.end_ns = entry->record->end; // host end timestamp
record.begin_ns = entry->begin; // host begin timestamp
record.end_ns = entry->end; // host end timestamp
record.device_id = 0; // device id
hsa_support::async_copy_callback_fun(hsa_support::HSA_OP_ID_async_copy, &record, hsa_support::async_copy_callback_arg);
return false;
}
hsa_status_t hsa_amd_memory_async_copy_interceptor(
@@ -388,17 +509,15 @@ hsa_status_t hsa_amd_memory_async_copy_interceptor(
{
hsa_status_t status = HSA_STATUS_SUCCESS;
if (hsa_support::async_copy_callback_enabled) {
::proxy::Tracker* tracker = &::proxy::Tracker::Instance();
::proxy::Tracker::entry_t* tracker_entry = tracker->Alloc(hsa_agent_t{}, completion_signal);
trace_entry_t* entry = trace_buffer.GetEntry();
::proxy::Tracker::Enable(COPY_ENTRY_TYPE, hsa_agent_t{}, completion_signal, entry);
status = hsa_amd_memory_async_copy_fn(dst, dst_agent, src,
src_agent, size, num_dep_signals,
dep_signals, tracker_entry->signal);
if (status == HSA_STATUS_SUCCESS) {
tracker->EnableMemcopy(tracker_entry, hsa_async_copy_handler, reinterpret_cast<void*>(tracker_entry));
} else {
tracker->Delete(tracker_entry);
}
} else {
dep_signals, entry->signal);
if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "hsa_amd_memory_async_copy interceptor");
}
else
{
status = hsa_amd_memory_async_copy_fn(dst, dst_agent, src,
src_agent, size, num_dep_signals,
dep_signals, completion_signal);
@@ -414,18 +533,16 @@ hsa_status_t hsa_amd_memory_async_copy_rect_interceptor(
{
hsa_status_t status = HSA_STATUS_SUCCESS;
if (hsa_support::async_copy_callback_enabled) {
::proxy::Tracker* tracker = &::proxy::Tracker::Instance();
::proxy::Tracker::entry_t* tracker_entry = tracker->Alloc(hsa_agent_t{}, completion_signal);
trace_entry_t* entry = trace_buffer.GetEntry();
::proxy::Tracker::Enable(COPY_ENTRY_TYPE, hsa_agent_t{}, completion_signal, entry);
status = hsa_amd_memory_async_copy_rect_fn(dst, dst_offset, src,
src_offset, range, copy_agent,
dir, num_dep_signals, dep_signals,
tracker_entry->signal);
if (status == HSA_STATUS_SUCCESS) {
tracker->EnableMemcopy(tracker_entry, hsa_async_copy_handler, reinterpret_cast<void*>(tracker_entry));
} else {
tracker->Delete(tracker_entry);
}
} else {
entry->signal);
if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "hsa_amd_memory_async_copy interceptor");
}
else
{
status = hsa_amd_memory_async_copy_rect_fn(dst, dst_offset, src,
src_offset, range, copy_agent,
dir, num_dep_signals, dep_signals,
@@ -443,10 +560,6 @@ memory_pool_mutex_t memory_pool_mutex;
LOADER_INSTANTIATE();
std::atomic<proxy::Tracker*> proxy::Tracker::instance_{};
proxy::Tracker::mutex_t proxy::Tracker::glob_mutex_;
proxy::Tracker::counter_t proxy::Tracker::counter_ = 0;
///////////////////////////////////////////////////////////////////////////////////////////////////
// Public library methods
//
@@ -685,6 +798,7 @@ static void roctracer_enable_activity_impl(
switch (domain) {
case ACTIVITY_DOMAIN_HSA_OPS: {
roctracer::hsa_support::async_copy_callback_enabled = true;
rocprofiler::InterceptQueue::Enable(true);
break;
}
case ACTIVITY_DOMAIN_HSA_API: break;
@@ -747,6 +861,7 @@ static void roctracer_disable_activity_impl(
switch (domain) {
case ACTIVITY_DOMAIN_HSA_OPS: {
roctracer::hsa_support::async_copy_callback_enabled = false;
rocprofiler::InterceptQueue::Enable(false);
break;
}
case ACTIVITY_DOMAIN_HSA_API: break;
@@ -812,8 +927,25 @@ PUBLIC_API roctracer_status_t roctracer_set_properties(
case ACTIVITY_DOMAIN_HSA_OPS: {
// HSA OPS properties
roctracer::hsa_ops_properties_t* ops_properties = reinterpret_cast<roctracer::hsa_ops_properties_t*>(properties);
HsaApiTable* table = reinterpret_cast<HsaApiTable*>(ops_properties->table);
roctracer::hsa_support::async_copy_callback_fun = ops_properties->async_copy_callback_fun;
roctracer::hsa_support::async_copy_callback_arg = ops_properties->async_copy_callback_arg;
roctracer::hsa_support::output_prefix = ops_properties->output_prefix;
// HSA dispatches intercepting
rocprofiler::SaveHsaApi(table);
rocprofiler::ProxyQueue::InitFactory();
rocprofiler::ProxyQueue::HsaIntercept(table);
rocprofiler::InterceptQueue::HsaIntercept(table);
// HSA async-copy tracing
hsa_status_t status = hsa_amd_profiling_async_copy_enable(true);
if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "hsa_amd_profiling_async_copy_enable");
roctracer::hsa_amd_memory_async_copy_fn = table->amd_ext_->hsa_amd_memory_async_copy_fn;
roctracer::hsa_amd_memory_async_copy_rect_fn = table->amd_ext_->hsa_amd_memory_async_copy_rect_fn;
table->amd_ext_->hsa_amd_memory_async_copy_fn = roctracer::hsa_amd_memory_async_copy_interceptor;
table->amd_ext_->hsa_amd_memory_async_copy_rect_fn = roctracer::hsa_amd_memory_async_copy_rect_interceptor;
break;
}
case ACTIVITY_DOMAIN_HSA_API: {
@@ -834,17 +966,39 @@ PUBLIC_API roctracer_status_t roctracer_set_properties(
}
// HSA-runtime tool on-load method
PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t failed_tool_count,
PUBLIC_API bool roctracer_load(HsaApiTable* table, uint64_t runtime_version, uint64_t failed_tool_count,
const char* const* failed_tool_names) {
// enabled HSA async-copy tracing
hsa_status_t status = hsa_amd_profiling_async_copy_enable(true);
if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "hsa_amd_profiling_async_copy_enable");
roctracer::hsa_amd_memory_async_copy_fn = table->amd_ext_->hsa_amd_memory_async_copy_fn;
roctracer::hsa_amd_memory_async_copy_rect_fn = table->amd_ext_->hsa_amd_memory_async_copy_rect_fn;
table->amd_ext_->hsa_amd_memory_async_copy_fn = roctracer::hsa_amd_memory_async_copy_interceptor;
table->amd_ext_->hsa_amd_memory_async_copy_rect_fn = roctracer::hsa_amd_memory_async_copy_rect_interceptor;
static bool is_loaded = false;
if (is_loaded) return true;
is_loaded = true;
return true;
}
PUBLIC_API void roctracer_unload() {
static bool is_unloaded = false;
if (is_unloaded) return;
is_unloaded = true;
roctracer::trace_buffer.Flush(roctracer::COPY_ENTRY_TYPE, roctracer::hsa_async_copy_handler);
roctracer::trace_buffer.Flush(roctracer::KERNEL_ENTRY_TYPE, roctracer::hsa_kernel_handler);
if ((roctracer::hsa_support::output_prefix != NULL) && (roctracer::kernel_file_handle != NULL)) fclose(roctracer::kernel_file_handle);
}
PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t failed_tool_count,
const char* const* failed_tool_names) {
return roctracer_load(table, runtime_version, failed_tool_count, failed_tool_names);
}
PUBLIC_API void OnUnload() { roctracer_unload(); }
CONSTRUCTOR_API void constructor() {
roctracer::util::Logger::Create();
}
DESTRUCTOR_API void destructor() {
roctracer_unload();
util::HsaRsrcFactory::Destroy();
roctracer::util::Logger::Destroy();
}
} // extern "C"
+79
Просмотреть файл
@@ -0,0 +1,79 @@
#ifndef SRC_CORE_TRACE_BUFFER_H_
#define SRC_CORE_TRACE_BUFFER_H_
namespace roctracer {
enum {
TRACE_ENTRY_INV = 0,
TRACE_ENTRY_INIT = 1,
TRACE_ENTRY_COMPL = 2
};
enum {
API_ENTRY_TYPE,
COPY_ENTRY_TYPE,
KERNEL_ENTRY_TYPE
};
struct trace_entry_t {
std::atomic<uint32_t> valid;
uint32_t type;
uint64_t dispatch;
uint64_t begin; // kernel begin timestamp, ns
uint64_t end; // kernel end timestamp, ns
uint64_t complete;
hsa_agent_t agent;
hsa_signal_t orig;
hsa_signal_t signal;
union {
struct {
} copy;
struct {
const char* name;
hsa_agent_t agent;
uint32_t tid;
} kernel;
};
};
template <typename Entry>
class TraceBuffer {
public:
typedef void (*callabck_t)(Entry*);
TraceBuffer(uint32_t size) {
size_ = size;
data_ = new Entry[size_];;
memset(data_, 0, size_ * sizeof(Entry));
read_pointer_ = data_;
}
Entry* GetEntry() {
Entry* ptr = read_pointer_.fetch_add(1);
if (ptr >= (data_ + size_)) {
fprintf(stderr, "GetEntry: trace buffer is out of range\n");
abort();
}
return ptr;
}
void Flush(uint32_t type, callabck_t fun) {
Entry* ptr = data_;
for (; (ptr < read_pointer_) && (ptr < (data_ + size_)); ptr++) {
if (ptr->type == type) {
if (ptr->valid == TRACE_ENTRY_COMPL) {
fun(ptr);
}
}
}
if (ptr >= (data_ + size_)) {
fprintf(stderr, "Flush: trace buffer is out of range\n");
}
}
private:
Entry* data_;
uint32_t size_;
std::atomic<Entry*> read_pointer_;
};
} // namespace roctracer
#endif // SRC_CORE_TRACE_BUFFER_H_
+67
Просмотреть файл
@@ -0,0 +1,67 @@
/******************************************************************************
Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#ifndef _SRC_CORE_HSA_PROXY_QUEUE_H
#define _SRC_CORE_HSA_PROXY_QUEUE_H
#include <hsa.h>
#include <atomic>
#include <map>
#include <mutex>
#include "proxy/proxy_queue.h"
#include "util/exception.h"
namespace rocprofiler {
extern decltype(hsa_queue_destroy)* hsa_queue_destroy_fn;
extern decltype(hsa_amd_queue_intercept_create)* hsa_amd_queue_intercept_create_fn;
extern decltype(hsa_amd_queue_intercept_register)* hsa_amd_queue_intercept_register_fn;
class HsaProxyQueue : public ProxyQueue {
public:
hsa_status_t SetInterceptCB(on_submit_cb_t on_submit_cb, void* data) {
return hsa_amd_queue_intercept_register_fn(queue_, on_submit_cb, data);
}
void Submit(const packet_t* packet) {
EXC_RAISING(HSA_STATUS_ERROR, "HsaProxyQueue::Submit() is not supported");
}
private:
hsa_status_t Init(hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type,
void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data),
void* data, uint32_t private_segment_size, uint32_t group_segment_size,
hsa_queue_t** queue) {
const auto status = hsa_amd_queue_intercept_create_fn(
agent, size, type, callback, data, private_segment_size, group_segment_size, &queue_);
*queue = queue_;
return status;
}
hsa_status_t Cleanup() const { return hsa_queue_destroy_fn(queue_); }
hsa_queue_t* queue_;
};
} // namespace rocprofiler
#endif // _SRC_CORE_HSA_PROXY_QUEUE_H
+46
Просмотреть файл
@@ -0,0 +1,46 @@
/******************************************************************************
Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#ifndef _SRC_CORE_HSA_QUEUE_H
#define _SRC_CORE_HSA_QUEUE_H
#include <atomic>
#include "proxy/queue.h"
namespace rocprofiler {
class HsaQueue : public Queue {
public:
HsaQueue(const util::AgentInfo* agent_info, hsa_queue_t* queue) : queue_(queue) {}
void Submit(const packet_t* packet) {
util::HsaRsrcFactory::Instance().Submit(queue_, packet);
}
private:
hsa_queue_t* queue_;
};
} // namespace rocprofiler
#endif // _SRC_CORE_HSA_QUEUE_H
+42
Просмотреть файл
@@ -0,0 +1,42 @@
/******************************************************************************
Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "proxy/intercept_queue.h"
namespace rocprofiler {
void InterceptQueue::HsaIntercept(HsaApiTable* table) {
table->core_->hsa_queue_create_fn = rocprofiler::InterceptQueue::QueueCreate;
table->core_->hsa_queue_destroy_fn = rocprofiler::InterceptQueue::QueueDestroy;
}
InterceptQueue::mutex_t InterceptQueue::mutex_;
//rocprofiler_callback_t InterceptQueue::dispatch_callback_ = NULL;
//InterceptQueue::queue_callback_t InterceptQueue::create_callback_ = NULL;
//InterceptQueue::queue_callback_t InterceptQueue::destroy_callback_ = NULL;
//void* InterceptQueue::callback_data_ = NULL;
InterceptQueue::obj_map_t* InterceptQueue::obj_map_ = NULL;
const char* InterceptQueue::kernel_none_ = "";
bool InterceptQueue::in_create_call_ = false;
InterceptQueue::queue_id_t InterceptQueue::current_queue_id = 0;
bool InterceptQueue::is_enabled = false;
} // namespace rocprofiler
+301
Просмотреть файл
@@ -0,0 +1,301 @@
/******************************************************************************
Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#ifndef _SRC_CORE_INTERCEPT_QUEUE_H
#define _SRC_CORE_INTERCEPT_QUEUE_H
#include <amd_hsa_kernel_code.h>
#include <cxxabi.h>
#include <dlfcn.h>
#include <sys/syscall.h>
#include <atomic>
#include <iostream>
#include <map>
#include <mutex>
#include "core/trace_buffer.h"
#include "proxy/tracker.h"
#include "proxy/proxy_queue.h"
#include "util/hsa_rsrc_factory.h"
#include "util/exception.h"
namespace roctracer { extern TraceBuffer<trace_entry_t> trace_buffer; }
namespace rocprofiler {
extern decltype(hsa_queue_create)* hsa_queue_create_fn;
extern decltype(hsa_queue_destroy)* hsa_queue_destroy_fn;
class InterceptQueue {
public:
typedef std::recursive_mutex mutex_t;
typedef std::map<uint64_t, InterceptQueue*> obj_map_t;
typedef hsa_status_t (*queue_callback_t)(hsa_queue_t*, void* data);
typedef void (*queue_event_callback_t)(hsa_status_t status, hsa_queue_t *queue, void *arg);
typedef uint32_t queue_id_t;
static void HsaIntercept(HsaApiTable* table);
static hsa_status_t InterceptQueueCreate(hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type,
void (*callback)(hsa_status_t status, hsa_queue_t* source,
void* data),
void* data, uint32_t private_segment_size,
uint32_t group_segment_size, hsa_queue_t** queue,
const bool& tracker_on) {
std::lock_guard<mutex_t> lck(mutex_);
hsa_status_t status = HSA_STATUS_ERROR;
if (in_create_call_) EXC_ABORT(status, "recursive InterceptQueueCreate()");
in_create_call_ = true;
ProxyQueue* proxy = ProxyQueue::Create(agent, size, type, queue_event_callback, data, private_segment_size,
group_segment_size, queue, &status);
if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "ProxyQueue::Create()");
status = util::HsaRsrcFactory::HsaApi()->hsa_amd_profiling_set_profiler_enabled(*queue, true);
if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "hsa_amd_profiling_set_profiler_enabled()");
if (!obj_map_) obj_map_ = new obj_map_t;
InterceptQueue* obj = new InterceptQueue(agent, *queue, proxy);
obj->queue_event_callback_ = callback;
obj->queue_id = current_queue_id;
(*obj_map_)[(uint64_t)(*queue)] = obj;
status = (is_enabled) ? proxy->SetInterceptCB(OnSubmitCB, obj) : proxy->SetInterceptCB(OnSubmitCB_dummy, obj);
#if 0
if (create_callback_ != NULL) {
status = create_callback_(*queue, callback_data_);
}
#endif
++current_queue_id;
in_create_call_ = false;
return status;
}
static hsa_status_t QueueCreate(hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type,
void (*callback)(hsa_status_t status, hsa_queue_t* source,
void* data),
void* data, uint32_t private_segment_size,
uint32_t group_segment_size, hsa_queue_t** queue) {
return InterceptQueueCreate(agent, size, type, callback, data, private_segment_size, group_segment_size, queue, false);
}
static hsa_status_t QueueCreateTracked(hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type,
void (*callback)(hsa_status_t status, hsa_queue_t* source,
void* data),
void* data, uint32_t private_segment_size,
uint32_t group_segment_size, hsa_queue_t** queue) {
return InterceptQueueCreate(agent, size, type, callback, data, private_segment_size, group_segment_size, queue, true);
}
static hsa_status_t QueueDestroy(hsa_queue_t* queue) {
std::lock_guard<mutex_t> lck(mutex_);
hsa_status_t status = HSA_STATUS_SUCCESS;
#if 0
if (destroy_callback_ != NULL) {
status = destroy_callback_(queue, callback_data_);
}
#endif
if (status == HSA_STATUS_SUCCESS) {
status = DelObj(queue);
}
return status;
}
static void OnSubmitCB_dummy(const void* in_packets, uint64_t count, uint64_t user_que_idx, void* data,
hsa_amd_queue_intercept_packet_writer writer) {
const packet_t* packets_arr = reinterpret_cast<const packet_t*>(in_packets);
// Submitting the original packets if profiling was not enabled
if (writer != NULL) {
writer(packets_arr, count);
} else {
InterceptQueue* obj = reinterpret_cast<InterceptQueue*>(data);
Queue* proxy = obj->proxy_;
proxy->Submit(packets_arr, count);
}
}
static void OnSubmitCB(const void* in_packets, uint64_t count, uint64_t user_que_idx, void* data,
hsa_amd_queue_intercept_packet_writer writer) {
const packet_t* packets_arr = reinterpret_cast<const packet_t*>(in_packets);
InterceptQueue* obj = reinterpret_cast<InterceptQueue*>(data);
Queue* proxy = obj->proxy_;
// Travers input packets
for (uint64_t j = 0; j < count; ++j) {
const packet_t* packet = &packets_arr[j];
// Checking for dispatch packet type
if (GetHeaderType(packet) == HSA_PACKET_TYPE_KERNEL_DISPATCH) {
const hsa_kernel_dispatch_packet_t* dispatch_packet =
reinterpret_cast<const hsa_kernel_dispatch_packet_t*>(packet);
// Prepareing dispatch callback data
const hsa_signal_t completion_signal = dispatch_packet->completion_signal;
const amd_kernel_code_t* kernel_code = GetKernelCode(dispatch_packet);
const uint64_t kernel_symbol = kernel_code->runtime_loader_kernel_symbol;
const char* kernel_name = GetKernelName(kernel_symbol);
// Adding kernel timing tracker
::proxy::Tracker::entry_t* entry = roctracer::trace_buffer.GetEntry();
entry->kernel.tid = syscall(__NR_gettid);
entry->kernel.name = kernel_name;
::proxy::Tracker::Enable(roctracer::KERNEL_ENTRY_TYPE, obj->agent_info_->dev_id, completion_signal, entry);
const_cast<hsa_kernel_dispatch_packet_t*>(dispatch_packet)->completion_signal = entry->signal;
}
}
// Submitting the original packets if profiling was not enabled
if (writer != NULL) {
writer(packets_arr, count);
} else {
proxy->Submit(packets_arr, count);
}
}
#if 0
static void SetCallbacks(rocprofiler_callback_t dispatch_callback,
queue_callback_t create_callback,
queue_callback_t destroy_callback,
void* data)
{
std::lock_guard<mutex_t> lck(mutex_);
callback_data_ = data;
dispatch_callback_ = dispatch_callback;
create_callback_ = create_callback;
destroy_callback_ = destroy_callback;
}
#endif
static void Enable(bool val) { is_enabled = val; }
private:
static void queue_event_callback(hsa_status_t status, hsa_queue_t *queue, void *arg) {
if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "queue error handling is not supported");
InterceptQueue* obj = GetObj(queue);
if (obj->queue_event_callback_) obj->queue_event_callback_(status, obj->queue_, arg);
}
static hsa_packet_type_t GetHeaderType(const packet_t* packet) {
const packet_word_t* header = reinterpret_cast<const packet_word_t*>(packet);
return static_cast<hsa_packet_type_t>((*header >> HSA_PACKET_HEADER_TYPE) & header_type_mask);
}
static const amd_kernel_code_t* GetKernelCode(const hsa_kernel_dispatch_packet_t* dispatch_packet) {
const amd_kernel_code_t* kernel_code = NULL;
hsa_status_t status =
util::HsaRsrcFactory::Instance().LoaderApi()->hsa_ven_amd_loader_query_host_address(
reinterpret_cast<const void*>(dispatch_packet->kernel_object),
reinterpret_cast<const void**>(&kernel_code));
if (HSA_STATUS_SUCCESS != status) {
kernel_code = reinterpret_cast<amd_kernel_code_t*>(dispatch_packet->kernel_object);
}
return kernel_code;
}
static const char* GetKernelName(const uint64_t kernel_symbol) {
amd_runtime_loader_debug_info_t* dbg_info =
reinterpret_cast<amd_runtime_loader_debug_info_t*>(kernel_symbol);
const char* kernel_name = (dbg_info != NULL) ? dbg_info->kernel_name : NULL;
return (kernel_name != NULL) ? strdup(kernel_name) : strdup(kernel_none_);
#if 0
// Kernel name is mangled name
// apply __cxa_demangle() to demangle it
const char* funcname = NULL;
if (kernel_name != NULL) {
size_t funcnamesize = 0;
int status;
const char* ret = abi::__cxa_demangle(kernel_name, NULL, &funcnamesize, &status);
funcname = (ret != 0) ? ret : strdup(kernel_name);
}
if (funcname == NULL) funcname = strdup(kernel_none_);
return funcname;
#endif
}
// method to get an intercept queue object
static InterceptQueue* GetObj(const hsa_queue_t* queue) {
std::lock_guard<mutex_t> lck(mutex_);
InterceptQueue* obj = NULL;
obj_map_t::const_iterator it = obj_map_->find((uint64_t)queue);
if (it != obj_map_->end()) {
obj = it->second;
assert(queue == obj->queue_);
}
return obj;
}
// method to delete an intercept queue object
static hsa_status_t DelObj(const hsa_queue_t* queue) {
std::lock_guard<mutex_t> lck(mutex_);
hsa_status_t status = HSA_STATUS_ERROR;
obj_map_t::const_iterator it = obj_map_->find((uint64_t)queue);
if (it != obj_map_->end()) {
const InterceptQueue* obj = it->second;
assert(queue == obj->queue_);
delete obj;
obj_map_->erase(it);
status = HSA_STATUS_SUCCESS;;
}
return status;
}
InterceptQueue(const hsa_agent_t& agent, hsa_queue_t* const queue, ProxyQueue* proxy) :
queue_(queue),
proxy_(proxy)
{
agent_info_ = util::HsaRsrcFactory::Instance().GetAgentInfo(agent);
queue_event_callback_ = NULL;
}
~InterceptQueue() {
ProxyQueue::Destroy(proxy_);
}
static bool is_enabled;
static mutex_t mutex_;
static const packet_word_t header_type_mask = (1ul << HSA_PACKET_HEADER_WIDTH_TYPE) - 1;
#if 0
static queue_callback_t create_callback_;
static queue_callback_t destroy_callback_;
static void* callback_data_;
#endif
static obj_map_t* obj_map_;
static const char* kernel_none_;
static bool in_create_call_;
static queue_id_t current_queue_id;
hsa_queue_t* const queue_;
ProxyQueue* const proxy_;
const util::AgentInfo* agent_info_;
queue_event_callback_t queue_event_callback_;
queue_id_t queue_id;
};
} // namespace rocprofiler
#endif // _SRC_CORE_INTERCEPT_QUEUE_H
+63
Просмотреть файл
@@ -0,0 +1,63 @@
/******************************************************************************
Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "proxy/proxy_queue.h"
#include "proxy/hsa_proxy_queue.h"
#include "proxy/simple_proxy_queue.h"
namespace rocprofiler {
void ProxyQueue::HsaIntercept(HsaApiTable* table) {
if (rocp_type_) SimpleProxyQueue::HsaIntercept(table);
}
ProxyQueue* ProxyQueue::Create(hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type,
void (*callback)(hsa_status_t status, hsa_queue_t* source,
void* data),
void* data, uint32_t private_segment_size,
uint32_t group_segment_size, hsa_queue_t** queue,
hsa_status_t* status) {
hsa_status_t suc = HSA_STATUS_ERROR;
ProxyQueue* instance =
(rocp_type_) ? (ProxyQueue*) new SimpleProxyQueue() : (ProxyQueue*) new HsaProxyQueue();
if (instance != NULL) {
suc = instance->Init(agent, size, type, callback, data, private_segment_size,
group_segment_size, queue);
if (suc != HSA_STATUS_SUCCESS) {
delete instance;
instance = NULL;
}
}
*status = suc;
assert(*status == HSA_STATUS_SUCCESS);
return instance;
}
hsa_status_t ProxyQueue::Destroy(const ProxyQueue* obj) {
assert(obj != NULL);
auto suc = obj->Cleanup();
delete obj;
return suc;
}
bool ProxyQueue::rocp_type_ = false;
} // namespace rocprofiler
+77
Просмотреть файл
@@ -0,0 +1,77 @@
/******************************************************************************
Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#ifndef _SRC_CORE_PROXY_QUEUE_H
#define _SRC_CORE_PROXY_QUEUE_H
#include <hsa.h>
#include <hsa_api_trace.h>
#include <atomic>
#include <map>
#include <mutex>
#include "proxy/queue.h"
#include "proxy/types.h"
struct HsaApiTable;
namespace rocprofiler {
typedef void (*hsa_amd_queue_intercept_packet_writer)(const void* packets, uint64_t count);
typedef void (*on_submit_cb_t)(const void* packet, uint64_t count, uint64_t que_idx, void* data,
hsa_amd_queue_intercept_packet_writer writer);
class ProxyQueue : public Queue {
public:
static void InitFactory() {
const char* type = getenv("ROCP_PROXY_QUEUE");
if (type != NULL) {
if (strncmp(type, "rocp", 4) == 0) rocp_type_ = true;
}
}
static void HsaIntercept(HsaApiTable* table);
static ProxyQueue* Create(hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type,
void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data),
void* data, uint32_t private_segment_size, uint32_t group_segment_size,
hsa_queue_t** queue, hsa_status_t* status);
static hsa_status_t Destroy(const ProxyQueue* obj);
virtual hsa_status_t Init(hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type,
void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data),
void* data, uint32_t private_segment_size, uint32_t group_segment_size,
hsa_queue_t** queue) = 0;
virtual hsa_status_t Cleanup() const = 0;
virtual hsa_status_t SetInterceptCB(on_submit_cb_t on_submit_cb, void* data) = 0;
virtual void Submit(const packet_t* packet) = 0;
protected:
virtual ~ProxyQueue(){};
private:
static bool rocp_type_;
};
} // namespace rocprofiler
#endif // _SRC_CORE_PROXY_QUEUE_H
+42
Просмотреть файл
@@ -0,0 +1,42 @@
/******************************************************************************
Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#ifndef _SRC_CORE_QUEUE_H
#define _SRC_CORE_QUEUE_H
#include "proxy/types.h"
namespace rocprofiler {
class Queue {
public:
Queue() {}
virtual ~Queue() {}
virtual void Submit(const packet_t* packet) = 0;
virtual void Submit(const packet_t* packet, const size_t& count) {
for (const packet_t* p = packet; p < packet + count; ++p) Submit(p);
}
};
} // namespace rocprofiler
#endif // _SRC_CORE_QUEUE_H
+40
Просмотреть файл
@@ -0,0 +1,40 @@
/******************************************************************************
Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#include "proxy/simple_proxy_queue.h"
namespace rocprofiler {
void SimpleProxyQueue::HsaIntercept(HsaApiTable* table) {
table->core_->hsa_signal_store_relaxed_fn = rocprofiler::SimpleProxyQueue::SignalStore;
table->core_->hsa_signal_store_screlease_fn = rocprofiler::SimpleProxyQueue::SignalStore;
table->core_->hsa_queue_load_write_index_relaxed_fn = rocprofiler::SimpleProxyQueue::GetQueueIndex;
table->core_->hsa_queue_store_write_index_relaxed_fn = rocprofiler::SimpleProxyQueue::SetQueueIndex;
table->core_->hsa_queue_load_read_index_relaxed_fn = rocprofiler::SimpleProxyQueue::GetSubmitIndex;
table->core_->hsa_queue_load_write_index_scacquire_fn = rocprofiler::SimpleProxyQueue::GetQueueIndex;
table->core_->hsa_queue_store_write_index_screlease_fn = rocprofiler::SimpleProxyQueue::SetQueueIndex;
table->core_->hsa_queue_load_read_index_scacquire_fn = rocprofiler::SimpleProxyQueue::GetSubmitIndex;
}
SimpleProxyQueue::queue_map_t* SimpleProxyQueue::queue_map_ = NULL;
} // namespace rocprofiler
+261
Просмотреть файл
@@ -0,0 +1,261 @@
/******************************************************************************
Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#ifndef _SRC_CORE_SIMPLE_PROXY_QUEUE_H
#define _SRC_CORE_SIMPLE_PROXY_QUEUE_H
#include <hsa.h>
#include <atomic>
#include <map>
#include <mutex>
#include "proxy/proxy_queue.h"
#include "util/hsa_rsrc_factory.h"
#ifndef ROCP_PROXY_LOCK
# define ROCP_PROXY_LOCK 1
#endif
namespace rocprofiler {
extern decltype(hsa_queue_create)* hsa_queue_create_fn;
extern decltype(hsa_queue_destroy)* hsa_queue_destroy_fn;
extern decltype(hsa_signal_store_relaxed)* hsa_signal_store_relaxed_fn;
extern decltype(hsa_signal_store_relaxed)* hsa_signal_store_screlease_fn;
extern decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed_fn;
extern decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed_fn;
extern decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed_fn;
extern decltype(hsa_queue_load_write_index_scacquire)* hsa_queue_load_write_index_scacquire_fn;
extern decltype(hsa_queue_store_write_index_screlease)* hsa_queue_store_write_index_screlease_fn;
extern decltype(hsa_queue_load_read_index_scacquire)* hsa_queue_load_read_index_scacquire_fn;
typedef decltype(hsa_signal_t::handle) signal_handle_t;
class SimpleProxyQueue : public ProxyQueue {
public:
static void HsaIntercept(HsaApiTable* table);
static void SignalStore(hsa_signal_t signal, hsa_signal_value_t que_idx) {
auto it = queue_map_->find(signal.handle);
if (it != queue_map_->end()) {
SimpleProxyQueue* instance = it->second;
instance->mutex_lock();
const uint64_t begin = instance->submit_index_;
const uint64_t end = que_idx + 1;
instance->submit_index_ = end;
instance->mutex_unlock();
for (uint64_t j = begin; j < end; ++j) {
// Submited packet
const uint32_t idx = j & instance->queue_mask_;
packet_t* packet = reinterpret_cast<packet_t*>(instance->queue_->base_address) + idx;
if (instance->on_submit_cb_ != NULL)
instance->on_submit_cb_(packet, 1, j, instance->on_submit_cb_data_, NULL);
else
instance->Submit(packet);
}
} else {
hsa_signal_store_relaxed_fn(signal, que_idx);
}
}
static uint64_t GetSubmitIndex(const hsa_queue_t* queue) {
uint64_t index = 0;
auto it = queue_map_->find(queue->doorbell_signal.handle);
if (it != queue_map_->end()) {
SimpleProxyQueue* instance = it->second;
index = instance->submit_index_;
} else {
index = hsa_queue_load_read_index_relaxed_fn(queue);
}
return index;
}
static uint64_t GetQueueIndex(const hsa_queue_t* queue) {
uint64_t index = 0;
auto it = queue_map_->find(queue->doorbell_signal.handle);
if (it != queue_map_->end()) {
SimpleProxyQueue* instance = it->second;
instance->mutex_lock();
index = instance->queue_index_;
} else {
index = hsa_queue_load_write_index_relaxed_fn(queue);
}
return index;
}
static void SetQueueIndex(const hsa_queue_t* queue, uint64_t value) {
auto it = queue_map_->find(queue->doorbell_signal.handle);
if (it != queue_map_->end()) {
SimpleProxyQueue* instance = it->second;
instance->queue_index_ = value;
instance->mutex_unlock();
} else {
hsa_queue_store_write_index_relaxed_fn(queue, value);
}
}
hsa_status_t SetInterceptCB(on_submit_cb_t on_submit_cb, void* data) {
on_submit_cb_ = on_submit_cb;
on_submit_cb_data_ = data;
return HSA_STATUS_SUCCESS;
}
void Submit(const packet_t* packet) {
// Compute the write index of queue
const uint64_t que_idx = hsa_queue_load_write_index_relaxed_fn(queue_);
// Waiting untill there is a free space in the queue
while (que_idx >= (hsa_queue_load_read_index_relaxed_fn(queue_) + size_));
// Increment the write index
hsa_queue_store_write_index_relaxed_fn(queue_, que_idx + 1);
const uint32_t mask = queue_->size - 1;
const uint32_t idx = que_idx & mask;
// Copy packet to the queue
const packet_word_t* src = reinterpret_cast<const packet_word_t*>(packet);
packet_word_t* dst = reinterpret_cast<packet_word_t*>(base_address_ + idx);
for (unsigned i = 1; i < sizeof(packet_t) / sizeof(packet_word_t); ++i) {
dst[i] = src[i];
}
// To maintain global order to ensure the prior copy of the packet contents is made visible
// before the header is updated.
// With in-order CP it will wait until the first packet in the blob will be valid.
std::atomic<packet_word_t>* header_atomic_ptr =
reinterpret_cast<std::atomic<packet_word_t>*>(&dst[0]);
header_atomic_ptr->store(src[0], std::memory_order_release);
// Doorbell signaling to submit the packet
hsa_signal_store_relaxed_fn(doorbell_signal_, que_idx);
}
SimpleProxyQueue()
: agent_info_(NULL),
queue_(NULL),
base_address_(NULL),
doorbell_signal_({}),
queue_index_(0),
queue_mask_(0),
submit_index_(0),
on_submit_cb_(NULL),
on_submit_cb_data_(NULL)
{
printf("ROCProfiler: SimpleProxyQueue is enabled\n");
fflush(stdout);
}
~SimpleProxyQueue() {}
private:
typedef std::map<signal_handle_t, SimpleProxyQueue*> queue_map_t;
hsa_status_t Init(hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type,
void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data),
void* data, uint32_t private_segment_size, uint32_t group_segment_size,
hsa_queue_t** queue) {
size_ = size;
auto status = Init(agent, size);
*queue = queue_;
return status;
}
hsa_status_t Init(hsa_agent_t agent, uint32_t size) {
hsa_status_t status = HSA_STATUS_ERROR;
agent_info_ = util::HsaRsrcFactory::Instance().GetAgentInfo(agent);
if (agent_info_ != NULL) {
if (agent_info_->dev_type == HSA_DEVICE_TYPE_GPU) {
status = hsa_queue_create_fn(agent, size, HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT32_MAX,
UINT32_MAX, &queue_);
if (status == HSA_STATUS_SUCCESS) {
base_address_ = reinterpret_cast<packet_t*>(queue_->base_address);
doorbell_signal_ = queue_->doorbell_signal;
data_array_ = calloc(size + 1, sizeof(packet_t));
uintptr_t addr = (uintptr_t)data_array_;
queue_->base_address = (void*)((addr + align_mask_) & ~align_mask_);
status = hsa_signal_create(1, 0, NULL, &(queue_->doorbell_signal));
if (status != HSA_STATUS_SUCCESS) abort();
queue_mask_ = size - 1;
if (queue_map_ == NULL) queue_map_ = new queue_map_t;
(*queue_map_)[queue_->doorbell_signal.handle] = this;
}
else abort();
}
}
if (status != HSA_STATUS_SUCCESS) abort();
return status;
}
hsa_status_t Cleanup() const {
hsa_status_t status = HSA_STATUS_ERROR;
hsa_signal_t queue_signal = queue_->doorbell_signal;
// Destroy original HSA queue
queue_->base_address = base_address_;
queue_->doorbell_signal = doorbell_signal_;
status = hsa_queue_destroy_fn(queue_);
if (status != HSA_STATUS_SUCCESS) abort();
// Destroy overloaded virtual queue data and signal
free(data_array_);
status = hsa_signal_destroy(queue_signal);
if (status != HSA_STATUS_SUCCESS) abort();
return status;
}
void mutex_lock() {
#if ROCP_PROXY_LOCK
mutex_.lock();
#endif
}
void mutex_unlock() {
#if ROCP_PROXY_LOCK
mutex_.unlock();
#endif
}
uint32_t size_;
static queue_map_t* queue_map_;
const util::AgentInfo* agent_info_;
hsa_queue_t* queue_;
static const uintptr_t align_mask_ = sizeof(packet_t) - 1;
packet_t* base_address_;
hsa_signal_t doorbell_signal_;
uint64_t queue_index_;
uint64_t queue_mask_;
uint64_t submit_index_;
std::mutex mutex_;
on_submit_cb_t on_submit_cb_;
void* on_submit_cb_data_;
void* data_array_;
};
} // namespace rocprofiler
#endif // _SRC_CORE_SIMPLE_PROXY_QUEUE_H
+23 -195
Просмотреть файл
@@ -20,8 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#ifndef SRC_CORE_TRACKER_H_
#define SRC_CORE_TRACKER_H_
#ifndef SRC_PROXY_TRACKER_H_
#define SRC_PROXY_TRACKER_H_
#include <amd_hsa_signal.h>
#include <assert.h>
@@ -29,180 +29,58 @@ THE SOFTWARE.
#include <hsa_ext_amd.h>
#include <atomic>
#include <list>
#include <mutex>
#include "util/hsa_rsrc_factory.h"
#include "util/exception.h"
#include "util/logger.h"
#include "core/trace_buffer.h"
namespace proxy {
// Dispatch record
typedef struct {
uint64_t dispatch; // dispatch timestamp, ns
uint64_t begin; // kernel begin timestamp, ns
uint64_t end; // kernel end timestamp, ns
uint64_t complete; // completion signal timestamp, ns
} async_record_t;
class Tracker {
public:
typedef std::mutex mutex_t;
typedef util::HsaRsrcFactory::timestamp_t timestamp_t;
typedef async_record_t record_t;
struct entry_t;
typedef std::list<entry_t*> sig_list_t;
typedef sig_list_t::iterator sig_list_it_t;
typedef uint64_t counter_t;
struct entry_t {
counter_t index;
std::atomic<bool> valid;
Tracker* tracker;
sig_list_t::iterator it;
hsa_agent_t agent;
hsa_signal_t orig;
hsa_signal_t signal;
record_t* record;
std::atomic<void*> handler;
void* arg;
bool is_memcopy;
};
static Tracker* Create() {
std::lock_guard<mutex_t> lck(glob_mutex_);
Tracker* obj = instance_.load(std::memory_order_relaxed);
if (obj == NULL) {
obj = new Tracker;
if (obj == NULL) EXC_ABORT(HSA_STATUS_ERROR, "Tracker creation failed");
instance_.store(obj, std::memory_order_release);
}
return obj;
}
static Tracker& Instance() {
Tracker* obj = instance_.load(std::memory_order_acquire);
if (obj == NULL) obj = Create();
return *obj;
}
static void Destroy() {
std::lock_guard<mutex_t> lck(glob_mutex_);
if (instance_ != NULL) delete instance_;
instance_ = NULL;
}
typedef roctracer::trace_entry_t entry_t;
// Add tracker entry
entry_t* Alloc(const hsa_agent_t& agent, const hsa_signal_t& orig) {
inline static void Enable(uint32_t type, const hsa_agent_t& agent, const hsa_signal_t& signal, entry_t* entry) {
hsa_status_t status = HSA_STATUS_ERROR;
util::HsaRsrcFactory* hsa_rsrc = &(util::HsaRsrcFactory::Instance());
// Creating a new tracker entry
entry_t* entry = new entry_t{};
assert(entry);
entry->tracker = this;
entry->type = type;
entry->agent = agent;
entry->orig = orig;
// Creating a record with the dispatch timestamps
record_t* record = new record_t{};
assert(record);
record->dispatch = hsa_rsrc_->TimestampNs();
entry->record = record;
entry->orig = signal;
entry->dispatch = hsa_rsrc->TimestampNs();
entry->valid.store(roctracer::TRACE_ENTRY_INIT, std::memory_order_release);
// Creating a proxy signal
status = hsa_signal_create(1, 0, NULL, &(entry->signal));
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_signal_create");
status = hsa_amd_signal_async_handler(entry->signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, entry);
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler");
// Adding antry to the list
mutex_.lock();
entry->it = sig_list_.insert(sig_list_.end(), entry);
entry->index = counter_++;
mutex_.unlock();
return entry;
}
// Delete tracker entry
void Delete(entry_t* entry) {
hsa_signal_destroy(entry->signal);
mutex_.lock();
sig_list_.erase(entry->it);
mutex_.unlock();
delete entry;
}
// Enable tracker entry
void Enable(entry_t* entry, void* handler, void* arg) {
// Set entry handler and release the entry
entry->arg = arg;
entry->handler.store(handler, std::memory_order_release);
// Debug trace
if (trace_on_) {
auto outstanding = outstanding_.fetch_add(1);
fprintf(stdout, "Tracker::Add: entry %p, record %p, outst %lu\n", entry, entry->record, outstanding);
fflush(stdout);
}
}
void EnableDispatch(entry_t* entry, hsa_amd_signal_handler handler, void* arg) {
entry->is_memcopy = false;
Enable(entry, reinterpret_cast<void*>(handler), arg);
}
void EnableMemcopy(entry_t* entry, hsa_amd_signal_handler handler, void* arg) {
entry->is_memcopy = true;
Enable(entry, reinterpret_cast<void*>(handler), arg);
}
private:
Tracker() :
outstanding_(0),
hsa_rsrc_(&(util::HsaRsrcFactory::Instance()))
{}
~Tracker() {
auto it = sig_list_.begin();
auto end = sig_list_.end();
while (it != end) {
auto cur = it++;
hsa_rsrc_->SignalWait((*cur)->signal);
Erase(cur);
}
}
// Delete an entry by iterator
void Erase(const sig_list_it_t& it) { Delete(*it); }
// Entry completion
inline void Complete(hsa_signal_value_t signal_value, entry_t* entry) {
record_t* record = entry->record;
// Debug trace
if (trace_on_) {
auto outstanding = outstanding_.fetch_sub(1);
fprintf(stdout, "Tracker::Handler: entry %p, record %p, outst %lu\n", entry, entry->record, outstanding);
fflush(stdout);
}
inline static void Complete(hsa_signal_value_t signal_value, entry_t* entry) {
// Query begin/end and complete timestamps
if (entry->is_memcopy) {
util::HsaRsrcFactory* hsa_rsrc = &(util::HsaRsrcFactory::Instance());
if (entry->type == roctracer::COPY_ENTRY_TYPE) {
hsa_amd_profiling_async_copy_time_t async_copy_time{};
hsa_status_t status = hsa_amd_profiling_get_async_copy_time(entry->signal, &async_copy_time);
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_profiling_get_async_copy_time");
record->begin = hsa_rsrc_->SysclockToNs(async_copy_time.start);
record->end = hsa_rsrc_->SysclockToNs(async_copy_time.end);
entry->begin = hsa_rsrc->SysclockToNs(async_copy_time.start);
entry->end = hsa_rsrc->SysclockToNs(async_copy_time.end);
} else {
hsa_amd_profiling_dispatch_time_t dispatch_time{};
hsa_status_t status = hsa_amd_profiling_get_dispatch_time(entry->agent, entry->signal, &dispatch_time);
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_profiling_get_dispatch_time");
record->begin = hsa_rsrc_->SysclockToNs(dispatch_time.start);
record->end = hsa_rsrc_->SysclockToNs(dispatch_time.end);
entry->begin = hsa_rsrc->SysclockToNs(dispatch_time.start);
entry->end = hsa_rsrc->SysclockToNs(dispatch_time.end);
}
record->complete = hsa_rsrc_->TimestampNs();
entry->valid.store(true, std::memory_order_release);
entry->complete = hsa_rsrc->TimestampNs();
entry->valid.store(roctracer::TRACE_ENTRY_COMPL, std::memory_order_release);
// Original intercepted signal completion
hsa_signal_t orig = entry->orig;
@@ -216,72 +94,22 @@ class Tracker {
if (signal_value != new_value) EXC_ABORT(HSA_STATUS_ERROR, "Tracker::Complete bad signal value");
hsa_signal_store_screlease(orig, signal_value);
}
}
inline static void HandleEntry(hsa_signal_value_t signal_value, entry_t* entry) {
// Call entry handler
void* handler = static_cast<void*>(entry->handler);
reinterpret_cast<hsa_amd_signal_handler>(handler)(signal_value, entry->arg);
// Delete tracker entry
entry->tracker->Delete(entry);
hsa_signal_destroy(entry->signal);
}
// Handler for packet completion
static bool Handler(hsa_signal_value_t signal_value, void* arg) {
// Acquire entry
entry_t* entry = reinterpret_cast<entry_t*>(arg);
volatile std::atomic<void*>* ptr = &entry->handler;
while (ptr->load(std::memory_order_acquire) == NULL) sched_yield();
while (entry->valid.load(std::memory_order_acquire) != roctracer::TRACE_ENTRY_INIT) sched_yield();
// Complete entry
Tracker* tracker = entry->tracker;
tracker->Complete(signal_value, entry);
if (ordering_enabled_ == false) {
HandleEntry(signal_value, entry);
} else {
// Acquire last entry
entry_t* back = tracker->sig_list_.back();
volatile std::atomic<void*>* ptr = &back->handler;
while (ptr->load(std::memory_order_acquire) == NULL) sched_yield();
tracker->handler_mutex_.lock();
sig_list_it_t it = tracker->sig_list_.begin();
sig_list_it_t end = back->it;
while (it != end) {
entry = *(it++);
if (entry->valid.load(std::memory_order_acquire)) {
HandleEntry(signal_value, entry);
} else {
break;
}
}
tracker->handler_mutex_.unlock();
}
Tracker::Complete(signal_value, entry);
return false;
}
// instance
static std::atomic<Tracker*> instance_;
static mutex_t glob_mutex_;
static counter_t counter_;
// Tracked signals list
sig_list_t sig_list_;
// Inter-thread synchronization
mutex_t mutex_;
mutex_t handler_mutex_;
// Outstanding dispatches
std::atomic<uint64_t> outstanding_;
// HSA resources factory
util::HsaRsrcFactory* hsa_rsrc_;
// Handling ordering enabled
static const bool ordering_enabled_ = false;
// Enable tracing
static const bool trace_on_ = false;
};
} // namespace rocprofiler
#endif // SRC_CORE_TRACKER_H_
#endif // SRC_PROXY_TRACKER_H_
+50
Просмотреть файл
@@ -0,0 +1,50 @@
/******************************************************************************
Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/
#ifndef SRC_CORE_TYPES_H_
#define SRC_CORE_TYPES_H_
#include <iostream>
#include <hsa_ven_amd_aqlprofile.h>
namespace rocprofiler {
typedef hsa_ven_amd_aqlprofile_pfn_t pfn_t;
typedef hsa_ven_amd_aqlprofile_event_t event_t;
typedef hsa_ven_amd_aqlprofile_parameter_t parameter_t;
typedef hsa_ven_amd_aqlprofile_profile_t profile_t;
typedef hsa_ext_amd_aql_pm4_packet_t packet_t;
typedef uint32_t packet_word_t;
typedef uint64_t timestamp_t;
inline std::ostream& operator<< (std::ostream& out, const event_t& event) {
out << "[block_name(" << event.block_name << "). block_index(" << event.block_index << "). counter_id(" << event.counter_id << ")]";
return out;
}
inline std::ostream& operator<< (std::ostream& out, const parameter_t& parameter) {
out << "[parameter_name(" << parameter.parameter_name << "). value(" << parameter.value << ")]";
return out;
}
} // namespace rocprofiler
#endif // SRC_CORE_TYPES_H_
+20 -18
Просмотреть файл
@@ -145,7 +145,7 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize
CHECK_STATUS("loader API table query failed", status);
// Instantiate HSA timer
timer_ = new HsaTimer;
timer_ = new HsaTimer(&hsa_api_);
CHECK_STATUS("HSA timer allocation failed",
(timer_ == NULL) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS);
@@ -172,7 +172,6 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) {
hsa_api_.hsa_init = table->core_->hsa_init_fn;
hsa_api_.hsa_shut_down = table->core_->hsa_shut_down_fn;
hsa_api_.hsa_agent_get_info = table->core_->hsa_agent_get_info_fn;
hsa_api_.hsa_iterate_agents = table->core_->hsa_iterate_agents_fn;
hsa_api_.hsa_queue_create = table->core_->hsa_queue_create_fn;
@@ -180,14 +179,13 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) {
hsa_api_.hsa_queue_load_write_index_relaxed = table->core_->hsa_queue_load_write_index_relaxed_fn;
hsa_api_.hsa_queue_store_write_index_relaxed = table->core_->hsa_queue_store_write_index_relaxed_fn;
hsa_api_.hsa_queue_load_read_index_relaxed = table->core_->hsa_queue_load_read_index_relaxed_fn;
hsa_api_.hsa_signal_create = table->core_->hsa_signal_create_fn;
hsa_api_.hsa_signal_destroy = table->core_->hsa_signal_destroy_fn;
hsa_api_.hsa_signal_load_relaxed = table->core_->hsa_signal_load_relaxed_fn;
hsa_api_.hsa_signal_store_relaxed = table->core_->hsa_signal_store_relaxed_fn;
hsa_api_.hsa_signal_store_screlease = table->core_->hsa_signal_store_screlease_fn;
hsa_api_.hsa_signal_wait_scacquire = table->core_->hsa_signal_wait_scacquire_fn;
hsa_api_.hsa_system_get_major_extension_table = table->core_->hsa_system_get_major_extension_table_fn;
hsa_api_.hsa_signal_store_screlease = table->core_->hsa_signal_store_screlease_fn;
hsa_api_.hsa_code_object_reader_create_from_file = table->core_->hsa_code_object_reader_create_from_file_fn;
hsa_api_.hsa_executable_create_alt = table->core_->hsa_executable_create_alt_fn;
@@ -195,21 +193,23 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) {
hsa_api_.hsa_executable_freeze = table->core_->hsa_executable_freeze_fn;
hsa_api_.hsa_executable_get_symbol = table->core_->hsa_executable_get_symbol_fn;
hsa_api_.hsa_system_get_info = table->core_->hsa_system_get_info_fn;
hsa_api_.hsa_system_get_major_extension_table = table->core_->hsa_system_get_major_extension_table_fn;
hsa_api_.hsa_amd_agent_iterate_memory_pools = table->amd_ext_->hsa_amd_agent_iterate_memory_pools_fn;
hsa_api_.hsa_amd_memory_pool_get_info = table->amd_ext_->hsa_amd_memory_pool_get_info_fn;
hsa_api_.hsa_amd_memory_pool_allocate = table->amd_ext_->hsa_amd_memory_pool_allocate_fn;
hsa_api_.hsa_amd_agents_allow_access = table->amd_ext_->hsa_amd_agents_allow_access_fn;
hsa_api_.hsa_amd_memory_async_copy = table->amd_ext_->hsa_amd_memory_async_copy_fn;
hsa_api_.hsa_amd_signal_async_handler = table->amd_ext_->hsa_amd_signal_async_handler_fn;
hsa_api_.hsa_amd_profiling_set_profiler_enabled = table->amd_ext_->hsa_amd_profiling_set_profiler_enabled_fn;
hsa_api_.hsa_amd_profiling_get_async_copy_time = table->amd_ext_->hsa_amd_profiling_get_async_copy_time_fn;
hsa_api_.hsa_amd_profiling_get_dispatch_time = table->amd_ext_->hsa_amd_profiling_get_dispatch_time_fn;
} else {
hsa_api_.hsa_init = hsa_init;
hsa_api_.hsa_shut_down = hsa_shut_down;
hsa_api_.hsa_agent_get_info = hsa_agent_get_info;
hsa_api_.hsa_iterate_agents = hsa_iterate_agents;
hsa_api_.hsa_queue_create = hsa_queue_create;
@@ -217,19 +217,13 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) {
hsa_api_.hsa_queue_load_write_index_relaxed = hsa_queue_load_write_index_relaxed;
hsa_api_.hsa_queue_store_write_index_relaxed = hsa_queue_store_write_index_relaxed;
hsa_api_.hsa_queue_load_read_index_relaxed = hsa_queue_load_read_index_relaxed;
hsa_api_.hsa_signal_create = hsa_signal_create;
hsa_api_.hsa_signal_destroy = hsa_signal_destroy;
hsa_api_.hsa_signal_load_relaxed = hsa_signal_load_relaxed;
hsa_api_.hsa_signal_store_relaxed = hsa_signal_store_relaxed;
hsa_api_.hsa_signal_wait_scacquire = hsa_signal_wait_scacquire;
hsa_api_.hsa_amd_agent_iterate_memory_pools = hsa_amd_agent_iterate_memory_pools;
hsa_api_.hsa_amd_memory_pool_get_info = hsa_amd_memory_pool_get_info;
hsa_api_.hsa_amd_memory_pool_allocate = hsa_amd_memory_pool_allocate;
hsa_api_.hsa_amd_agents_allow_access = hsa_amd_agents_allow_access;
hsa_api_.hsa_amd_memory_async_copy = hsa_amd_memory_async_copy;
hsa_api_.hsa_system_get_major_extension_table = hsa_system_get_major_extension_table;
hsa_api_.hsa_signal_store_screlease = hsa_signal_store_screlease;
hsa_api_.hsa_code_object_reader_create_from_file = hsa_code_object_reader_create_from_file;
hsa_api_.hsa_executable_create_alt = hsa_executable_create_alt;
@@ -237,11 +231,19 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) {
hsa_api_.hsa_executable_freeze = hsa_executable_freeze;
hsa_api_.hsa_executable_get_symbol = hsa_executable_get_symbol;
hsa_api_.hsa_system_get_info = hsa_system_get_info;
hsa_api_.hsa_system_get_major_extension_table = hsa_system_get_major_extension_table;
hsa_api_.hsa_amd_agent_iterate_memory_pools = hsa_amd_agent_iterate_memory_pools;
hsa_api_.hsa_amd_memory_pool_get_info = hsa_amd_memory_pool_get_info;
hsa_api_.hsa_amd_memory_pool_allocate = hsa_amd_memory_pool_allocate;
hsa_api_.hsa_amd_agents_allow_access = hsa_amd_agents_allow_access;
hsa_api_.hsa_amd_memory_async_copy = hsa_amd_memory_async_copy;
hsa_api_.hsa_amd_signal_async_handler = hsa_amd_signal_async_handler;
hsa_api_.hsa_amd_profiling_set_profiler_enabled = hsa_amd_profiling_set_profiler_enabled;
hsa_api_.hsa_amd_profiling_get_async_copy_time = hsa_amd_profiling_get_async_copy_time;
hsa_api_.hsa_amd_profiling_get_dispatch_time = hsa_amd_profiling_get_dispatch_time;
hsa_api_.hsa_signal_load_relaxed = hsa_signal_load_relaxed;
hsa_api_.hsa_signal_store_screlease = hsa_signal_store_screlease;
}
}
}
+36 -32
Просмотреть файл
@@ -71,41 +71,43 @@ static const size_t MEM_PAGE_MASK = MEM_PAGE_BYTES - 1;
typedef decltype(hsa_agent_t::handle) hsa_agent_handle_t;
struct hsa_pfn_t {
decltype(::hsa_init)* hsa_init;
decltype(::hsa_shut_down)* hsa_shut_down;
decltype(::hsa_agent_get_info)* hsa_agent_get_info;
decltype(hsa_init)* hsa_init;
decltype(hsa_shut_down)* hsa_shut_down;
decltype(hsa_agent_get_info)* hsa_agent_get_info;
decltype(hsa_iterate_agents)* hsa_iterate_agents;
decltype(::hsa_iterate_agents)* hsa_iterate_agents;
decltype(hsa_queue_create)* hsa_queue_create;
decltype(hsa_queue_destroy)* hsa_queue_destroy;
decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed;
decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed;
decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed;
decltype(::hsa_queue_create)* hsa_queue_create;
decltype(::hsa_queue_destroy)* hsa_queue_destroy;
decltype(::hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed;
decltype(::hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed;
decltype(::hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed;
decltype(::hsa_signal_create)* hsa_signal_create;
decltype(::hsa_signal_destroy)* hsa_signal_destroy;
decltype(::hsa_signal_store_relaxed)* hsa_signal_store_relaxed;
decltype(::hsa_signal_wait_scacquire)* hsa_signal_wait_scacquire;
decltype(hsa_signal_create)* hsa_signal_create;
decltype(hsa_signal_destroy)* hsa_signal_destroy;
decltype(hsa_signal_load_relaxed)* hsa_signal_load_relaxed;
decltype(hsa_signal_store_relaxed)* hsa_signal_store_relaxed;
decltype(hsa_signal_wait_scacquire)* hsa_signal_wait_scacquire;
decltype(hsa_signal_store_screlease)* hsa_signal_store_screlease;
decltype(::hsa_amd_agent_iterate_memory_pools)* hsa_amd_agent_iterate_memory_pools;
decltype(::hsa_amd_memory_pool_get_info)* hsa_amd_memory_pool_get_info;
decltype(::hsa_amd_memory_pool_allocate)* hsa_amd_memory_pool_allocate;
decltype(::hsa_amd_agents_allow_access)* hsa_amd_agents_allow_access;
decltype(::hsa_amd_memory_async_copy)* hsa_amd_memory_async_copy;
decltype(hsa_code_object_reader_create_from_file)* hsa_code_object_reader_create_from_file;
decltype(hsa_executable_create_alt)* hsa_executable_create_alt;
decltype(hsa_executable_load_agent_code_object)* hsa_executable_load_agent_code_object;
decltype(hsa_executable_freeze)* hsa_executable_freeze;
decltype(hsa_executable_get_symbol)* hsa_executable_get_symbol;
decltype(::hsa_system_get_major_extension_table)* hsa_system_get_major_extension_table;
decltype(hsa_system_get_info)* hsa_system_get_info;
decltype(hsa_system_get_major_extension_table)* hsa_system_get_major_extension_table;
decltype(::hsa_code_object_reader_create_from_file)* hsa_code_object_reader_create_from_file;
decltype(::hsa_executable_create_alt)* hsa_executable_create_alt;
decltype(::hsa_executable_load_agent_code_object)* hsa_executable_load_agent_code_object;
decltype(::hsa_executable_freeze)* hsa_executable_freeze;
decltype(::hsa_executable_get_symbol)* hsa_executable_get_symbol;
decltype(hsa_amd_agent_iterate_memory_pools)* hsa_amd_agent_iterate_memory_pools;
decltype(hsa_amd_memory_pool_get_info)* hsa_amd_memory_pool_get_info;
decltype(hsa_amd_memory_pool_allocate)* hsa_amd_memory_pool_allocate;
decltype(hsa_amd_agents_allow_access)* hsa_amd_agents_allow_access;
decltype(hsa_amd_memory_async_copy)* hsa_amd_memory_async_copy;
decltype(::hsa_amd_signal_async_handler)* hsa_amd_signal_async_handler;
decltype(::hsa_amd_profiling_get_async_copy_time)* hsa_amd_profiling_get_async_copy_time;
decltype(::hsa_amd_profiling_get_dispatch_time)* hsa_amd_profiling_get_dispatch_time;
decltype(::hsa_signal_load_relaxed)* hsa_signal_load_relaxed;
decltype(::hsa_signal_store_screlease)* hsa_signal_store_screlease;
decltype(hsa_amd_signal_async_handler)* hsa_amd_signal_async_handler;
decltype(hsa_amd_profiling_set_profiler_enabled)* hsa_amd_profiling_set_profiler_enabled;
decltype(hsa_amd_profiling_get_async_copy_time)* hsa_amd_profiling_get_async_copy_time;
decltype(hsa_amd_profiling_get_dispatch_time)* hsa_amd_profiling_get_dispatch_time;
};
// Encapsulates information about a Hsa Agent such as its
@@ -167,9 +169,9 @@ class HsaTimer {
static const timestamp_t TIMESTAMP_MAX = UINT64_MAX;
typedef long double freq_t;
HsaTimer() {
HsaTimer(const hsa_pfn_t* hsa_api) : hsa_api_(hsa_api) {
timestamp_t sysclock_hz = 0;
hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz);
hsa_status_t status = hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz);
CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY)", status);
sysclock_factor_ = (freq_t)1000000000 / (freq_t)sysclock_hz;
}
@@ -185,7 +187,7 @@ class HsaTimer {
// Return timestamp in 'ns'
timestamp_t timestamp_ns() const {
timestamp_t sysclock;
hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &sysclock);
hsa_status_t status = hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &sysclock);
CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP)", status);
return sysclock_to_ns(sysclock);
}
@@ -193,6 +195,8 @@ class HsaTimer {
private:
// Timestamp frequency factor
freq_t sysclock_factor_;
// HSA API table
const hsa_pfn_t* const hsa_api_;
};
class HsaRsrcFactory {
+53 -12
Просмотреть файл
@@ -32,6 +32,7 @@ THE SOFTWARE.
#include <inc/roctracer_hcc.h>
#include <inc/ext/hsa_rt_utils.hpp>
#include <src/core/loader.h>
#include <src/core/trace_buffer.h>
#include <util/xml.h>
#define PUBLIC_API __attribute__((visibility("default")))
@@ -78,6 +79,19 @@ void fatal(const std::string msg) {
abort();
}
struct api_trace_entry_t {
uint32_t valid;
uint32_t type;
uint32_t cid;
timestamp_t begin;
timestamp_t end;
uint32_t pid;
uint32_t tid;
hsa_api_data_t data;
};
roctracer::TraceBuffer<api_trace_entry_t> api_trace_buffer(0x200000);
// HSA API callback function
void hsa_api_callback(
uint32_t domain,
@@ -87,23 +101,35 @@ void hsa_api_callback(
{
(void)arg;
const hsa_api_data_t* data = reinterpret_cast<const hsa_api_data_t*>(callback_data);
if (data->phase == ACTIVITY_API_PHASE_ENTER) {
hsa_begin_timestamp = timer->timestamp_fn_ns();
} else {
const timestamp_t end_timestamp = (cid == HSA_API_ID_hsa_shut_down) ? hsa_begin_timestamp : timer->timestamp_fn_ns();
std::ostringstream os;
os << hsa_begin_timestamp << ":" << end_timestamp << " " << GetPid() << ":" << GetTid() << " " << hsa_api_data_pair_t(cid, *data);
fprintf(hsa_api_file_handle, "%s\n", os.str().c_str());
api_trace_entry_t* entry = api_trace_buffer.GetEntry();
entry->valid = roctracer::TRACE_ENTRY_COMPL;
entry->cid = cid;
entry->begin = hsa_begin_timestamp;
entry->end = end_timestamp;
entry->pid = GetPid();
entry->tid = GetTid();
entry->data = *data;
}
}
void hsa_api_flush_cb(api_trace_entry_t* entry) {
std::ostringstream os;
os << entry->begin << ":" << entry->end << " " << entry->pid << ":" << entry->tid << " " << hsa_api_data_pair_t(entry->cid, entry->data);
fprintf(hsa_api_file_handle, "%s\n", os.str().c_str());
}
void hsa_activity_callback(
uint32_t op,
activity_record_t* record,
void* arg)
{
fprintf(hsa_async_copy_file_handle, "%lu:%lu async-copy%lu\n", record->begin_ns, record->end_ns, record->correlation_id);
static uint64_t index = 0;
fprintf(hsa_async_copy_file_handle, "%lu:%lu async-copy%lu\n", record->begin_ns, record->end_ns, index);
index++;
}
void hip_api_callback(
@@ -164,14 +190,14 @@ void hcc_activity_callback(const char* begin, const char* end, void* arg) {
const roctracer_record_t* end_record = reinterpret_cast<const roctracer_record_t*>(end);
while (record < end_record) {
const char * name = roctracer_op_string(record->domain, record->op, record->kind);
if (record->domain == ACTIVITY_DOMAIN_HCC_OPS) {
const char * name = roctracer_op_string(record->domain, record->op, record->kind);
fprintf(hcc_activity_file_handle, "%lu:%lu %d:%lu %s:%lu\n",
record->begin_ns, record->end_ns, record->device_id, record->queue_id, name, record->correlation_id);
} else {
#if 0
fprintf(stderr, "Bad domain %d\n", record->domain);
abort();
fprintf(hip_api_file_handle, "%lu:%lu %u:%u %s()\n",
record->begin_ns, record->end_ns, record->process_id, record->thread_id, name);
#endif
}
ROCTRACER_CALL(roctracer_next_record(record, &record));
@@ -312,8 +338,10 @@ extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version,
// initialize HSA tracing
roctracer_set_properties(ACTIVITY_DOMAIN_HSA_API, (void*)table);
roctracer::hsa_ops_properties_t ops_properties{
table,
reinterpret_cast<activity_async_callback_t>(hsa_activity_callback),
NULL};
NULL,
output_prefix};
roctracer_set_properties(ACTIVITY_DOMAIN_HSA_OPS, &ops_properties);
fprintf(stdout, " HSA-trace("); fflush(stdout);
@@ -340,23 +368,32 @@ extern "C" PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version,
fprintf(stdout, " HIP-trace()\n"); fflush(stdout);
// Allocating tracing pool
roctracer_properties_t properties{};
properties.buffer_size = 0x1000;
properties.buffer_size = 0x80000;
properties.buffer_callback_fun = hcc_activity_callback;
ROCTRACER_CALL(roctracer_open_pool(&properties));
ROCTRACER_CALL(roctracer_enable_domain_activity(ACTIVITY_DOMAIN_HCC_OPS));
ROCTRACER_CALL(roctracer_enable_domain_activity(ACTIVITY_DOMAIN_HIP_API));
//ROCTRACER_CALL(roctracer_enable_domain_activity(ACTIVITY_DOMAIN_HIP_API));
ROCTRACER_CALL(roctracer_enable_domain_callback(ACTIVITY_DOMAIN_HIP_API, hip_api_callback, NULL));
}
return true;
return roctracer_load(table, runtime_version, failed_tool_count, failed_tool_names);
}
// HSA-runtime tool on-unload method
extern "C" PUBLIC_API void OnUnload() {
static bool is_unloaded = false;
if (is_unloaded) {
return;
}
is_unloaded = true;
roctracer_unload();
if (trace_hsa) {
ROCTRACER_CALL(roctracer_disable_domain_callback(ACTIVITY_DOMAIN_HSA_API));
ROCTRACER_CALL(roctracer_disable_domain_activity(ACTIVITY_DOMAIN_HSA_OPS));
api_trace_buffer.Flush(0, hsa_api_flush_cb);
fclose(hsa_api_file_handle);
fclose(hsa_async_copy_file_handle);
}
@@ -364,9 +401,13 @@ extern "C" PUBLIC_API void OnUnload() {
ROCTRACER_CALL(roctracer_disable_domain_callback(ACTIVITY_DOMAIN_HIP_API));
ROCTRACER_CALL(roctracer_disable_domain_activity(ACTIVITY_DOMAIN_HIP_API));
ROCTRACER_CALL(roctracer_disable_domain_activity(ACTIVITY_DOMAIN_HCC_OPS));
ROCTRACER_CALL(roctracer_flush_activity());
ROCTRACER_CALL(roctracer_close_pool());
fclose(hip_api_file_handle);
fclose(hcc_activity_file_handle);
}
}
extern "C" CONSTRUCTOR_API void constructor() {}
extern "C" DESTRUCTOR_API void destructor() { OnUnload(); }