counters dumping optimization
Change-Id: I8c694e5380e15179453148dd9ab3a3e51b6db861
[ROCm/rocprofiler commit: 2a7f77b290]
Этот коммит содержится в:
@@ -73,6 +73,7 @@ typedef struct {
|
||||
uint32_t timestamp_on;
|
||||
uint32_t hsa_intercepting;
|
||||
uint32_t k_concurrent;
|
||||
uint32_t opt_mode;
|
||||
} rocprofiler_settings_t;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@@ -183,7 +183,7 @@ class Context {
|
||||
|
||||
uint32_t GetGroupCount() const { return set_.size(); }
|
||||
|
||||
inline rocprofiler_group_t GetGroupInfo(Group* g) {
|
||||
inline rocprofiler_group_t GetGroupDescr(Group* g) {
|
||||
rocprofiler::info_vector_t& info_vector = g->GetInfoVector();
|
||||
rocprofiler_group_t group = {};
|
||||
group.index = g->GetIndex();
|
||||
@@ -192,12 +192,12 @@ class Context {
|
||||
group.feature_count = info_vector.size();
|
||||
return group;
|
||||
}
|
||||
inline rocprofiler_group_t GetGroupInfo(const uint32_t& index) {
|
||||
inline rocprofiler_group_t GetGroupDescr(const uint32_t& index) {
|
||||
rocprofiler_group_t group = {};
|
||||
if (set_.empty()) {
|
||||
group.context = reinterpret_cast<rocprofiler_t*>(this);
|
||||
} else {
|
||||
group = GetGroupInfo(&set_[index]);
|
||||
group = GetGroupDescr(&set_[index]);
|
||||
}
|
||||
return group;
|
||||
}
|
||||
@@ -288,8 +288,8 @@ class Context {
|
||||
Context* context = group->GetContext();
|
||||
auto r = group->FetchDecrRefsCount();
|
||||
if (r == 1) {
|
||||
const rocprofiler_group_t group_info = context->GetGroupInfo(group);
|
||||
context->handler_(group_info, context->handler_arg_);
|
||||
const rocprofiler_group_t group_descr = context->GetGroupDescr(group);
|
||||
context->handler_(group_descr, context->handler_arg_);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
@@ -298,6 +298,22 @@ class Context {
|
||||
Group* GetGroup(const uint32_t& index) { return &set_[index]; }
|
||||
rocprofiler_handler_t GetHandler(void** arg) const { *arg = handler_arg_; return handler_; }
|
||||
|
||||
void SetDispatchSignal(const hsa_signal_t &signal) {
|
||||
dispatch_signal_ = signal;
|
||||
}
|
||||
hsa_signal_t& GetDispatchSignal() {
|
||||
return dispatch_signal_;
|
||||
}
|
||||
void SetOrigSignal(const hsa_signal_t &signal) {
|
||||
orig_signal_ = signal;
|
||||
}
|
||||
const hsa_signal_t& GetOrigSignal() const {
|
||||
return orig_signal_;
|
||||
}
|
||||
rocprofiler_dispatch_record_t* GetRecord() {
|
||||
return &record_;
|
||||
}
|
||||
|
||||
private:
|
||||
Context(const util::AgentInfo* agent_info, Queue* queue, rocprofiler_feature_t* info,
|
||||
const uint32_t info_count, rocprofiler_handler_t handler, void* handler_arg)
|
||||
@@ -309,7 +325,10 @@ class Context {
|
||||
metrics_(NULL),
|
||||
handler_(handler),
|
||||
handler_arg_(handler_arg),
|
||||
pcsmp_mode_(false)
|
||||
pcsmp_mode_(false),
|
||||
dispatch_signal_{},
|
||||
orig_signal_{},
|
||||
record_{}
|
||||
{}
|
||||
|
||||
~Context() { Destruct(); }
|
||||
@@ -355,6 +374,9 @@ class Context {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
hsa_status_t status = hsa_signal_create(1, 0, NULL, &dispatch_signal_);
|
||||
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "MetricsDict create failed");
|
||||
}
|
||||
|
||||
// Initialize rocprofiler context
|
||||
@@ -593,6 +615,11 @@ class Context {
|
||||
|
||||
// PC sampling mode
|
||||
bool pcsmp_mode_;
|
||||
|
||||
// kernel packet dispatch completion signal
|
||||
hsa_signal_t dispatch_signal_;
|
||||
hsa_signal_t orig_signal_;
|
||||
rocprofiler_dispatch_record_t record_;
|
||||
};
|
||||
|
||||
} // namespace rocprofiler
|
||||
|
||||
@@ -43,5 +43,6 @@ rocprofiler_hsa_callback_fun_t InterceptQueue::submit_callback_fun_ = NULL;
|
||||
void* InterceptQueue::submit_callback_arg_ = NULL;
|
||||
|
||||
bool InterceptQueue::k_concurrent_ = false;
|
||||
bool InterceptQueue::opt_mode_ = false;
|
||||
|
||||
} // namespace rocprofiler
|
||||
|
||||
@@ -92,6 +92,8 @@ class InterceptQueue {
|
||||
(*obj_map_)[(uint64_t)(*queue)] = obj;
|
||||
if (k_concurrent_) {
|
||||
status = proxy->SetInterceptCB(OnSubmitCB_SQTT, obj);
|
||||
} else if (opt_mode_) {
|
||||
status = proxy->SetInterceptCB(OnSubmitCB_opt, obj);
|
||||
} else {
|
||||
status = proxy->SetInterceptCB(OnSubmitCB, obj);
|
||||
}
|
||||
@@ -138,6 +140,85 @@ class InterceptQueue {
|
||||
return status;
|
||||
}
|
||||
|
||||
static void OnSubmitCB_opt(const void* in_packets, uint64_t count, uint64_t user_que_idx, void* data,
|
||||
hsa_amd_queue_intercept_packet_writer writer) {
|
||||
const packet_t* packets_arr = reinterpret_cast<const packet_t*>(in_packets);
|
||||
InterceptQueue* obj = reinterpret_cast<InterceptQueue*>(data);
|
||||
Queue* proxy = obj->proxy_;
|
||||
|
||||
// Travers input packets
|
||||
for (uint64_t j = 0; j < count; ++j) {
|
||||
const packet_t* packet = &packets_arr[j];
|
||||
bool to_submit = true;
|
||||
|
||||
// Checking for dispatch packet type
|
||||
if ((GetHeaderType(packet) == HSA_PACKET_TYPE_KERNEL_DISPATCH) &&
|
||||
(dispatch_callback_.load(std::memory_order_acquire) != NULL)) {
|
||||
const hsa_kernel_dispatch_packet_t* dispatch_packet =
|
||||
reinterpret_cast<const hsa_kernel_dispatch_packet_t*>(packet);
|
||||
const hsa_signal_t completion_signal = dispatch_packet->completion_signal;
|
||||
#if 0
|
||||
// Prepareing dispatch callback data
|
||||
uint64_t kernel_object = dispatch_packet->kernel_object;
|
||||
const amd_kernel_code_t* kernel_code = GetKernelCode(kernel_object);
|
||||
const char* kernel_name = QueryKernelName(kernel_object, kernel_code);
|
||||
#endif
|
||||
rocprofiler_callback_data_t data = {obj->agent_info_->dev_id,
|
||||
obj->agent_info_->dev_index,
|
||||
obj->queue_,
|
||||
user_que_idx,
|
||||
obj->queue_id,
|
||||
completion_signal,
|
||||
dispatch_packet,
|
||||
NULL, // kernel_name
|
||||
0, // kernel_object
|
||||
NULL, // kernel_code
|
||||
0, // (uint32_t)syscall(__NR_gettid),
|
||||
NULL};
|
||||
|
||||
// Calling dispatch callback
|
||||
rocprofiler_group_t group = {};
|
||||
hsa_status_t status = (dispatch_callback_.load())(&data, callback_data_, &group);
|
||||
#if 0
|
||||
free(const_cast<char*>(kernel_name));
|
||||
#endif
|
||||
Context* context = reinterpret_cast<Context*>(group.context);
|
||||
// Injecting profiling start/stop packets
|
||||
if ((status == HSA_STATUS_SUCCESS) && (context != NULL)) {
|
||||
if (group.feature_count != 0) {
|
||||
if (tracker_ != NULL) {
|
||||
const_cast<hsa_kernel_dispatch_packet_t*>(dispatch_packet)->completion_signal = context->GetDispatchSignal();
|
||||
Group* context_group = context->GetGroup(group.index);
|
||||
Tracker::Enable_opt(context_group, completion_signal);
|
||||
context_group->IncrRefsCount();
|
||||
}
|
||||
|
||||
const pkt_vector_t& start_vector = context->StartPackets(group.index);
|
||||
const pkt_vector_t& stop_vector = context->StopPackets(group.index);
|
||||
pkt_vector_t packets = start_vector;
|
||||
packets.insert(packets.end(), *packet);
|
||||
packets.insert(packets.end(), stop_vector.begin(), stop_vector.end());
|
||||
if (writer != NULL) {
|
||||
writer(&packets[0], packets.size());
|
||||
} else {
|
||||
proxy->Submit(&packets[0], packets.size());
|
||||
}
|
||||
to_submit = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Submitting the original packets if profiling was not enabled
|
||||
if (to_submit) {
|
||||
if (writer != NULL) {
|
||||
writer(packet, 1);
|
||||
} else {
|
||||
proxy->Submit(packet, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void OnSubmitCB(const void* in_packets, uint64_t count, uint64_t user_que_idx, void* data,
|
||||
hsa_amd_queue_intercept_packet_writer writer) {
|
||||
const packet_t* packets_arr = reinterpret_cast<const packet_t*>(in_packets);
|
||||
@@ -400,6 +481,7 @@ class InterceptQueue {
|
||||
static bool IsTrackerOn() { return tracker_on_; }
|
||||
|
||||
static bool k_concurrent_;
|
||||
static bool opt_mode_;
|
||||
|
||||
private:
|
||||
static void queue_event_callback(hsa_status_t status, hsa_queue_t *queue, void *arg) {
|
||||
|
||||
@@ -217,6 +217,7 @@ uint32_t LoadTool() {
|
||||
if (settings.memcopy_tracking) intercept_mode |= MEMCOPY_INTERCEPT_MODE;
|
||||
if (settings.hsa_intercepting) intercept_mode |= HSA_INTERCEPT_MODE;
|
||||
if (settings.k_concurrent) InterceptQueue::k_concurrent_ = true;
|
||||
if (settings.opt_mode) InterceptQueue::opt_mode_ = true;
|
||||
}
|
||||
|
||||
ONLOAD_TRACE("end intercept_mode(" << intercept_mode << ")");
|
||||
@@ -537,8 +538,7 @@ PUBLIC_API hsa_status_t rocprofiler_open(hsa_agent_t agent, rocprofiler_feature_
|
||||
if (mode != 0) {
|
||||
if (mode & ROCPROFILER_MODE_STANDALONE) {
|
||||
if (mode & ROCPROFILER_MODE_CREATEQUEUE) {
|
||||
if (hsa_rsrc->CreateQueue(agent_info, properties->queue_depth, &(properties->queue)) ==
|
||||
false) {
|
||||
if (hsa_rsrc->CreateQueue(agent_info, properties->queue_depth, &(properties->queue)) == false) {
|
||||
EXC_RAISING(HSA_STATUS_ERROR, "CreateQueue() failed");
|
||||
}
|
||||
}
|
||||
@@ -592,7 +592,7 @@ PUBLIC_API hsa_status_t rocprofiler_get_group(rocprofiler_t* handle, uint32_t gr
|
||||
rocprofiler_group_t* group) {
|
||||
API_METHOD_PREFIX
|
||||
rocprofiler::Context* context = reinterpret_cast<rocprofiler::Context*>(handle);
|
||||
*group = context->GetGroupInfo(group_index);
|
||||
*group = context->GetGroupDescr(group_index);
|
||||
API_METHOD_SUFFIX
|
||||
}
|
||||
|
||||
|
||||
@@ -155,6 +155,49 @@ class Tracker {
|
||||
Enable(entry, reinterpret_cast<void*>(handler), arg);
|
||||
}
|
||||
|
||||
// Enable tracking
|
||||
static void Enable_opt(Group* group, const hsa_signal_t& orig_signal) {
|
||||
Context* context = group->GetContext();
|
||||
context->SetOrigSignal(orig_signal);
|
||||
context->GetRecord()->dispatch = util::HsaRsrcFactory::Instance().TimestampNs();
|
||||
|
||||
// Creating a proxy signal
|
||||
const hsa_signal_value_t signal_value = (orig_signal.handle) ?
|
||||
util::HsaRsrcFactory::Instance().HsaApi()->hsa_signal_load_relaxed(orig_signal) : 1;
|
||||
hsa_signal_t& dispatch_signal = context->GetDispatchSignal();
|
||||
util::HsaRsrcFactory::Instance().HsaApi()->hsa_signal_store_screlease(dispatch_signal, signal_value);
|
||||
hsa_status_t status =
|
||||
util::HsaRsrcFactory::Instance().HsaApi()->hsa_amd_signal_async_handler(dispatch_signal, HSA_SIGNAL_CONDITION_LT, signal_value, Handler, group);
|
||||
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler");
|
||||
}
|
||||
|
||||
// Tracker handler
|
||||
static bool Handler_opt(hsa_signal_value_t signal_value, void* arg) {
|
||||
Group* group = reinterpret_cast<Group*>(arg);
|
||||
Context* context = group->GetContext();
|
||||
hsa_signal_t dispatch_signal = context->GetDispatchSignal();
|
||||
record_t* record = context->GetRecord();
|
||||
hsa_amd_profiling_dispatch_time_t dispatch_time{};
|
||||
hsa_status_t status =
|
||||
util::HsaRsrcFactory::Instance().HsaApi()->hsa_amd_profiling_get_dispatch_time(context->GetAgent(), dispatch_signal, &dispatch_time);
|
||||
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_profiling_get_dispatch_time");
|
||||
record->begin = util::HsaRsrcFactory::Instance().SysclockToNs(dispatch_time.start);
|
||||
record->end = util::HsaRsrcFactory::Instance().SysclockToNs(dispatch_time.end);
|
||||
record->complete = util::HsaRsrcFactory::Instance().TimestampNs();
|
||||
|
||||
// Original intercepted signal completion
|
||||
const hsa_signal_t& orig_signal = context->GetOrigSignal();
|
||||
if (orig_signal.handle) {
|
||||
amd_signal_t* orig_signal_ptr = reinterpret_cast<amd_signal_t*>(orig_signal.handle);
|
||||
amd_signal_t* prof_signal_ptr = reinterpret_cast<amd_signal_t*>(dispatch_signal.handle);
|
||||
orig_signal_ptr->start_ts = prof_signal_ptr->start_ts;
|
||||
orig_signal_ptr->end_ts = prof_signal_ptr->end_ts;
|
||||
util::HsaRsrcFactory::Instance().HsaApi()->hsa_signal_store_screlease(orig_signal, signal_value);
|
||||
}
|
||||
|
||||
Context::Handler(signal_value, arg);
|
||||
}
|
||||
|
||||
private:
|
||||
Tracker() :
|
||||
outstanding_(0),
|
||||
|
||||
@@ -86,7 +86,7 @@ export HSA_TOOLS_LIB=librocprofiler64.so.1
|
||||
# enable intercepting mode in rocprofiler
|
||||
export ROCP_HSA_INTERCEPT=2
|
||||
# test macro for kernel iterations number
|
||||
export ROCP_KITER=100
|
||||
export ROCP_KITER=20
|
||||
# test macro for per-kernel dispatching number
|
||||
export ROCP_DITER=10
|
||||
eval_test "Standalone intercepting test" ./test/stand_intercept_test
|
||||
@@ -95,8 +95,8 @@ unset ROCP_HSA_INTERCEPT
|
||||
## Intercepting usage model test
|
||||
# tool library loaded by ROC profiler
|
||||
export ROCP_TOOL_LIB=./test/libintercept_test.so
|
||||
export ROCP_KITER=50
|
||||
export ROCP_DITER=50
|
||||
export ROCP_KITER=20
|
||||
export ROCP_DITER=20
|
||||
export ROCP_AGENTS=1
|
||||
export ROCP_THRS=3
|
||||
eval_test "Intercepting usage model test" ./test/ctrl
|
||||
@@ -114,19 +114,40 @@ if [ ! -e $ROCP_TOOL_LIB ] ; then
|
||||
export ROCP_TOOL_LIB=test/libtool.so
|
||||
fi
|
||||
|
||||
export ROCP_KITER=50
|
||||
export ROCP_DITER=50
|
||||
export ROCP_KITER=20
|
||||
export ROCP_DITER=20
|
||||
export ROCP_AGENTS=1
|
||||
export ROCP_THRS=1
|
||||
export ROCP_INPUT=input.xml
|
||||
eval_test "'rocprof' libtool test" ./test/ctrl
|
||||
export ROCP_INPUT=pmc_input.xml
|
||||
eval_test "'rocprof' libtool PMC test" ./test/ctrl
|
||||
|
||||
export ROCP_KITER=10
|
||||
export ROCP_DITER=10
|
||||
export ROCP_KITER=20
|
||||
export ROCP_DITER=20
|
||||
export ROCP_AGENTS=1
|
||||
export ROCP_THRS=10
|
||||
export ROCP_INPUT=input1.xml
|
||||
eval_test "'rocprof' libtool test n-threads" ./test/ctrl
|
||||
export ROCP_INPUT=pmc_input.xml
|
||||
eval_test "'rocprof' libtool PMC n-thread test" ./test/ctrl
|
||||
|
||||
export ROCP_KITER=20
|
||||
export ROCP_DITER=20
|
||||
export ROCP_AGENTS=1
|
||||
export ROCP_THRS=1
|
||||
export ROCP_INPUT=pmc_input1.xml
|
||||
eval_test "'rocprof' libtool PMC test1" ./test/ctrl
|
||||
|
||||
export ROCP_KITER=20
|
||||
export ROCP_DITER=20
|
||||
export ROCP_AGENTS=1
|
||||
export ROCP_THRS=10
|
||||
export ROCP_INPUT=pmc_input1.xml
|
||||
eval_test "'rocprof' libtool PMC n-thread test1" ./test/ctrl
|
||||
|
||||
export ROCP_KITER=20
|
||||
export ROCP_DITER=20
|
||||
export ROCP_AGENTS=1
|
||||
export ROCP_THRS=1
|
||||
export ROCP_INPUT=sqtt_input.xml
|
||||
eval_test "'rocprof' libtool SQTT test" ./test/ctrl
|
||||
|
||||
## SPM test
|
||||
# export ROCP_KITER=3
|
||||
@@ -144,7 +165,7 @@ export ROCP_MCOPY_TRACKING=1
|
||||
|
||||
export ROCP_KITER=1
|
||||
export ROCP_DITER=4
|
||||
export ROCP_INPUT=input2.xml
|
||||
export ROCP_INPUT=set_input.xml
|
||||
eval_test "libtool test, counter sets" ./test/ctrl
|
||||
|
||||
## OpenCL test
|
||||
@@ -159,7 +180,7 @@ export ROCP_HSA_INTERC=1
|
||||
|
||||
export ROCP_KITER=10
|
||||
export ROCP_DITER=10
|
||||
export ROCP_INPUT=input1.xml
|
||||
#export ROCP_INPUT=input1.xml
|
||||
eval_test "libtool test, counter sets" ./test/ctrl
|
||||
|
||||
## OpenCL test
|
||||
|
||||
@@ -1,23 +0,0 @@
|
||||
# Filter by dispatches range, GPU index and kernel names
|
||||
<metric
|
||||
# range format "3:9"
|
||||
range=""
|
||||
# list of gpu indexes "0,1,2,3"
|
||||
gpu_index=""
|
||||
# list of matched sub-strings "Simple1,Conv1,SimpleConvolution"
|
||||
kernel=""
|
||||
></metric>
|
||||
|
||||
# List of metrics
|
||||
<metric
|
||||
name=SQ:4,SQ_WAVES,SQ_INSTS_SMEM,SQ_INSTS_VALU,TA_FLAT_WRITE_WAVEFRONTS[0],TA_FLAT_WRITE_WAVEFRONTS[1],GPUBusy,VALUBusy,SALUBusy,MemUnitBusy,SFetchInsts,FetchSize,VWriteInsts
|
||||
></metric>
|
||||
|
||||
# SQTT trace with parameters
|
||||
<trace name="SQTT">
|
||||
<parameters
|
||||
MASK=0x0f00
|
||||
TOKEN_MASK=0x144b
|
||||
TOKEN_MASK2=0xffff
|
||||
></parameters>
|
||||
</trace>
|
||||
@@ -0,0 +1,4 @@
|
||||
# List of metrics
|
||||
<metric
|
||||
name=SQ:4,SQ_WAVES,SQ_INSTS_SMEM,SQ_INSTS_VALU,TA_FLAT_WRITE_WAVEFRONTS[0],TA_FLAT_WRITE_WAVEFRONTS[1],GPUBusy,VALUBusy,SALUBusy,MemUnitBusy,SFetchInsts,FetchSize,VWriteInsts
|
||||
></metric>
|
||||
+2
-2
@@ -3,9 +3,9 @@
|
||||
# range format "3:9"
|
||||
range=""
|
||||
# list of gpu indexes "0,1,2,3"
|
||||
gpu_index=""
|
||||
gpu_index="0,1,2,3"
|
||||
# list of matched sub-strings "Simple1,Conv1,SimpleConvolution"
|
||||
kernel=""
|
||||
kernel="Simple1,Conv1,SimpleConvolution"
|
||||
></metric>
|
||||
|
||||
# List of metrics
|
||||
@@ -0,0 +1,8 @@
|
||||
# SQTT trace with parameters
|
||||
<trace name="SQTT">
|
||||
<parameters
|
||||
MASK=0x0f00
|
||||
TOKEN_MASK=0x144b
|
||||
TOKEN_MASK2=0xffff
|
||||
></parameters>
|
||||
</trace>
|
||||
@@ -27,6 +27,7 @@ THE SOFTWARE.
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include <assert.h>
|
||||
#include <cxxabi.h>
|
||||
#include <dirent.h>
|
||||
#include <hsa.h>
|
||||
#include <pthread.h>
|
||||
@@ -99,6 +100,7 @@ struct context_entry_t {
|
||||
unsigned feature_count;
|
||||
rocprofiler_callback_data_t data;
|
||||
kernel_properties_t kernel_properties;
|
||||
uint64_t kernel_object;
|
||||
FILE* file_handle;
|
||||
};
|
||||
|
||||
@@ -169,6 +171,21 @@ void check_status(hsa_status_t status) {
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
// Dispatch opt code /////////////////////////////////////////////////////////////////
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
// Context callback arg
// User data handed to rocprofiler_set_queue_callbacks() in the optimized mode and
// received back by dispatch_callback_opt() as 'user_data'.
|
||||
struct callbacks_arg_t {
|
||||
// Array of profiling pools; dispatch_callback_opt() indexes it with the agent's dev_index.
rocprofiler_pool_t** pools;
|
||||
};
|
||||
|
||||
// Handler callback arg
// Passed to context_pool_handler() as 'arg'; its fields are copied into the context entry
// before the entry is dumped.
|
||||
struct handler_arg_t {
|
||||
// Profiling features array assigned to the context entry
rocprofiler_feature_t* features;
|
||||
// Number of elements in 'features'
unsigned feature_count;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Print profiling results output break if terminal output is enabled
|
||||
void results_output_break() {
|
||||
const bool is_terminal_output = (result_file_opened == false);
|
||||
@@ -589,7 +606,6 @@ void dump_context_array(hsa_queue_t* queue) {
|
||||
|
||||
// Profiling completion handler
|
||||
// Dump and delete the context entry
|
||||
// Return true if the context was dumped successfully
|
||||
bool context_handler(rocprofiler_group_t group, void* arg) {
|
||||
context_entry_t* entry = reinterpret_cast<context_entry_t*>(arg);
|
||||
|
||||
@@ -621,6 +637,62 @@ bool context_handler(rocprofiler_group_t group, void* arg) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Returns the kernel code object for 'kernel_object'.
// Asks the loader API for the host address of the kernel object; if the query does not
// succeed, the kernel object value itself is used as the code pointer.
static const amd_kernel_code_t* GetKernelCode(uint64_t kernel_object) {
  const void* host_address = NULL;
  const hsa_status_t query_status =
      HsaRsrcFactory::Instance().LoaderApi()->hsa_ven_amd_loader_query_host_address(
          reinterpret_cast<const void*>(kernel_object), &host_address);

  if (query_status == HSA_STATUS_SUCCESS) {
    return reinterpret_cast<const amd_kernel_code_t*>(host_address);
  }
  return reinterpret_cast<const amd_kernel_code_t*>(kernel_object);
}
|
||||
|
||||
// Demangle C++ symbol name.
//
// Returns a heap-allocated string that the caller must release with free():
//   - the demangled name when 'symname' is a valid mangled C++ symbol,
//   - a copy of 'symname' when it cannot be demangled,
//   - an empty string when 'symname' is NULL (previously this reached strdup(NULL)).
static const char* cpp_demangle(const char* symname) {
  if (symname == NULL) return strdup("");

  int status = 0;
  // With a NULL output buffer the demangler allocates the result with malloc().
  const char* demangled = abi::__cxa_demangle(symname, NULL, NULL, &status);
  return (demangled != NULL) ? demangled : strdup(symname);
}
|
||||
|
||||
static const char* QueryKernelName(uint64_t kernel_object, const amd_kernel_code_t* kernel_code) {
|
||||
const char* kernel_symname = HsaRsrcFactory::GetKernelNameRef(kernel_object);
|
||||
return cpp_demangle(kernel_symname);
|
||||
}
|
||||
|
||||
// Profiling completion handler
|
||||
// Dump context entry
|
||||
bool context_pool_handler(const rocprofiler_pool_entry_t* entry, void* arg) {
|
||||
// Context entry
|
||||
context_entry_t* ctx_entry = reinterpret_cast<context_entry_t*>(entry->payload);
|
||||
handler_arg_t* handler_arg = reinterpret_cast<handler_arg_t*>(arg);
|
||||
ctx_entry->features = handler_arg->features;
|
||||
ctx_entry->feature_count = handler_arg->feature_count;
|
||||
ctx_entry->file_handle = result_file_handle;
|
||||
|
||||
const uint64_t kernel_object = ctx_entry->kernel_object;
|
||||
const amd_kernel_code_t* kernel_code = GetKernelCode(kernel_object);
|
||||
ctx_entry->data.kernel_name = QueryKernelName(kernel_object, kernel_code);
|
||||
|
||||
if (pthread_mutex_lock(&mutex) != 0) {
|
||||
perror("pthread_mutex_lock");
|
||||
abort();
|
||||
}
|
||||
|
||||
dump_context_entry(ctx_entry);
|
||||
|
||||
if (pthread_mutex_unlock(&mutex) != 0) {
|
||||
perror("pthread_mutex_unlock");
|
||||
abort();
|
||||
}
|
||||
|
||||
free((void*)(ctx_entry->data.kernel_name));
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// Profiling completion handler for concurrent implementation
|
||||
// Dump the context entry
|
||||
// Return true if the context was dumped successfully
|
||||
@@ -687,29 +759,13 @@ bool check_filter(const rocprofiler_callback_data_t* callback_data, const callba
|
||||
return found;
|
||||
}
|
||||
|
||||
// Kernel disoatch callback
|
||||
hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* user_data,
|
||||
rocprofiler_group_t* group) {
|
||||
// Passed tool data
|
||||
// Setting kernel properties
|
||||
void set_kernel_properties(const rocprofiler_callback_data_t* callback_data,
|
||||
kernel_properties_t* kernel_properties_ptr)
|
||||
{
|
||||
const hsa_kernel_dispatch_packet_t* packet = callback_data->packet;
|
||||
const amd_kernel_code_t* kernel_code = callback_data->kernel_code;
|
||||
callbacks_data_t* tool_data = reinterpret_cast<callbacks_data_t*>(user_data);
|
||||
// HSA status
|
||||
hsa_status_t status = HSA_STATUS_ERROR;
|
||||
|
||||
// Checking dispatch condition
|
||||
if (tool_data->filter_on == 1) {
|
||||
if (check_filter(callback_data, tool_data) == false) {
|
||||
next_context_count();
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
}
|
||||
// Profiling context
|
||||
rocprofiler_t* context = NULL;
|
||||
// Context entry
|
||||
context_entry_t* entry = alloc_context_entry();
|
||||
// kernel properties
|
||||
kernel_properties_t* kernel_properties_ptr = &(entry->kernel_properties);
|
||||
uint64_t grid_size = packet->grid_size_x * packet->grid_size_y * packet->grid_size_z;
|
||||
if (grid_size > UINT32_MAX) abort();
|
||||
kernel_properties_ptr->grid_size = (uint32_t)grid_size;
|
||||
@@ -722,6 +778,28 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data,
|
||||
kernel_properties_ptr->sgpr_count = AMD_HSA_BITS_GET(kernel_code->compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT);
|
||||
kernel_properties_ptr->fbarrier_count = kernel_code->workgroup_fbarrier_count;
|
||||
kernel_properties_ptr->signal = callback_data->completion_signal;
|
||||
}
|
||||
|
||||
// Kernel dispatch callback
|
||||
hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* user_data,
|
||||
rocprofiler_group_t* group) {
|
||||
// Passed tool data
|
||||
callbacks_data_t* tool_data = reinterpret_cast<callbacks_data_t*>(user_data);
|
||||
// HSA status
|
||||
hsa_status_t status = HSA_STATUS_ERROR;
|
||||
|
||||
// Checking dispatch condition
|
||||
if (tool_data->filter_on == 1) {
|
||||
if (check_filter(callback_data, tool_data) == false) {
|
||||
next_context_count();
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
}
|
||||
// Profiling context
|
||||
// Context entry
|
||||
context_entry_t* entry = alloc_context_entry();
|
||||
// Setting kernel properties
|
||||
set_kernel_properties(callback_data, &(entry->kernel_properties));
|
||||
|
||||
// context properties
|
||||
rocprofiler_properties_t properties{};
|
||||
@@ -747,6 +825,7 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data,
|
||||
}
|
||||
|
||||
// Open profiling context
|
||||
rocprofiler_t* context = NULL;
|
||||
status = rocprofiler_open(callback_data->agent, features, feature_count,
|
||||
&context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties);
|
||||
check_status(status);
|
||||
@@ -780,6 +859,36 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data,
|
||||
return status;
|
||||
}
|
||||
|
||||
// Kernel disoatch callback
|
||||
hsa_status_t dispatch_callback_opt(const rocprofiler_callback_data_t* callback_data, void* user_data,
|
||||
rocprofiler_group_t* group) {
|
||||
hsa_status_t status = HSA_STATUS_ERROR;
|
||||
hsa_agent_t agent = callback_data->agent;
|
||||
const unsigned gpu_id = HsaRsrcFactory::Instance().GetAgentInfo(agent)->dev_index;
|
||||
callbacks_arg_t* callbacks_arg = reinterpret_cast<callbacks_arg_t*>(user_data);
|
||||
rocprofiler_pool_t* pool = callbacks_arg->pools[gpu_id];
|
||||
rocprofiler_pool_entry_t pool_entry{};
|
||||
status = rocprofiler_pool_fetch(pool, &pool_entry);
|
||||
check_status(status);
|
||||
// Profiling context entry
|
||||
rocprofiler_t* context = pool_entry.context;
|
||||
context_entry_t* entry = reinterpret_cast<context_entry_t*>(pool_entry.payload);
|
||||
// Setting kernel properties
|
||||
set_kernel_properties(callback_data, &(entry->kernel_properties));
|
||||
// Get group[0]
|
||||
status = rocprofiler_get_group(context, 0, group);
|
||||
check_status(status);
|
||||
|
||||
// Fill profiling context entry
|
||||
entry->index = UINT32_MAX;
|
||||
entry->agent = agent;
|
||||
entry->group = *group;
|
||||
entry->data = *callback_data;
|
||||
entry->kernel_object = callback_data->packet->kernel_object;
|
||||
reinterpret_cast<std::atomic<bool>*>(&entry->valid)->store(true);
|
||||
return status;
|
||||
}
|
||||
|
||||
hsa_status_t dispatch_callback_con(const rocprofiler_callback_data_t* callback_data, void* user_data,
|
||||
rocprofiler_group_t* group) {
|
||||
// Passed tool data
|
||||
@@ -1096,6 +1205,8 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings)
|
||||
if (settings->hsa_intercepting) rocprofiler_set_hsa_callbacks(hsa_callbacks, (void*)14);
|
||||
// Enable concurrent SQTT
|
||||
check_env_var("ROCP_K_CONCURRENT", settings->k_concurrent);
|
||||
// Enable optimized mode
|
||||
check_env_var("ROCP_OPT_MODE", settings->opt_mode);
|
||||
|
||||
is_trace_local = settings->trace_local;
|
||||
|
||||
@@ -1181,6 +1292,8 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings)
|
||||
range_vec->push_back(*(range_vec->begin()) + 1);
|
||||
}
|
||||
|
||||
const bool filter_disabled = (gpu_index_vec->empty() && kernel_string_vec->empty() && range_vec->empty());
|
||||
|
||||
// Getting traces
|
||||
const auto traces_list = xml->GetNodes("top.trace");
|
||||
if (traces_list.size() > 1) fatal("ROCProfiler: only one trace supported at a time");
|
||||
@@ -1298,30 +1411,78 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings)
|
||||
// Context array allocation
|
||||
context_array = new context_array_t;
|
||||
|
||||
// Adding dispatch observer
|
||||
rocprofiler_queue_callbacks_t callbacks_ptrs{0};
|
||||
if (settings->k_concurrent != 0) {
|
||||
callbacks_ptrs.dispatch = dispatch_callback_con;
|
||||
bool opt_mode_cond = ((features_found != 0) &&
|
||||
(metrics_set->empty()) &&
|
||||
(traces_found == 0) &&
|
||||
(is_spm_trace == false) &&
|
||||
(filter_disabled == true));
|
||||
if (settings->opt_mode == 0) opt_mode_cond = false;
|
||||
if (!opt_mode_cond) settings->opt_mode = 0;
|
||||
if (opt_mode_cond) {
|
||||
// Handler arg
|
||||
handler_arg_t* handler_arg = new handler_arg_t{};
|
||||
handler_arg->features = features;
|
||||
handler_arg->feature_count = feature_count;
|
||||
|
||||
// Context properties
|
||||
rocprofiler_pool_properties_t properties{};
|
||||
properties.num_entries = (CTX_OUTSTANDING_MAX != 0) ? CTX_OUTSTANDING_MAX : 1000;
|
||||
properties.payload_bytes = sizeof(context_entry_t);
|
||||
properties.handler = context_pool_handler;
|
||||
properties.handler_arg = handler_arg;
|
||||
|
||||
// Available GPU agents
|
||||
const unsigned gpu_count = HsaRsrcFactory::Instance().GetCountOfGpuAgents();
|
||||
callbacks_arg_t* callbacks_arg = new callbacks_arg_t{};
|
||||
callbacks_arg->pools = new rocprofiler_pool_t* [gpu_count];
|
||||
for (unsigned gpu_id = 0; gpu_id < gpu_count; gpu_id++) {
|
||||
// Getting GPU device info
|
||||
const AgentInfo* agent_info = NULL;
|
||||
if (HsaRsrcFactory::Instance().GetGpuAgentInfo(gpu_id, &agent_info) == false) {
|
||||
fprintf(stderr, "GetGpuAgentInfo failed\n");
|
||||
abort();
|
||||
}
|
||||
|
||||
// Open profiling pool
|
||||
rocprofiler_pool_t* pool = NULL;
|
||||
hsa_status_t status = rocprofiler_pool_open(agent_info->dev_id, features, features_found,
|
||||
&pool, 0, &properties);
|
||||
check_status(status);
|
||||
callbacks_arg->pools[gpu_id] = pool;
|
||||
}
|
||||
|
||||
// Adding dispatch observer
|
||||
rocprofiler_queue_callbacks_t callbacks_ptrs{0};
|
||||
callbacks_ptrs.dispatch = dispatch_callback_opt;
|
||||
callbacks_ptrs.destroy = destroy_callback;
|
||||
|
||||
rocprofiler_set_queue_callbacks(callbacks_ptrs, callbacks_arg);
|
||||
} else {
|
||||
callbacks_ptrs.dispatch = dispatch_callback;
|
||||
// Adding dispatch observer
|
||||
rocprofiler_queue_callbacks_t callbacks_ptrs{0};
|
||||
if (settings->k_concurrent != 0) {
|
||||
callbacks_ptrs.dispatch = dispatch_callback_con;
|
||||
} else {
|
||||
callbacks_ptrs.dispatch = dispatch_callback;
|
||||
}
|
||||
callbacks_ptrs.destroy = destroy_callback;
|
||||
|
||||
callbacks_data = new callbacks_data_t{};
|
||||
callbacks_data->features = features;
|
||||
callbacks_data->feature_count = features_found;
|
||||
callbacks_data->set = (metrics_set->empty()) ? NULL : metrics_set;
|
||||
callbacks_data->group_index = 0;
|
||||
callbacks_data->file_handle = result_file_handle;
|
||||
callbacks_data->gpu_index = (gpu_index_vec->empty()) ? NULL : gpu_index_vec;
|
||||
callbacks_data->kernel_string = (kernel_string_vec->empty()) ? NULL : kernel_string_vec;
|
||||
callbacks_data->range = (range_vec->empty()) ? NULL : range_vec;;
|
||||
callbacks_data->filter_on = (callbacks_data->gpu_index != NULL) ||
|
||||
(callbacks_data->kernel_string != NULL) ||
|
||||
(callbacks_data->range != NULL)
|
||||
? 1 : 0;
|
||||
|
||||
rocprofiler_set_queue_callbacks(callbacks_ptrs, callbacks_data);
|
||||
}
|
||||
callbacks_ptrs.destroy = destroy_callback;
|
||||
|
||||
callbacks_data = new callbacks_data_t{};
|
||||
callbacks_data->features = features;
|
||||
callbacks_data->feature_count = features_found;
|
||||
callbacks_data->set = (metrics_set->empty()) ? NULL : metrics_set;
|
||||
callbacks_data->group_index = 0;
|
||||
callbacks_data->file_handle = result_file_handle;
|
||||
callbacks_data->gpu_index = (gpu_index_vec->empty()) ? NULL : gpu_index_vec;
|
||||
callbacks_data->kernel_string = (kernel_string_vec->empty()) ? NULL : kernel_string_vec;
|
||||
callbacks_data->range = (range_vec->empty()) ? NULL : range_vec;;
|
||||
callbacks_data->filter_on = (callbacks_data->gpu_index != NULL) ||
|
||||
(callbacks_data->kernel_string != NULL) ||
|
||||
(callbacks_data->range != NULL)
|
||||
? 1 : 0;
|
||||
|
||||
rocprofiler_set_queue_callbacks(callbacks_ptrs, callbacks_data);
|
||||
|
||||
xml::Xml::Destroy(xml);
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user