Change-Id: I8c694e5380e15179453148dd9ab3a3e51b6db861


[ROCm/rocprofiler commit: 2a7f77b290]
Этот коммит содержится в:
Evgeny
2020-07-07 16:18:29 -05:00
родитель a288114390
Коммит 3d44ea1a05
13 изменённых файлов: 415 добавлений и 90 удалений
+1
Просмотреть файл
@@ -73,6 +73,7 @@ typedef struct {
uint32_t timestamp_on;
uint32_t hsa_intercepting;
uint32_t k_concurrent;
uint32_t opt_mode;
} rocprofiler_settings_t;
////////////////////////////////////////////////////////////////////////////////
+33 -6
Просмотреть файл
@@ -183,7 +183,7 @@ class Context {
uint32_t GetGroupCount() const { return set_.size(); }
inline rocprofiler_group_t GetGroupInfo(Group* g) {
inline rocprofiler_group_t GetGroupDescr(Group* g) {
rocprofiler::info_vector_t& info_vector = g->GetInfoVector();
rocprofiler_group_t group = {};
group.index = g->GetIndex();
@@ -192,12 +192,12 @@ class Context {
group.feature_count = info_vector.size();
return group;
}
inline rocprofiler_group_t GetGroupInfo(const uint32_t& index) {
inline rocprofiler_group_t GetGroupDescr(const uint32_t& index) {
rocprofiler_group_t group = {};
if (set_.empty()) {
group.context = reinterpret_cast<rocprofiler_t*>(this);
} else {
group = GetGroupInfo(&set_[index]);
group = GetGroupDescr(&set_[index]);
}
return group;
}
@@ -288,8 +288,8 @@ class Context {
Context* context = group->GetContext();
auto r = group->FetchDecrRefsCount();
if (r == 1) {
const rocprofiler_group_t group_info = context->GetGroupInfo(group);
context->handler_(group_info, context->handler_arg_);
const rocprofiler_group_t group_descr = context->GetGroupDescr(group);
context->handler_(group_descr, context->handler_arg_);
}
return false;
}
@@ -298,6 +298,22 @@ class Context {
Group* GetGroup(const uint32_t& index) { return &set_[index]; }
rocprofiler_handler_t GetHandler(void** arg) const { *arg = handler_arg_; return handler_; }
// Replaces the stored kernel dispatch completion signal with 'signal'.
void SetDispatchSignal(const hsa_signal_t &signal) { dispatch_signal_ = signal; }
// Gives mutable access to the stored kernel dispatch completion signal.
hsa_signal_t& GetDispatchSignal() { return dispatch_signal_; }
// Remembers 'signal' as the original signal associated with this context.
void SetOrigSignal(const hsa_signal_t &signal) { orig_signal_ = signal; }
// Read-only access to the original signal remembered by SetOrigSignal().
const hsa_signal_t& GetOrigSignal() const { return orig_signal_; }
// Returns the address of the dispatch record embedded in this context.
rocprofiler_dispatch_record_t* GetRecord() { return &record_; }
private:
Context(const util::AgentInfo* agent_info, Queue* queue, rocprofiler_feature_t* info,
const uint32_t info_count, rocprofiler_handler_t handler, void* handler_arg)
@@ -309,7 +325,10 @@ class Context {
metrics_(NULL),
handler_(handler),
handler_arg_(handler_arg),
pcsmp_mode_(false)
pcsmp_mode_(false),
dispatch_signal_{},
orig_signal_{},
record_{}
{}
~Context() { Destruct(); }
@@ -355,6 +374,9 @@ class Context {
}
}
}
hsa_status_t status = hsa_signal_create(1, 0, NULL, &dispatch_signal_);
if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "MetricsDict create failed");
}
// Initialize rocprofiler context
@@ -593,6 +615,11 @@ class Context {
// PC sampling mode
bool pcsmp_mode_;
// kernel packet dispatch completion signal
hsa_signal_t dispatch_signal_;
hsa_signal_t orig_signal_;
rocprofiler_dispatch_record_t record_;
};
} // namespace rocprofiler
+1
Просмотреть файл
@@ -43,5 +43,6 @@ rocprofiler_hsa_callback_fun_t InterceptQueue::submit_callback_fun_ = NULL;
void* InterceptQueue::submit_callback_arg_ = NULL;
bool InterceptQueue::k_concurrent_ = false;
bool InterceptQueue::opt_mode_ = false;
} // namespace rocprofiler
+82
Просмотреть файл
@@ -92,6 +92,8 @@ class InterceptQueue {
(*obj_map_)[(uint64_t)(*queue)] = obj;
if (k_concurrent_) {
status = proxy->SetInterceptCB(OnSubmitCB_SQTT, obj);
} else if (opt_mode_) {
status = proxy->SetInterceptCB(OnSubmitCB_opt, obj);
} else {
status = proxy->SetInterceptCB(OnSubmitCB, obj);
}
@@ -138,6 +140,85 @@ class InterceptQueue {
return status;
}
static void OnSubmitCB_opt(const void* in_packets, uint64_t count, uint64_t user_que_idx, void* data,
hsa_amd_queue_intercept_packet_writer writer) {
const packet_t* packets_arr = reinterpret_cast<const packet_t*>(in_packets);
InterceptQueue* obj = reinterpret_cast<InterceptQueue*>(data);
Queue* proxy = obj->proxy_;
// Travers input packets
for (uint64_t j = 0; j < count; ++j) {
const packet_t* packet = &packets_arr[j];
bool to_submit = true;
// Checking for dispatch packet type
if ((GetHeaderType(packet) == HSA_PACKET_TYPE_KERNEL_DISPATCH) &&
(dispatch_callback_.load(std::memory_order_acquire) != NULL)) {
const hsa_kernel_dispatch_packet_t* dispatch_packet =
reinterpret_cast<const hsa_kernel_dispatch_packet_t*>(packet);
const hsa_signal_t completion_signal = dispatch_packet->completion_signal;
#if 0
// Prepareing dispatch callback data
uint64_t kernel_object = dispatch_packet->kernel_object;
const amd_kernel_code_t* kernel_code = GetKernelCode(kernel_object);
const char* kernel_name = QueryKernelName(kernel_object, kernel_code);
#endif
rocprofiler_callback_data_t data = {obj->agent_info_->dev_id,
obj->agent_info_->dev_index,
obj->queue_,
user_que_idx,
obj->queue_id,
completion_signal,
dispatch_packet,
NULL, // kernel_name
0, // kernel_object
NULL, // kernel_code
0, // (uint32_t)syscall(__NR_gettid),
NULL};
// Calling dispatch callback
rocprofiler_group_t group = {};
hsa_status_t status = (dispatch_callback_.load())(&data, callback_data_, &group);
#if 0
free(const_cast<char*>(kernel_name));
#endif
Context* context = reinterpret_cast<Context*>(group.context);
// Injecting profiling start/stop packets
if ((status == HSA_STATUS_SUCCESS) && (context != NULL)) {
if (group.feature_count != 0) {
if (tracker_ != NULL) {
const_cast<hsa_kernel_dispatch_packet_t*>(dispatch_packet)->completion_signal = context->GetDispatchSignal();
Group* context_group = context->GetGroup(group.index);
Tracker::Enable_opt(context_group, completion_signal);
context_group->IncrRefsCount();
}
const pkt_vector_t& start_vector = context->StartPackets(group.index);
const pkt_vector_t& stop_vector = context->StopPackets(group.index);
pkt_vector_t packets = start_vector;
packets.insert(packets.end(), *packet);
packets.insert(packets.end(), stop_vector.begin(), stop_vector.end());
if (writer != NULL) {
writer(&packets[0], packets.size());
} else {
proxy->Submit(&packets[0], packets.size());
}
to_submit = false;
}
}
}
// Submitting the original packets if profiling was not enabled
if (to_submit) {
if (writer != NULL) {
writer(packet, 1);
} else {
proxy->Submit(packet, 1);
}
}
}
}
static void OnSubmitCB(const void* in_packets, uint64_t count, uint64_t user_que_idx, void* data,
hsa_amd_queue_intercept_packet_writer writer) {
const packet_t* packets_arr = reinterpret_cast<const packet_t*>(in_packets);
@@ -400,6 +481,7 @@ class InterceptQueue {
static bool IsTrackerOn() { return tracker_on_; }
static bool k_concurrent_;
static bool opt_mode_;
private:
static void queue_event_callback(hsa_status_t status, hsa_queue_t *queue, void *arg) {
+3 -3
Просмотреть файл
@@ -217,6 +217,7 @@ uint32_t LoadTool() {
if (settings.memcopy_tracking) intercept_mode |= MEMCOPY_INTERCEPT_MODE;
if (settings.hsa_intercepting) intercept_mode |= HSA_INTERCEPT_MODE;
if (settings.k_concurrent) InterceptQueue::k_concurrent_ = true;
if (settings.opt_mode) InterceptQueue::opt_mode_ = true;
}
ONLOAD_TRACE("end intercept_mode(" << intercept_mode << ")");
@@ -537,8 +538,7 @@ PUBLIC_API hsa_status_t rocprofiler_open(hsa_agent_t agent, rocprofiler_feature_
if (mode != 0) {
if (mode & ROCPROFILER_MODE_STANDALONE) {
if (mode & ROCPROFILER_MODE_CREATEQUEUE) {
if (hsa_rsrc->CreateQueue(agent_info, properties->queue_depth, &(properties->queue)) ==
false) {
if (hsa_rsrc->CreateQueue(agent_info, properties->queue_depth, &(properties->queue)) == false) {
EXC_RAISING(HSA_STATUS_ERROR, "CreateQueue() failed");
}
}
@@ -592,7 +592,7 @@ PUBLIC_API hsa_status_t rocprofiler_get_group(rocprofiler_t* handle, uint32_t gr
rocprofiler_group_t* group) {
API_METHOD_PREFIX
rocprofiler::Context* context = reinterpret_cast<rocprofiler::Context*>(handle);
*group = context->GetGroupInfo(group_index);
*group = context->GetGroupDescr(group_index);
API_METHOD_SUFFIX
}
+43
Просмотреть файл
@@ -155,6 +155,49 @@ class Tracker {
Enable(entry, reinterpret_cast<void*>(handler), arg);
}
// Enable tracking for the optimized mode.
// Saves the original completion signal in the group's context, stamps the dispatch time in the
// context record, arms the context dispatch (proxy) signal with the original signal's current
// value (1 when there is no original signal) and registers Handler_opt to run once the proxy
// signal drops below that value.
//   group       - profiling group; passed to Handler_opt as its argument
//   orig_signal - completion signal of the intercepted dispatch packet, may have a zero handle
static void Enable_opt(Group* group, const hsa_signal_t& orig_signal) {
  Context* context = group->GetContext();
  context->SetOrigSignal(orig_signal);
  context->GetRecord()->dispatch = util::HsaRsrcFactory::Instance().TimestampNs();

  // Creating a proxy signal
  const hsa_signal_value_t signal_value = (orig_signal.handle) ?
      util::HsaRsrcFactory::Instance().HsaApi()->hsa_signal_load_relaxed(orig_signal) : 1;
  hsa_signal_t& dispatch_signal = context->GetDispatchSignal();
  util::HsaRsrcFactory::Instance().HsaApi()->hsa_signal_store_screlease(dispatch_signal, signal_value);

  // Handler_opt is the handler that expects a Group* argument and completes the original signal
  hsa_status_t status = util::HsaRsrcFactory::Instance().HsaApi()->hsa_amd_signal_async_handler(
      dispatch_signal, HSA_SIGNAL_CONDITION_LT, signal_value, Handler_opt, group);
  if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler");
}

// Tracker handler for the optimized mode.
// Fills the context record with the dispatch begin/end/complete timestamps, propagates the
// timestamps and the signal value to the original completion signal (if any) and then forwards
// to Context::Handler.
//   signal_value - value of the proxy signal that triggered the handler
//   arg          - the Group* registered by Enable_opt
// Returns the result of Context::Handler.
static bool Handler_opt(hsa_signal_value_t signal_value, void* arg) {
  Group* group = reinterpret_cast<Group*>(arg);
  Context* context = group->GetContext();
  hsa_signal_t dispatch_signal = context->GetDispatchSignal();
  record_t* record = context->GetRecord();

  hsa_amd_profiling_dispatch_time_t dispatch_time{};
  hsa_status_t status = util::HsaRsrcFactory::Instance().HsaApi()->hsa_amd_profiling_get_dispatch_time(
      context->GetAgent(), dispatch_signal, &dispatch_time);
  if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_profiling_get_dispatch_time");

  record->begin = util::HsaRsrcFactory::Instance().SysclockToNs(dispatch_time.start);
  record->end = util::HsaRsrcFactory::Instance().SysclockToNs(dispatch_time.end);
  record->complete = util::HsaRsrcFactory::Instance().TimestampNs();

  // Original intercepted signal completion
  const hsa_signal_t& orig_signal = context->GetOrigSignal();
  if (orig_signal.handle) {
    // Copy the timestamps first, then publish the value with a release store
    amd_signal_t* orig_signal_ptr = reinterpret_cast<amd_signal_t*>(orig_signal.handle);
    amd_signal_t* prof_signal_ptr = reinterpret_cast<amd_signal_t*>(dispatch_signal.handle);
    orig_signal_ptr->start_ts = prof_signal_ptr->start_ts;
    orig_signal_ptr->end_ts = prof_signal_ptr->end_ts;
    util::HsaRsrcFactory::Instance().HsaApi()->hsa_signal_store_screlease(orig_signal, signal_value);
  }

  // The function is declared bool: the result has to be returned explicitly
  return Context::Handler(signal_value, arg);
}
private:
Tracker() :
outstanding_(0),
+34 -13
Просмотреть файл
@@ -86,7 +86,7 @@ export HSA_TOOLS_LIB=librocprofiler64.so.1
# enable intercepting mode in rocprofiler
export ROCP_HSA_INTERCEPT=2
# test macro for kernel iterations number
export ROCP_KITER=100
export ROCP_KITER=20
# test macro for per-kernel dispatching number
export ROCP_DITER=10
eval_test "Standalone intercepting test" ./test/stand_intercept_test
@@ -95,8 +95,8 @@ unset ROCP_HSA_INTERCEPT
## Intercepting usage model test
# tool library loaded by ROC profiler
export ROCP_TOOL_LIB=./test/libintercept_test.so
export ROCP_KITER=50
export ROCP_DITER=50
export ROCP_KITER=20
export ROCP_DITER=20
export ROCP_AGENTS=1
export ROCP_THRS=3
eval_test "Intercepting usage model test" ./test/ctrl
@@ -114,19 +114,40 @@ if [ ! -e $ROCP_TOOL_LIB ] ; then
export ROCP_TOOL_LIB=test/libtool.so
fi
export ROCP_KITER=50
export ROCP_DITER=50
export ROCP_KITER=20
export ROCP_DITER=20
export ROCP_AGENTS=1
export ROCP_THRS=1
export ROCP_INPUT=input.xml
eval_test "'rocprof' libtool test" ./test/ctrl
export ROCP_INPUT=pmc_input.xml
eval_test "'rocprof' libtool PMC test" ./test/ctrl
export ROCP_KITER=10
export ROCP_DITER=10
export ROCP_KITER=20
export ROCP_DITER=20
export ROCP_AGENTS=1
export ROCP_THRS=10
export ROCP_INPUT=input1.xml
eval_test "'rocprof' libtool test n-threads" ./test/ctrl
export ROCP_INPUT=pmc_input.xml
eval_test "'rocprof' libtool PMC n-thread test" ./test/ctrl
export ROCP_KITER=20
export ROCP_DITER=20
export ROCP_AGENTS=1
export ROCP_THRS=1
export ROCP_INPUT=pmc_input1.xml
eval_test "'rocprof' libtool PMC test1" ./test/ctrl
export ROCP_KITER=20
export ROCP_DITER=20
export ROCP_AGENTS=1
export ROCP_THRS=10
export ROCP_INPUT=pmc_input1.xml
eval_test "'rocprof' libtool PMC n-thread test1" ./test/ctrl
export ROCP_KITER=20
export ROCP_DITER=20
export ROCP_AGENTS=1
export ROCP_THRS=1
export ROCP_INPUT=sqtt_input.xml
eval_test "'rocprof' libtool SQTT test" ./test/ctrl
## SPM test
# export ROCP_KITER=3
@@ -144,7 +165,7 @@ export ROCP_MCOPY_TRACKING=1
export ROCP_KITER=1
export ROCP_DITER=4
export ROCP_INPUT=input2.xml
export ROCP_INPUT=set_input.xml
eval_test "libtool test, counter sets" ./test/ctrl
## OpenCL test
@@ -159,7 +180,7 @@ export ROCP_HSA_INTERC=1
export ROCP_KITER=10
export ROCP_DITER=10
export ROCP_INPUT=input1.xml
#export ROCP_INPUT=input1.xml
eval_test "libtool test, counter sets" ./test/ctrl
## OpenCL test
-23
Просмотреть файл
@@ -1,23 +0,0 @@
# Filter by dispatches range, GPU index and kernel names
<metric
# range format "3:9"
range=""
# list of gpu indexes "0,1,2,3"
gpu_index=""
# list of matched sub-strings "Simple1,Conv1,SimpleConvolution"
kernel=""
></metric>
# List of metrics
<metric
name=SQ:4,SQ_WAVES,SQ_INSTS_SMEM,SQ_INSTS_VALU,TA_FLAT_WRITE_WAVEFRONTS[0],TA_FLAT_WRITE_WAVEFRONTS[1],GPUBusy,VALUBusy,SALUBusy,MemUnitBusy,SFetchInsts,FetchSize,VWriteInsts
></metric>
# SQTT trace with parameters
<trace name="SQTT">
<parameters
MASK=0x0f00
TOKEN_MASK=0x144b
TOKEN_MASK2=0xffff
></parameters>
</trace>
+4
Просмотреть файл
@@ -0,0 +1,4 @@
# List of metrics
<metric
name=SQ:4,SQ_WAVES,SQ_INSTS_SMEM,SQ_INSTS_VALU,TA_FLAT_WRITE_WAVEFRONTS[0],TA_FLAT_WRITE_WAVEFRONTS[1],GPUBusy,VALUBusy,SALUBusy,MemUnitBusy,SFetchInsts,FetchSize,VWriteInsts
></metric>
+2 -2
Просмотреть файл
@@ -3,9 +3,9 @@
# range format "3:9"
range=""
# list of gpu indexes "0,1,2,3"
gpu_index=""
gpu_index="0,1,2,3"
# list of matched sub-strings "Simple1,Conv1,SimpleConvolution"
kernel=""
kernel="Simple1,Conv1,SimpleConvolution"
></metric>
# List of metrics
+8
Просмотреть файл
@@ -0,0 +1,8 @@
# SQTT trace with parameters
<trace name="SQTT">
<parameters
MASK=0x0f00
TOKEN_MASK=0x144b
TOKEN_MASK2=0xffff
></parameters>
</trace>
+204 -43
Просмотреть файл
@@ -27,6 +27,7 @@ THE SOFTWARE.
///////////////////////////////////////////////////////////////////////////////
#include <assert.h>
#include <cxxabi.h>
#include <dirent.h>
#include <hsa.h>
#include <pthread.h>
@@ -99,6 +100,7 @@ struct context_entry_t {
unsigned feature_count;
rocprofiler_callback_data_t data;
kernel_properties_t kernel_properties;
uint64_t kernel_object;
FILE* file_handle;
};
@@ -169,6 +171,21 @@ void check_status(hsa_status_t status) {
}
}
//////////////////////////////////////////////////////////////////////////////////////
// Dispatch opt code /////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
// Context callback arg
// User data passed to the optimized dispatch callback.
struct callbacks_arg_t {
// Array of profiling pools; dispatch_callback_opt indexes it with the agent's dev_index
rocprofiler_pool_t** pools;
};
// Handler callback arg
// Argument of context_pool_handler, which copies both fields into the context entry
// before dumping it.
struct handler_arg_t {
// Features array assigned to the context entry
rocprofiler_feature_t* features;
// Value assigned to the context entry's feature_count
unsigned feature_count;
};
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Print profiling results output break if terminal output is enabled
void results_output_break() {
const bool is_terminal_output = (result_file_opened == false);
@@ -589,7 +606,6 @@ void dump_context_array(hsa_queue_t* queue) {
// Profiling completion handler
// Dump and delete the context entry
// Return true if the context was dumped successfully
bool context_handler(rocprofiler_group_t group, void* arg) {
context_entry_t* entry = reinterpret_cast<context_entry_t*>(arg);
@@ -621,6 +637,62 @@ bool context_handler(rocprofiler_group_t group, void* arg) {
return false;
}
// Query the loader for the host address corresponding to 'kernel_object'.
// When the query does not succeed, 'kernel_object' itself is reinterpreted as the
// kernel code pointer and returned.
static const amd_kernel_code_t* GetKernelCode(uint64_t kernel_object) {
  const amd_kernel_code_t* host_code = NULL;
  const hsa_status_t query_status =
      HsaRsrcFactory::Instance().LoaderApi()->hsa_ven_amd_loader_query_host_address(
          reinterpret_cast<const void*>(kernel_object),
          reinterpret_cast<const void**>(&host_code));
  if (query_status == HSA_STATUS_SUCCESS) return host_code;
  return reinterpret_cast<amd_kernel_code_t*>(kernel_object);
}
// Demangle C++ symbol name
// Returns a heap-allocated, NUL-terminated string which the caller releases with free():
// the demangled form when 'symname' can be demangled, otherwise a copy of 'symname'.
// A NULL 'symname' yields an empty string instead of being handed to strdup().
static const char* cpp_demangle(const char* symname) {
  if (symname == NULL) return strdup("");
  int status = 0;
  // With a NULL output buffer __cxa_demangle() allocates the result with malloc()
  const char* demangled = abi::__cxa_demangle(symname, NULL, NULL, &status);
  return (demangled != NULL) ? demangled : strdup(symname);
}
// Look up the symbol name recorded for 'kernel_object' and return its demangled form
// as produced by cpp_demangle(). 'kernel_code' is not used by this implementation.
static const char* QueryKernelName(uint64_t kernel_object, const amd_kernel_code_t* kernel_code) {
  return cpp_demangle(HsaRsrcFactory::GetKernelNameRef(kernel_object));
}
// Profiling completion handler
// Dump context entry
//   entry - pool entry whose payload holds the context_entry_t to dump
//   arg   - handler_arg_t with the features array and count to attach to the entry
// The kernel name is resolved here from the saved kernel object, used for the dump performed
// under the global mutex, and released afterwards. Always returns false.
bool context_pool_handler(const rocprofiler_pool_entry_t* entry, void* arg) {
  // Context entry
  context_entry_t* ctx_entry = reinterpret_cast<context_entry_t*>(entry->payload);
  handler_arg_t* handler_arg = reinterpret_cast<handler_arg_t*>(arg);
  ctx_entry->features = handler_arg->features;
  ctx_entry->feature_count = handler_arg->feature_count;
  ctx_entry->file_handle = result_file_handle;

  const uint64_t kernel_object = ctx_entry->kernel_object;
  const amd_kernel_code_t* kernel_code = GetKernelCode(kernel_object);
  ctx_entry->data.kernel_name = QueryKernelName(kernel_object, kernel_code);

  if (pthread_mutex_lock(&mutex) != 0) {
    perror("pthread_mutex_lock");
    abort();
  }

  dump_context_entry(ctx_entry);

  if (pthread_mutex_unlock(&mutex) != 0) {
    perror("pthread_mutex_unlock");
    abort();
  }

  // Release the name and clear the field so the payload does not keep a dangling pointer
  free(const_cast<char*>(ctx_entry->data.kernel_name));
  ctx_entry->data.kernel_name = NULL;

  return false;
}
// Profiling completion handler for concurrent implementation
// Dump the context entry
// Return true if the context was dumped successfully
@@ -687,29 +759,13 @@ bool check_filter(const rocprofiler_callback_data_t* callback_data, const callba
return found;
}
// Kernel disoatch callback
hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* user_data,
rocprofiler_group_t* group) {
// Passed tool data
// Setting kernel properties
void set_kernel_properties(const rocprofiler_callback_data_t* callback_data,
kernel_properties_t* kernel_properties_ptr)
{
const hsa_kernel_dispatch_packet_t* packet = callback_data->packet;
const amd_kernel_code_t* kernel_code = callback_data->kernel_code;
callbacks_data_t* tool_data = reinterpret_cast<callbacks_data_t*>(user_data);
// HSA status
hsa_status_t status = HSA_STATUS_ERROR;
// Checking dispatch condition
if (tool_data->filter_on == 1) {
if (check_filter(callback_data, tool_data) == false) {
next_context_count();
return HSA_STATUS_SUCCESS;
}
}
// Profiling context
rocprofiler_t* context = NULL;
// Context entry
context_entry_t* entry = alloc_context_entry();
// kernel properties
kernel_properties_t* kernel_properties_ptr = &(entry->kernel_properties);
uint64_t grid_size = packet->grid_size_x * packet->grid_size_y * packet->grid_size_z;
if (grid_size > UINT32_MAX) abort();
kernel_properties_ptr->grid_size = (uint32_t)grid_size;
@@ -722,6 +778,28 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data,
kernel_properties_ptr->sgpr_count = AMD_HSA_BITS_GET(kernel_code->compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT);
kernel_properties_ptr->fbarrier_count = kernel_code->workgroup_fbarrier_count;
kernel_properties_ptr->signal = callback_data->completion_signal;
}
// Kernel dispatch callback
hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* user_data,
rocprofiler_group_t* group) {
// Passed tool data
callbacks_data_t* tool_data = reinterpret_cast<callbacks_data_t*>(user_data);
// HSA status
hsa_status_t status = HSA_STATUS_ERROR;
// Checking dispatch condition
if (tool_data->filter_on == 1) {
if (check_filter(callback_data, tool_data) == false) {
next_context_count();
return HSA_STATUS_SUCCESS;
}
}
// Profiling context
// Context entry
context_entry_t* entry = alloc_context_entry();
// Setting kernel properties
set_kernel_properties(callback_data, &(entry->kernel_properties));
// context properties
rocprofiler_properties_t properties{};
@@ -747,6 +825,7 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data,
}
// Open profiling context
rocprofiler_t* context = NULL;
status = rocprofiler_open(callback_data->agent, features, feature_count,
&context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties);
check_status(status);
@@ -780,6 +859,36 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data,
return status;
}
// Kernel dispatch callback for the optimized mode
// Fetches a pre-opened profiling context from the pool selected by the agent's dev_index,
// fills the entry stored in the pool payload and returns group 0 of that context.
//   callback_data - dispatch data supplied by the profiler
//   user_data     - callbacks_arg_t with the array of pools
//   group         - [out] group 0 of the fetched context
// Returns the status of rocprofiler_get_group().
hsa_status_t dispatch_callback_opt(const rocprofiler_callback_data_t* callback_data, void* user_data,
                                   rocprofiler_group_t* group) {
  hsa_status_t status = HSA_STATUS_ERROR;
  hsa_agent_t agent = callback_data->agent;
  const unsigned gpu_id = HsaRsrcFactory::Instance().GetAgentInfo(agent)->dev_index;
  callbacks_arg_t* callbacks_arg = reinterpret_cast<callbacks_arg_t*>(user_data);
  rocprofiler_pool_t* pool = callbacks_arg->pools[gpu_id];
  rocprofiler_pool_entry_t pool_entry{};
  status = rocprofiler_pool_fetch(pool, &pool_entry);
  check_status(status);

  // Profiling context entry
  rocprofiler_t* context = pool_entry.context;
  context_entry_t* entry = reinterpret_cast<context_entry_t*>(pool_entry.payload);

  // Setting kernel properties
  // The callback data may arrive without a resolved kernel code pointer, while
  // set_kernel_properties() dereferences it: resolve it from the packet's kernel object first.
  rocprofiler_callback_data_t properties_data = *callback_data;
  if (properties_data.kernel_code == NULL) {
    properties_data.kernel_code = GetKernelCode(properties_data.packet->kernel_object);
  }
  set_kernel_properties(&properties_data, &(entry->kernel_properties));

  // Get group[0]
  status = rocprofiler_get_group(context, 0, group);
  check_status(status);

  // Fill profiling context entry
  entry->index = UINT32_MAX;
  entry->agent = agent;
  entry->group = *group;
  entry->data = *callback_data;
  entry->kernel_object = callback_data->packet->kernel_object;
  // 'valid' is written last, through an atomic store
  reinterpret_cast<std::atomic<bool>*>(&entry->valid)->store(true);

  return status;
}
hsa_status_t dispatch_callback_con(const rocprofiler_callback_data_t* callback_data, void* user_data,
rocprofiler_group_t* group) {
// Passed tool data
@@ -1096,6 +1205,8 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings)
if (settings->hsa_intercepting) rocprofiler_set_hsa_callbacks(hsa_callbacks, (void*)14);
// Enable concurrent SQTT
check_env_var("ROCP_K_CONCURRENT", settings->k_concurrent);
// Enable optimized mode
check_env_var("ROCP_OPT_MODE", settings->opt_mode);
is_trace_local = settings->trace_local;
@@ -1181,6 +1292,8 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings)
range_vec->push_back(*(range_vec->begin()) + 1);
}
const bool filter_disabled = (gpu_index_vec->empty() && kernel_string_vec->empty() && range_vec->empty());
// Getting traces
const auto traces_list = xml->GetNodes("top.trace");
if (traces_list.size() > 1) fatal("ROCProfiler: only one trace supported at a time");
@@ -1298,30 +1411,78 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings)
// Context array allocation
context_array = new context_array_t;
// Adding dispatch observer
rocprofiler_queue_callbacks_t callbacks_ptrs{0};
if (settings->k_concurrent != 0) {
callbacks_ptrs.dispatch = dispatch_callback_con;
bool opt_mode_cond = ((features_found != 0) &&
(metrics_set->empty()) &&
(traces_found == 0) &&
(is_spm_trace == false) &&
(filter_disabled == true));
if (settings->opt_mode == 0) opt_mode_cond = false;
if (!opt_mode_cond) settings->opt_mode = 0;
if (opt_mode_cond) {
// Handler arg
handler_arg_t* handler_arg = new handler_arg_t{};
handler_arg->features = features;
handler_arg->feature_count = feature_count;
// Context properties
rocprofiler_pool_properties_t properties{};
properties.num_entries = (CTX_OUTSTANDING_MAX != 0) ? CTX_OUTSTANDING_MAX : 1000;
properties.payload_bytes = sizeof(context_entry_t);
properties.handler = context_pool_handler;
properties.handler_arg = handler_arg;
// Available GPU agents
const unsigned gpu_count = HsaRsrcFactory::Instance().GetCountOfGpuAgents();
callbacks_arg_t* callbacks_arg = new callbacks_arg_t{};
callbacks_arg->pools = new rocprofiler_pool_t* [gpu_count];
for (unsigned gpu_id = 0; gpu_id < gpu_count; gpu_id++) {
// Getting GPU device info
const AgentInfo* agent_info = NULL;
if (HsaRsrcFactory::Instance().GetGpuAgentInfo(gpu_id, &agent_info) == false) {
fprintf(stderr, "GetGpuAgentInfo failed\n");
abort();
}
// Open profiling pool
rocprofiler_pool_t* pool = NULL;
hsa_status_t status = rocprofiler_pool_open(agent_info->dev_id, features, features_found,
&pool, 0, &properties);
check_status(status);
callbacks_arg->pools[gpu_id] = pool;
}
// Adding dispatch observer
rocprofiler_queue_callbacks_t callbacks_ptrs{0};
callbacks_ptrs.dispatch = dispatch_callback_opt;
callbacks_ptrs.destroy = destroy_callback;
rocprofiler_set_queue_callbacks(callbacks_ptrs, callbacks_arg);
} else {
callbacks_ptrs.dispatch = dispatch_callback;
// Adding dispatch observer
rocprofiler_queue_callbacks_t callbacks_ptrs{0};
if (settings->k_concurrent != 0) {
callbacks_ptrs.dispatch = dispatch_callback_con;
} else {
callbacks_ptrs.dispatch = dispatch_callback;
}
callbacks_ptrs.destroy = destroy_callback;
callbacks_data = new callbacks_data_t{};
callbacks_data->features = features;
callbacks_data->feature_count = features_found;
callbacks_data->set = (metrics_set->empty()) ? NULL : metrics_set;
callbacks_data->group_index = 0;
callbacks_data->file_handle = result_file_handle;
callbacks_data->gpu_index = (gpu_index_vec->empty()) ? NULL : gpu_index_vec;
callbacks_data->kernel_string = (kernel_string_vec->empty()) ? NULL : kernel_string_vec;
callbacks_data->range = (range_vec->empty()) ? NULL : range_vec;;
callbacks_data->filter_on = (callbacks_data->gpu_index != NULL) ||
(callbacks_data->kernel_string != NULL) ||
(callbacks_data->range != NULL)
? 1 : 0;
rocprofiler_set_queue_callbacks(callbacks_ptrs, callbacks_data);
}
callbacks_ptrs.destroy = destroy_callback;
callbacks_data = new callbacks_data_t{};
callbacks_data->features = features;
callbacks_data->feature_count = features_found;
callbacks_data->set = (metrics_set->empty()) ? NULL : metrics_set;
callbacks_data->group_index = 0;
callbacks_data->file_handle = result_file_handle;
callbacks_data->gpu_index = (gpu_index_vec->empty()) ? NULL : gpu_index_vec;
callbacks_data->kernel_string = (kernel_string_vec->empty()) ? NULL : kernel_string_vec;
callbacks_data->range = (range_vec->empty()) ? NULL : range_vec;;
callbacks_data->filter_on = (callbacks_data->gpu_index != NULL) ||
(callbacks_data->kernel_string != NULL) ||
(callbacks_data->range != NULL)
? 1 : 0;
rocprofiler_set_queue_callbacks(callbacks_ptrs, callbacks_data);
xml::Xml::Destroy(xml);