From 3d44ea1a0577d81d7fde79fd85ba99502fcebd32 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Tue, 7 Jul 2020 16:18:29 -0500 Subject: [PATCH] counters dumping optimization Change-Id: I8c694e5380e15179453148dd9ab3a3e51b6db861 [ROCm/rocprofiler commit: 2a7f77b290b95b67a9c82b5cb2f2cda39908ce29] --- projects/rocprofiler/inc/rocprofiler.h | 1 + projects/rocprofiler/src/core/context.h | 39 ++- .../rocprofiler/src/core/intercept_queue.cpp | 1 + .../rocprofiler/src/core/intercept_queue.h | 82 ++++++ projects/rocprofiler/src/core/rocprofiler.cpp | 6 +- projects/rocprofiler/src/core/tracker.h | 43 +++ projects/rocprofiler/test/run.sh | 47 +++- projects/rocprofiler/test/tool/input.xml | 23 -- projects/rocprofiler/test/tool/pmc_input.xml | 4 + .../test/tool/{input1.xml => pmc_input1.xml} | 4 +- .../test/tool/{input2.xml => set_input.xml} | 0 projects/rocprofiler/test/tool/sqtt_input.xml | 8 + projects/rocprofiler/test/tool/tool.cpp | 247 +++++++++++++++--- 13 files changed, 415 insertions(+), 90 deletions(-) delete mode 100644 projects/rocprofiler/test/tool/input.xml create mode 100644 projects/rocprofiler/test/tool/pmc_input.xml rename projects/rocprofiler/test/tool/{input1.xml => pmc_input1.xml} (86%) rename projects/rocprofiler/test/tool/{input2.xml => set_input.xml} (100%) create mode 100644 projects/rocprofiler/test/tool/sqtt_input.xml diff --git a/projects/rocprofiler/inc/rocprofiler.h b/projects/rocprofiler/inc/rocprofiler.h index 8ed532ff6e..7fc57aca66 100644 --- a/projects/rocprofiler/inc/rocprofiler.h +++ b/projects/rocprofiler/inc/rocprofiler.h @@ -73,6 +73,7 @@ typedef struct { uint32_t timestamp_on; uint32_t hsa_intercepting; uint32_t k_concurrent; + uint32_t opt_mode; } rocprofiler_settings_t; //////////////////////////////////////////////////////////////////////////////// diff --git a/projects/rocprofiler/src/core/context.h b/projects/rocprofiler/src/core/context.h index 7131d3383b..77bf17eafd 100644 --- a/projects/rocprofiler/src/core/context.h +++ b/projects/rocprofiler/src/core/context.h @@ -183,7 +183,7 @@ class Context { uint32_t GetGroupCount() const { return set_.size(); } - inline rocprofiler_group_t GetGroupInfo(Group* g) { + inline rocprofiler_group_t GetGroupDescr(Group* g) { rocprofiler::info_vector_t& info_vector = g->GetInfoVector(); rocprofiler_group_t group = {}; group.index = g->GetIndex(); @@ -192,12 +192,12 @@ class Context { group.feature_count = info_vector.size(); return group; } - inline rocprofiler_group_t GetGroupInfo(const uint32_t& index) { + inline rocprofiler_group_t GetGroupDescr(const uint32_t& index) { rocprofiler_group_t group = {}; if (set_.empty()) { group.context = reinterpret_cast(this); } else { - group = GetGroupInfo(&set_[index]); + group = GetGroupDescr(&set_[index]); } return group; } @@ -288,8 +288,8 @@ class Context { Context* context = group->GetContext(); auto r = group->FetchDecrRefsCount(); if (r == 1) { - const rocprofiler_group_t group_info = context->GetGroupInfo(group); - context->handler_(group_info, context->handler_arg_); + const rocprofiler_group_t group_descr = context->GetGroupDescr(group); + context->handler_(group_descr, context->handler_arg_); } return false; } @@ -298,6 +298,22 @@ class Context { Group* GetGroup(const uint32_t& index) { return &set_[index]; } rocprofiler_handler_t GetHandler(void** arg) const { *arg = handler_arg_; return handler_; } + void SetDispatchSignal(const hsa_signal_t &signal) { + dispatch_signal_ = signal; + } + hsa_signal_t& GetDispatchSignal() { + return dispatch_signal_; + } + void SetOrigSignal(const hsa_signal_t &signal) { + orig_signal_ = signal; + } + const hsa_signal_t& GetOrigSignal() const { + return orig_signal_; + } + rocprofiler_dispatch_record_t* GetRecord() { + return &record_; + } + private: Context(const util::AgentInfo* agent_info, Queue* queue, rocprofiler_feature_t* info, const uint32_t info_count, rocprofiler_handler_t handler, void* handler_arg) @@ -309,7 +325,10 @@ class Context { metrics_(NULL), handler_(handler), handler_arg_(handler_arg), - pcsmp_mode_(false) + pcsmp_mode_(false), + dispatch_signal_{}, + orig_signal_{}, + record_{} {} ~Context() { Destruct(); } @@ -355,6 +374,9 @@ class Context { } } } + + hsa_status_t status = hsa_signal_create(1, 0, NULL, &dispatch_signal_); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "MetricsDict create failed"); } // Initialize rocprofiler context @@ -593,6 +615,11 @@ class Context { // PC sampling mode bool pcsmp_mode_; + + // kernel packet dispatch copmletion signal + hsa_signal_t dispatch_signal_; + hsa_signal_t orig_signal_; + rocprofiler_dispatch_record_t record_; }; } // namespace rocprofiler diff --git a/projects/rocprofiler/src/core/intercept_queue.cpp b/projects/rocprofiler/src/core/intercept_queue.cpp index b6eabc6f5b..809c00c36a 100644 --- a/projects/rocprofiler/src/core/intercept_queue.cpp +++ b/projects/rocprofiler/src/core/intercept_queue.cpp @@ -43,5 +43,6 @@ rocprofiler_hsa_callback_fun_t InterceptQueue::submit_callback_fun_ = NULL; void* InterceptQueue::submit_callback_arg_ = NULL; bool InterceptQueue::k_concurrent_ = false; +bool InterceptQueue::opt_mode_ = false; } // namespace rocprofiler diff --git a/projects/rocprofiler/src/core/intercept_queue.h b/projects/rocprofiler/src/core/intercept_queue.h index 93174017fe..d5a7a8f697 100644 --- a/projects/rocprofiler/src/core/intercept_queue.h +++ b/projects/rocprofiler/src/core/intercept_queue.h @@ -92,6 +92,8 @@ class InterceptQueue { (*obj_map_)[(uint64_t)(*queue)] = obj; if (k_concurrent_) { status = proxy->SetInterceptCB(OnSubmitCB_SQTT, obj); + } else if (opt_mode_) { + status = proxy->SetInterceptCB(OnSubmitCB_opt, obj); } else { status = proxy->SetInterceptCB(OnSubmitCB, obj); } @@ -138,6 +140,85 @@ class InterceptQueue { return status; } + static void OnSubmitCB_opt(const void* in_packets, uint64_t count, uint64_t user_que_idx, void* data, + hsa_amd_queue_intercept_packet_writer writer) { + const packet_t* packets_arr = reinterpret_cast(in_packets); + InterceptQueue* obj = reinterpret_cast(data); + Queue* proxy = obj->proxy_; + + // Travers input packets + for (uint64_t j = 0; j < count; ++j) { + const packet_t* packet = &packets_arr[j]; + bool to_submit = true; + + // Checking for dispatch packet type + if ((GetHeaderType(packet) == HSA_PACKET_TYPE_KERNEL_DISPATCH) && + (dispatch_callback_.load(std::memory_order_acquire) != NULL)) { + const hsa_kernel_dispatch_packet_t* dispatch_packet = + reinterpret_cast(packet); + const hsa_signal_t completion_signal = dispatch_packet->completion_signal; +#if 0 + // Prepareing dispatch callback data + uint64_t kernel_object = dispatch_packet->kernel_object; + const amd_kernel_code_t* kernel_code = GetKernelCode(kernel_object); + const char* kernel_name = QueryKernelName(kernel_object, kernel_code); +#endif + rocprofiler_callback_data_t data = {obj->agent_info_->dev_id, + obj->agent_info_->dev_index, + obj->queue_, + user_que_idx, + obj->queue_id, + completion_signal, + dispatch_packet, + NULL, // kernel_name + 0, // kernel_object + NULL, // kernel_code + 0, // (uint32_t)syscall(__NR_gettid), + NULL}; + + // Calling dispatch callback + rocprofiler_group_t group = {}; + hsa_status_t status = (dispatch_callback_.load())(&data, callback_data_, &group); +#if 0 + free(const_cast(kernel_name)); +#endif + Context* context = reinterpret_cast(group.context); + // Injecting profiling start/stop packets + if ((status == HSA_STATUS_SUCCESS) && (context != NULL)) { + if (group.feature_count != 0) { + if (tracker_ != NULL) { + const_cast(dispatch_packet)->completion_signal = context->GetDispatchSignal(); + Group* context_group = context->GetGroup(group.index); + Tracker::Enable_opt(context_group, completion_signal); + context_group->IncrRefsCount(); + } + + const pkt_vector_t& start_vector = context->StartPackets(group.index); + const pkt_vector_t& stop_vector = context->StopPackets(group.index); + pkt_vector_t packets = start_vector; + packets.insert(packets.end(), *packet); + packets.insert(packets.end(), stop_vector.begin(), stop_vector.end()); + if (writer != NULL) { + writer(&packets[0], packets.size()); + } else { + proxy->Submit(&packets[0], packets.size()); + } + to_submit = false; + } + } + } + + // Submitting the original packets if profiling was not enabled + if (to_submit) { + if (writer != NULL) { + writer(packet, 1); + } else { + proxy->Submit(packet, 1); + } + } + } + } + static void OnSubmitCB(const void* in_packets, uint64_t count, uint64_t user_que_idx, void* data, hsa_amd_queue_intercept_packet_writer writer) { const packet_t* packets_arr = reinterpret_cast(in_packets); @@ -400,6 +481,7 @@ class InterceptQueue { static bool IsTrackerOn() { return tracker_on_; } static bool k_concurrent_; + static bool opt_mode_; private: static void queue_event_callback(hsa_status_t status, hsa_queue_t *queue, void *arg) { diff --git a/projects/rocprofiler/src/core/rocprofiler.cpp b/projects/rocprofiler/src/core/rocprofiler.cpp index 64493739f2..924626fee1 100644 --- a/projects/rocprofiler/src/core/rocprofiler.cpp +++ b/projects/rocprofiler/src/core/rocprofiler.cpp @@ -217,6 +217,7 @@ uint32_t LoadTool() { if (settings.memcopy_tracking) intercept_mode |= MEMCOPY_INTERCEPT_MODE; if (settings.hsa_intercepting) intercept_mode |= HSA_INTERCEPT_MODE; if (settings.k_concurrent) InterceptQueue::k_concurrent_ = true; + if (settings.opt_mode) InterceptQueue::opt_mode_ = true; } ONLOAD_TRACE("end intercept_mode(" << intercept_mode << ")"); @@ -537,8 +538,7 @@ PUBLIC_API hsa_status_t rocprofiler_open(hsa_agent_t agent, rocprofiler_feature_ if (mode != 0) { if (mode & ROCPROFILER_MODE_STANDALONE) { if (mode & ROCPROFILER_MODE_CREATEQUEUE) { - if (hsa_rsrc->CreateQueue(agent_info, properties->queue_depth, &(properties->queue)) == - false) { + if (hsa_rsrc->CreateQueue(agent_info, properties->queue_depth, &(properties->queue)) == false) { EXC_RAISING(HSA_STATUS_ERROR, "CreateQueue() failed"); } } @@ -592,7 +592,7 @@ PUBLIC_API hsa_status_t rocprofiler_get_group(rocprofiler_t* handle, uint32_t gr rocprofiler_group_t* group) { API_METHOD_PREFIX rocprofiler::Context* context = reinterpret_cast(handle); - *group = context->GetGroupInfo(group_index); + *group = context->GetGroupDescr(group_index); API_METHOD_SUFFIX } diff --git a/projects/rocprofiler/src/core/tracker.h b/projects/rocprofiler/src/core/tracker.h index 823dc17d97..2efa296b0a 100644 --- a/projects/rocprofiler/src/core/tracker.h +++ b/projects/rocprofiler/src/core/tracker.h @@ -155,6 +155,49 @@ class Tracker { Enable(entry, reinterpret_cast(handler), arg); } + // Enable tracking + static void Enable_opt(Group* group, const hsa_signal_t& orig_signal) { + Context* context = group->GetContext(); + context->SetOrigSignal(orig_signal); + context->GetRecord()->dispatch = util::HsaRsrcFactory::Instance().TimestampNs(); + + // Creating a proxy signal + const hsa_signal_value_t signal_value = (orig_signal.handle) ? + util::HsaRsrcFactory::Instance().HsaApi()->hsa_signal_load_relaxed(orig_signal) : 1; + hsa_signal_t& dispatch_signal = context->GetDispatchSignal(); + util::HsaRsrcFactory::Instance().HsaApi()->hsa_signal_store_screlease(dispatch_signal, signal_value); + hsa_status_t status = + util::HsaRsrcFactory::Instance().HsaApi()->hsa_amd_signal_async_handler(dispatch_signal, HSA_SIGNAL_CONDITION_LT, signal_value, Handler, group); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler"); + } + + // Tracker handler + static bool Handler_opt(hsa_signal_value_t signal_value, void* arg) { + Group* group = reinterpret_cast(arg); + Context* context = group->GetContext(); + hsa_signal_t dispatch_signal = context->GetDispatchSignal(); + record_t* record = context->GetRecord(); + hsa_amd_profiling_dispatch_time_t dispatch_time{}; + hsa_status_t status = + util::HsaRsrcFactory::Instance().HsaApi()->hsa_amd_profiling_get_dispatch_time(context->GetAgent(), dispatch_signal, &dispatch_time); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_profiling_get_dispatch_time"); + record->begin = util::HsaRsrcFactory::Instance().SysclockToNs(dispatch_time.start); + record->end = util::HsaRsrcFactory::Instance().SysclockToNs(dispatch_time.end); + record->complete = util::HsaRsrcFactory::Instance().TimestampNs(); + + // Original intercepted signal completion + const hsa_signal_t& orig_signal = context->GetOrigSignal(); + if (orig_signal.handle) { + amd_signal_t* orig_signal_ptr = reinterpret_cast(orig_signal.handle); + amd_signal_t* prof_signal_ptr = reinterpret_cast(dispatch_signal.handle); + orig_signal_ptr->start_ts = prof_signal_ptr->start_ts; + orig_signal_ptr->end_ts = prof_signal_ptr->end_ts; + util::HsaRsrcFactory::Instance().HsaApi()->hsa_signal_store_screlease(orig_signal, signal_value); + } + + Context::Handler(signal_value, arg); + } + private: Tracker() : outstanding_(0), diff --git a/projects/rocprofiler/test/run.sh b/projects/rocprofiler/test/run.sh index 4c985d3e5f..4f5baf0e05 100755 --- a/projects/rocprofiler/test/run.sh +++ b/projects/rocprofiler/test/run.sh @@ -86,7 +86,7 @@ export HSA_TOOLS_LIB=librocprofiler64.so.1 # enable intercepting mode in rocprofiler export ROCP_HSA_INTERCEPT=2 # test macro for kernel iterations number -export ROCP_KITER=100 +export ROCP_KITER=20 # test macro for per-kernel dispatching number export ROCP_DITER=10 eval_test "Standalone intercepting test" ./test/stand_intercept_test @@ -95,8 +95,8 @@ unset ROCP_HSA_INTERCEPT ## Intercepting usage model test # tool library loaded by ROC profiler export ROCP_TOOL_LIB=./test/libintercept_test.so -export ROCP_KITER=50 -export ROCP_DITER=50 +export ROCP_KITER=20 +export ROCP_DITER=20 export ROCP_AGENTS=1 export ROCP_THRS=3 eval_test "Intercepting usage model test" ./test/ctrl @@ -114,19 +114,40 @@ if [ ! -e $ROCP_TOOL_LIB ] ; then export ROCP_TOOL_LIB=test/libtool.so fi -export ROCP_KITER=50 -export ROCP_DITER=50 +export ROCP_KITER=20 +export ROCP_DITER=20 export ROCP_AGENTS=1 export ROCP_THRS=1 -export ROCP_INPUT=input.xml -eval_test "'rocprof' libtool test" ./test/ctrl +export ROCP_INPUT=pmc_input.xml +eval_test "'rocprof' libtool PMC test" ./test/ctrl -export ROCP_KITER=10 -export ROCP_DITER=10 +export ROCP_KITER=20 +export ROCP_DITER=20 export ROCP_AGENTS=1 export ROCP_THRS=10 -export ROCP_INPUT=input1.xml -eval_test "'rocprof' libtool test n-threads" ./test/ctrl +export ROCP_INPUT=pmc_input.xml +eval_test "'rocprof' libtool PMC n-thread test" ./test/ctrl + +export ROCP_KITER=20 +export ROCP_DITER=20 +export ROCP_AGENTS=1 +export ROCP_THRS=1 +export ROCP_INPUT=pmc_input1.xml +eval_test "'rocprof' libtool PMC test1" ./test/ctrl + +export ROCP_KITER=20 +export ROCP_DITER=20 +export ROCP_AGENTS=1 +export ROCP_THRS=10 +export ROCP_INPUT=pmc_input1.xml +eval_test "'rocprof' libtool PMC n-thread test1" ./test/ctrl + +export ROCP_KITER=20 +export ROCP_DITER=20 +export ROCP_AGENTS=1 +export ROCP_THRS=1 +export ROCP_INPUT=sqtt_input.xml +eval_test "'rocprof' libtool SQTT test" ./test/ctrl ## SPM test # export ROCP_KITER=3 @@ -144,7 +165,7 @@ export ROCP_MCOPY_TRACKING=1 export ROCP_KITER=1 export ROCP_DITER=4 -export ROCP_INPUT=input2.xml +export ROCP_INPUT=set_input.xml eval_test "libtool test, counter sets" ./test/ctrl ## OpenCL test @@ -159,7 +180,7 @@ export ROCP_HSA_INTERC=1 export ROCP_KITER=10 export ROCP_DITER=10 -export ROCP_INPUT=input1.xml +#export ROCP_INPUT=input1.xml eval_test "libtool test, counter sets" ./test/ctrl ## OpenCL test diff --git a/projects/rocprofiler/test/tool/input.xml b/projects/rocprofiler/test/tool/input.xml deleted file mode 100644 index f8016ebc03..0000000000 --- a/projects/rocprofiler/test/tool/input.xml +++ /dev/null @@ -1,23 +0,0 @@ -# Filter by dispatches range, GPU index and kernel names - - -# List of metrics - - -# SQTT trace with parameters - - - diff --git a/projects/rocprofiler/test/tool/pmc_input.xml b/projects/rocprofiler/test/tool/pmc_input.xml new file mode 100644 index 0000000000..6b9e3d6aa9 --- /dev/null +++ b/projects/rocprofiler/test/tool/pmc_input.xml @@ -0,0 +1,4 @@ +# List of metrics + diff --git a/projects/rocprofiler/test/tool/input1.xml b/projects/rocprofiler/test/tool/pmc_input1.xml similarity index 86% rename from projects/rocprofiler/test/tool/input1.xml rename to projects/rocprofiler/test/tool/pmc_input1.xml index 9fff096c96..6863fa2919 100644 --- a/projects/rocprofiler/test/tool/input1.xml +++ b/projects/rocprofiler/test/tool/pmc_input1.xml @@ -3,9 +3,9 @@ # range format "3:9" range="" # list of gpu indexes "0,1,2,3" - gpu_index="" + gpu_index="0,1,2,3" # list of matched sub-strings "Simple1,Conv1,SimpleConvolution" - kernel="" + kernel="Simple1,Conv1,SimpleConvolution" > # List of metrics diff --git a/projects/rocprofiler/test/tool/input2.xml b/projects/rocprofiler/test/tool/set_input.xml similarity index 100% rename from projects/rocprofiler/test/tool/input2.xml rename to projects/rocprofiler/test/tool/set_input.xml diff --git a/projects/rocprofiler/test/tool/sqtt_input.xml b/projects/rocprofiler/test/tool/sqtt_input.xml new file mode 100644 index 0000000000..5c9ee63dd8 --- /dev/null +++ b/projects/rocprofiler/test/tool/sqtt_input.xml @@ -0,0 +1,8 @@ +# SQTT trace with parameters + + + diff --git a/projects/rocprofiler/test/tool/tool.cpp b/projects/rocprofiler/test/tool/tool.cpp index 072055e5f1..d820c17535 100644 --- a/projects/rocprofiler/test/tool/tool.cpp +++ b/projects/rocprofiler/test/tool/tool.cpp @@ -27,6 +27,7 @@ THE SOFTWARE. /////////////////////////////////////////////////////////////////////////////// #include +#include #include #include #include @@ -99,6 +100,7 @@ struct context_entry_t { unsigned feature_count; rocprofiler_callback_data_t data; kernel_properties_t kernel_properties; + uint64_t kernel_object; FILE* file_handle; }; @@ -169,6 +171,21 @@ void check_status(hsa_status_t status) { } } +////////////////////////////////////////////////////////////////////////////////////// +// Dispatch opt code ///////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////////////// +// Context callback arg +struct callbacks_arg_t { + rocprofiler_pool_t** pools; +}; + +// Handler callback arg +struct handler_arg_t { + rocprofiler_feature_t* features; + unsigned feature_count; +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////////// // Print profiling results output break if terminal output is enabled void results_output_break() { const bool is_terminal_output = (result_file_opened == false); @@ -589,7 +606,6 @@ void dump_context_array(hsa_queue_t* queue) { // Profiling completion handler // Dump and delete the context entry -// Return true if the context was dumped successfully bool context_handler(rocprofiler_group_t group, void* arg) { context_entry_t* entry = reinterpret_cast(arg); @@ -621,6 +637,62 @@ bool context_handler(rocprofiler_group_t group, void* arg) { return false; } +static const amd_kernel_code_t* GetKernelCode(uint64_t kernel_object) { + const amd_kernel_code_t* kernel_code = NULL; + hsa_status_t status = + HsaRsrcFactory::Instance().LoaderApi()->hsa_ven_amd_loader_query_host_address( + reinterpret_cast(kernel_object), + reinterpret_cast(&kernel_code)); + if (HSA_STATUS_SUCCESS != status) { + kernel_code = reinterpret_cast(kernel_object); + } + return kernel_code; +} + +// Demangle C++ symbol name +static const char* cpp_demangle(const char* symname) { + size_t size = 0; + int status; + const char* ret = abi::__cxa_demangle(symname, NULL, &size, &status); + return (ret != 0) ? ret : strdup(symname); +} + +static const char* QueryKernelName(uint64_t kernel_object, const amd_kernel_code_t* kernel_code) { + const char* kernel_symname = HsaRsrcFactory::GetKernelNameRef(kernel_object); + return cpp_demangle(kernel_symname); +} + +// Profiling completion handler +// Dump context entry +bool context_pool_handler(const rocprofiler_pool_entry_t* entry, void* arg) { + // Context entry + context_entry_t* ctx_entry = reinterpret_cast(entry->payload); + handler_arg_t* handler_arg = reinterpret_cast(arg); + ctx_entry->features = handler_arg->features; + ctx_entry->feature_count = handler_arg->feature_count; + ctx_entry->file_handle = result_file_handle; + + const uint64_t kernel_object = ctx_entry->kernel_object; + const amd_kernel_code_t* kernel_code = GetKernelCode(kernel_object); + ctx_entry->data.kernel_name = QueryKernelName(kernel_object, kernel_code); + + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + abort(); + } + + dump_context_entry(ctx_entry); + + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); + } + + free((void*)(ctx_entry->data.kernel_name)); + + return false; +} + // Profiling completion handler for concurrent implementation // Dump the context entry // Return true if the context was dumped successfully @@ -687,29 +759,13 @@ bool check_filter(const rocprofiler_callback_data_t* callback_data, const callba return found; } -// Kernel disoatch callback -hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* user_data, - rocprofiler_group_t* group) { - // Passed tool data +// Setting kernel properties +void set_kernel_properties(const rocprofiler_callback_data_t* callback_data, + kernel_properties_t* kernel_properties_ptr) +{ const hsa_kernel_dispatch_packet_t* packet = callback_data->packet; const amd_kernel_code_t* kernel_code = callback_data->kernel_code; - callbacks_data_t* tool_data = reinterpret_cast(user_data); - // HSA status - hsa_status_t status = HSA_STATUS_ERROR; - // Checking dispatch condition - if (tool_data->filter_on == 1) { - if (check_filter(callback_data, tool_data) == false) { - next_context_count(); - return HSA_STATUS_SUCCESS; - } - } - // Profiling context - rocprofiler_t* context = NULL; - // Context entry - context_entry_t* entry = alloc_context_entry(); - // kernel properties - kernel_properties_t* kernel_properties_ptr = &(entry->kernel_properties); uint64_t grid_size = packet->grid_size_x * packet->grid_size_y * packet->grid_size_z; if (grid_size > UINT32_MAX) abort(); kernel_properties_ptr->grid_size = (uint32_t)grid_size; @@ -722,6 +778,28 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, kernel_properties_ptr->sgpr_count = AMD_HSA_BITS_GET(kernel_code->compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT); kernel_properties_ptr->fbarrier_count = kernel_code->workgroup_fbarrier_count; kernel_properties_ptr->signal = callback_data->completion_signal; +} + +// Kernel disoatch callback +hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* user_data, + rocprofiler_group_t* group) { + // Passed tool data + callbacks_data_t* tool_data = reinterpret_cast(user_data); + // HSA status + hsa_status_t status = HSA_STATUS_ERROR; + + // Checking dispatch condition + if (tool_data->filter_on == 1) { + if (check_filter(callback_data, tool_data) == false) { + next_context_count(); + return HSA_STATUS_SUCCESS; + } + } + // Profiling context + // Context entry + context_entry_t* entry = alloc_context_entry(); + // Setting kernel properties + set_kernel_properties(callback_data, &(entry->kernel_properties)); // context properties rocprofiler_properties_t properties{}; @@ -747,6 +825,7 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, } // Open profiling context + rocprofiler_t* context = NULL; status = rocprofiler_open(callback_data->agent, features, feature_count, &context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties); check_status(status); @@ -780,6 +859,36 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, return status; } +// Kernel disoatch callback +hsa_status_t dispatch_callback_opt(const rocprofiler_callback_data_t* callback_data, void* user_data, + rocprofiler_group_t* group) { + hsa_status_t status = HSA_STATUS_ERROR; + hsa_agent_t agent = callback_data->agent; + const unsigned gpu_id = HsaRsrcFactory::Instance().GetAgentInfo(agent)->dev_index; + callbacks_arg_t* callbacks_arg = reinterpret_cast(user_data); + rocprofiler_pool_t* pool = callbacks_arg->pools[gpu_id]; + rocprofiler_pool_entry_t pool_entry{}; + status = rocprofiler_pool_fetch(pool, &pool_entry); + check_status(status); + // Profiling context entry + rocprofiler_t* context = pool_entry.context; + context_entry_t* entry = reinterpret_cast(pool_entry.payload); + // Setting kernel properties + set_kernel_properties(callback_data, &(entry->kernel_properties)); + // Get group[0] + status = rocprofiler_get_group(context, 0, group); + check_status(status); + + // Fill profiling context entry + entry->index = UINT32_MAX; + entry->agent = agent; + entry->group = *group; + entry->data = *callback_data; + entry->kernel_object = callback_data->packet->kernel_object; + reinterpret_cast*>(&entry->valid)->store(true); + return status; +} + hsa_status_t dispatch_callback_con(const rocprofiler_callback_data_t* callback_data, void* user_data, rocprofiler_group_t* group) { // Passed tool data @@ -1096,6 +1205,8 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) if (settings->hsa_intercepting) rocprofiler_set_hsa_callbacks(hsa_callbacks, (void*)14); // Enable concurrent SQTT check_env_var("ROCP_K_CONCURRENT", settings->k_concurrent); + // Enable optmized mode + check_env_var("ROCP_OPT_MODE", settings->opt_mode); is_trace_local = settings->trace_local; @@ -1181,6 +1292,8 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) range_vec->push_back(*(range_vec->begin()) + 1); } + const bool filter_disabled = (gpu_index_vec->empty() && kernel_string_vec->empty() && range_vec->empty()); + // Getting traces const auto traces_list = xml->GetNodes("top.trace"); if (traces_list.size() > 1) fatal("ROCProfiler: only one trace supported at a time"); @@ -1298,30 +1411,78 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) // Context array aloocation context_array = new context_array_t; - // Adding dispatch observer - rocprofiler_queue_callbacks_t callbacks_ptrs{0}; - if (settings->k_concurrent != 0) { - callbacks_ptrs.dispatch = dispatch_callback_con; + bool opt_mode_cond = ((features_found != 0) && + (metrics_set->empty()) && + (traces_found == 0) && + (is_spm_trace == false) && + (filter_disabled == true)); + if (settings->opt_mode == 0) opt_mode_cond = false; + if (!opt_mode_cond) settings->opt_mode = 0; + if (opt_mode_cond) { + // Handler arg + handler_arg_t* handler_arg = new handler_arg_t{}; + handler_arg->features = features; + handler_arg->feature_count = feature_count; + + // Context properties + rocprofiler_pool_properties_t properties{}; + properties.num_entries = (CTX_OUTSTANDING_MAX != 0) ? CTX_OUTSTANDING_MAX : 1000; + properties.payload_bytes = sizeof(context_entry_t); + properties.handler = context_pool_handler; + properties.handler_arg = handler_arg; + + // Available GPU agents + const unsigned gpu_count = HsaRsrcFactory::Instance().GetCountOfGpuAgents(); + callbacks_arg_t* callbacks_arg = new callbacks_arg_t{}; + callbacks_arg->pools = new rocprofiler_pool_t* [gpu_count]; + for (unsigned gpu_id = 0; gpu_id < gpu_count; gpu_id++) { + // Getting GPU device info + const AgentInfo* agent_info = NULL; + if (HsaRsrcFactory::Instance().GetGpuAgentInfo(gpu_id, &agent_info) == false) { + fprintf(stderr, "GetGpuAgentInfo failed\n"); + abort(); + } + + // Open profiling pool + rocprofiler_pool_t* pool = NULL; + hsa_status_t status = rocprofiler_pool_open(agent_info->dev_id, features, features_found, + &pool, 0, &properties); + check_status(status); + callbacks_arg->pools[gpu_id] = pool; + } + + // Adding dispatch observer + rocprofiler_queue_callbacks_t callbacks_ptrs{0}; + callbacks_ptrs.dispatch = dispatch_callback_opt; + callbacks_ptrs.destroy = destroy_callback; + + rocprofiler_set_queue_callbacks(callbacks_ptrs, callbacks_arg); } else { - callbacks_ptrs.dispatch = dispatch_callback; + // Adding dispatch observer + rocprofiler_queue_callbacks_t callbacks_ptrs{0}; + if (settings->k_concurrent != 0) { + callbacks_ptrs.dispatch = dispatch_callback_con; + } else { + callbacks_ptrs.dispatch = dispatch_callback; + } + callbacks_ptrs.destroy = destroy_callback; + + callbacks_data = new callbacks_data_t{}; + callbacks_data->features = features; + callbacks_data->feature_count = features_found; + callbacks_data->set = (metrics_set->empty()) ? NULL : metrics_set; + callbacks_data->group_index = 0; + callbacks_data->file_handle = result_file_handle; + callbacks_data->gpu_index = (gpu_index_vec->empty()) ? NULL : gpu_index_vec; + callbacks_data->kernel_string = (kernel_string_vec->empty()) ? NULL : kernel_string_vec; + callbacks_data->range = (range_vec->empty()) ? NULL : range_vec;; + callbacks_data->filter_on = (callbacks_data->gpu_index != NULL) || + (callbacks_data->kernel_string != NULL) || + (callbacks_data->range != NULL) + ? 1 : 0; + + rocprofiler_set_queue_callbacks(callbacks_ptrs, callbacks_data); } - callbacks_ptrs.destroy = destroy_callback; - - callbacks_data = new callbacks_data_t{}; - callbacks_data->features = features; - callbacks_data->feature_count = features_found; - callbacks_data->set = (metrics_set->empty()) ? NULL : metrics_set; - callbacks_data->group_index = 0; - callbacks_data->file_handle = result_file_handle; - callbacks_data->gpu_index = (gpu_index_vec->empty()) ? NULL : gpu_index_vec; - callbacks_data->kernel_string = (kernel_string_vec->empty()) ? NULL : kernel_string_vec; - callbacks_data->range = (range_vec->empty()) ? NULL : range_vec;; - callbacks_data->filter_on = (callbacks_data->gpu_index != NULL) || - (callbacks_data->kernel_string != NULL) || - (callbacks_data->range != NULL) - ? 1 : 0; - - rocprofiler_set_queue_callbacks(callbacks_ptrs, callbacks_data); xml::Xml::Destroy(xml);