From 472624e3bdc8fef13961c30fec8564dab805d309 Mon Sep 17 00:00:00 2001 From: Ammar ELWazir Date: Tue, 20 Jun 2023 20:27:56 +0000 Subject: [PATCH] SWDEV-374256: GPU Kernel Dispatch Trace Period Support Change-Id: Idaabe82a30013e3aba4bcb65bd0a89ce2d14ad97 --- bin/tblextr.py | 4 +- test/CMakeLists.txt | 2 +- test/tool/tool.cpp | 358 ++++++++++++++++++++++++++++---------------- 3 files changed, 234 insertions(+), 130 deletions(-) diff --git a/bin/tblextr.py b/bin/tblextr.py index 1a8f8427f4..73e3847501 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -737,7 +737,9 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): if roctx_range == '': roctx_range = name else: if is_barrier: continue - else: fatal("hcc ops data not found: '" + record + "', " + str(corr_id) + ", " + str(proc_id)) + else: + if "ROCP_CTRL_RATE" in os.environ: continue + else: fatal("hcc ops data not found: '" + record + "', " + str(corr_id) + ", " + str(proc_id)) # activity record rec_vals[4] = name # Name diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 7aea337b21..d63a7f152a 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -160,7 +160,7 @@ set ( TEST_LIB "rocprof-tool" ) set ( TEST_LIB_SRC ${TEST_DIR}/tool/tool.cpp ${UTIL_SRC} ) add_library ( ${TEST_LIB} SHARED ${TEST_LIB_SRC} ) target_include_directories ( ${TEST_LIB} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${PROJECT_SOURCE_DIR}/include ) -target_link_libraries ( ${TEST_LIB} ${ROCPROFILER_TARGET} hsa-runtime64::hsa-runtime64 Threads::Threads dl ) +target_link_libraries ( ${TEST_LIB} ${ROCPROFILER_TARGET} hsa-runtime64::hsa-runtime64 Threads::Threads atomic dl ) ## TODO(aelwazir): Should be replaced by the current location in the main CMakeLists.txt install(TARGETS ${TEST_LIB} DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/tests-v1/test COMPONENT tests) diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp index e56efb1f3b..eac19ac692 100644 --- a/test/tool/tool.cpp +++ b/test/tool/tool.cpp @@ -36,7 +36,7 @@ THE SOFTWARE. #include #include #include -#include /* For SYS_xxx definitions */ +#include /* For SYS_xxx definitions */ #include #include @@ -60,10 +60,12 @@ THE SOFTWARE. #define DESTRUCTOR_API __attribute__((destructor)) #define KERNEL_NAME_LEN_MAX 128 -#define ONLOAD_TRACE(str) \ - if (getenv("ROCP_ONLOAD_TRACE")) do { \ - std::cout << "PID(" << GetPid() << "): PROF_TOOL_LIB::" << __FUNCTION__ << " " << str << std::endl << std::flush; \ - } while(0); +#define ONLOAD_TRACE(str) \ + if (getenv("ROCP_ONLOAD_TRACE")) do { \ + std::cout << "PID(" << GetPid() << "): PROF_TOOL_LIB::" << __FUNCTION__ << " " << str \ + << std::endl \ + << std::flush; \ + } while (0); #define ONLOAD_TRACE_BEG() ONLOAD_TRACE("begin") #define ONLOAD_TRACE_END() ONLOAD_TRACE("end") @@ -174,6 +176,30 @@ void check_status(hsa_status_t status) { } } +////////////////////////////////////////////////////////////////////////////////////// +// Profiling control thread ///////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////////////// +uint32_t control_delay_us = 0; +uint32_t control_len_us = 0; +uint32_t control_dist_us = 0; +std::thread* trace_period_thread = nullptr; +std::atomic trace_period_stop{false}; +std::atomic allow_profiling{false}; +void trace_period_fun() { + std::this_thread::sleep_for(std::chrono::milliseconds(control_delay_us)); + do { + allow_profiling = true; + if (trace_period_stop) { + allow_profiling = false; + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(control_len_us)); + allow_profiling = false; + if (trace_period_stop) break; + std::this_thread::sleep_for(std::chrono::milliseconds(control_dist_us)); + } while (!trace_period_stop); +} + ////////////////////////////////////////////////////////////////////////////////////// // Dispatch opt code ///////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////// @@ -249,7 +275,9 @@ void* monitor_thr_fun(void*) { const uint32_t inflight = context_count - context_collected; std::cerr << std::flush; std::clog << std::flush; - std::cout << "ROCProfiler: count(" << context_count << "), outstanding(" << inflight << "/" << CTX_OUTSTANDING_MAX << ")" << std::endl << std::flush; + std::cout << "ROCProfiler: count(" << context_count << "), outstanding(" << inflight << "/" + << CTX_OUTSTANDING_MAX << ")" << std::endl + << std::flush; if (pthread_mutex_unlock(&mutex) != 0) { perror("pthread_mutex_unlock"); abort(); @@ -275,7 +303,7 @@ uint32_t next_context_count() { // Allocate entry to store profiling context context_entry_t* alloc_context_entry() { if (CTX_OUTSTANDING_MAX != 0) { - while((context_count - context_collected) > CTX_OUTSTANDING_MAX) usleep(1000); + while ((context_count - context_collected) > CTX_OUTSTANDING_MAX) usleep(1000); } if (pthread_mutex_lock(&mutex) != 0) { @@ -325,8 +353,10 @@ context_entry_t* ck_ctx_entry(hsa_agent_t agent, bool& found) { if (ctx_a_map == NULL) ctx_a_map = new ctx_a_map_t; auto ret = ctx_a_map->insert({agent.handle, NULL}); found = !ret.second; - if (found) ctx_a_map->erase(agent.handle); - else ret.first->second = new context_entry_t{}; + if (found) + ctx_a_map->erase(agent.handle); + else + ret.first->second = new context_entry_t{}; return ret.first->second; } @@ -398,32 +428,27 @@ bool dump_context_entry(context_entry_t* entry, bool to_clean = true) { const uint32_t index = entry->index; if (index != UINT32_MAX) { FILE* file_handle = entry->file_handle; - const std::string nik_name = (to_truncate_names == 0) ? entry->data.kernel_name : filtr_kernel_name(entry->data.kernel_name); + const std::string nik_name = (to_truncate_names == 0) + ? entry->data.kernel_name + : filtr_kernel_name(entry->data.kernel_name); const AgentInfo* agent_info = HsaRsrcFactory::Instance().GetAgentInfo(entry->agent); - fprintf(file_handle, "dispatch[%u], gpu-id(%u), queue-id(%u), queue-index(%lu), pid(%u), tid(%u), grd(%u), wgr(%u), lds(%u), scr(%u), arch_vgpr(%u), accum_vgpr(%u), sgpr(%u), wave_size(%u), sig(0x%lx), obj(0x%lx), kernel-name(\"%s\")", - index, - agent_info->dev_index, - entry->data.queue_id, - entry->data.queue_index, - my_pid, - entry->data.thread_id, - entry->kernel_properties.grid_size, - entry->kernel_properties.workgroup_size, - (entry->kernel_properties.lds_size + (AgentInfo::lds_block_size - 1)) & ~(AgentInfo::lds_block_size - 1), - entry->kernel_properties.scratch_size, - entry->kernel_properties.arch_vgpr_count, - entry->kernel_properties.accum_vgpr_count, - entry->kernel_properties.sgpr_count, - entry->kernel_properties.wave_size, - entry->kernel_properties.signal.handle, - entry->kernel_properties.object, - nik_name.c_str()); - if (record) fprintf(file_handle, ", time(%lu,%lu,%lu,%lu)", - record->dispatch, - record->begin, - record->end, - record->complete); + fprintf(file_handle, + "dispatch[%u], gpu-id(%u), queue-id(%u), queue-index(%lu), pid(%u), tid(%u), grd(%u), " + "wgr(%u), lds(%u), scr(%u), arch_vgpr(%u), accum_vgpr(%u), sgpr(%u), wave_size(%u), " + "sig(0x%lx), obj(0x%lx), kernel-name(\"%s\")", + index, agent_info->dev_index, entry->data.queue_id, entry->data.queue_index, my_pid, + entry->data.thread_id, entry->kernel_properties.grid_size, + entry->kernel_properties.workgroup_size, + (entry->kernel_properties.lds_size + (AgentInfo::lds_block_size - 1)) & + ~(AgentInfo::lds_block_size - 1), + entry->kernel_properties.scratch_size, entry->kernel_properties.arch_vgpr_count, + entry->kernel_properties.accum_vgpr_count, entry->kernel_properties.sgpr_count, + entry->kernel_properties.wave_size, entry->kernel_properties.signal.handle, + entry->kernel_properties.object, nik_name.c_str()); + if (record) + fprintf(file_handle, ", time(%lu,%lu,%lu,%lu)", record->dispatch, record->begin, record->end, + record->complete); fprintf(file_handle, "\n"); fflush(file_handle); } @@ -475,8 +500,10 @@ void dump_context_array(hsa_queue_t* queue) { while (valid->load() == false) sched_yield(); if ((queue == NULL) || (entry->data.queue == queue)) { if (entry->active == true) { - if (dump_context_entry(&(cur->second)) == false) done = false; - else entry->active = false; + if (dump_context_entry(&(cur->second)) == false) + done = false; + else + entry->active = false; } } } @@ -511,7 +538,8 @@ bool context_handler(rocprofiler_group_t group, void* arg) { if (ret) dealloc_context_entry(entry); if (trace_on) { - fprintf(stdout, "tool::handler: context_array %d tid %u\n", (int)(context_array->size()), GetTid()); + fprintf(stdout, "tool::handler: context_array %d tid %u\n", (int)(context_array->size()), + GetTid()); fflush(stdout); } @@ -570,7 +598,8 @@ bool context_handler_con(rocprofiler_group_t group, void* arg) { } if (trace_on) { - fprintf(stdout, "tool::handler_con: context_map %d tid %u\n", (int)(ctx_a_map->size()), GetTid()); + fprintf(stdout, "tool::handler_con: context_map %d tid %u\n", (int)(ctx_a_map->size()), + GetTid()); fflush(stdout); } @@ -582,7 +611,8 @@ bool context_handler_con(rocprofiler_group_t group, void* arg) { return false; } -bool check_filter(const rocprofiler_callback_data_t* callback_data, const callbacks_data_t* tool_data) { +bool check_filter(const rocprofiler_callback_data_t* callback_data, + const callbacks_data_t* tool_data) { bool found = true; std::vector* range_ptr = tool_data->range; @@ -604,7 +634,7 @@ bool check_filter(const rocprofiler_callback_data_t* callback_data, const callba } } } - std::vector* kernel_string = tool_data->kernel_string; + std::vector* kernel_string = tool_data->kernel_string; if (found && kernel_string) { found = false; for (const std::string& s : *kernel_string) { @@ -643,8 +673,9 @@ enum amd_compute_gfx10_gfx11_pgm_rsrc_three_t { }; // Kernel code properties. -enum amd_kernel_code_property_t { - AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER, 0, 1), +enum amd_kernel_code_property_t { + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER, 0, + 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, 1, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR, 2, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR, 3, 1), @@ -652,7 +683,8 @@ enum amd_kernel_code_property_t { AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, 5, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, 6, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_RESERVED0, 7, 3), - AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32, 10, 1), // GFX10+ + AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32, 10, + 1), // GFX10+ AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK, 11, 1), AMD_HSA_BITS_CREATE_ENUM_ENTRIES(AMD_KERNEL_CODE_PROPERTY_RESERVED1, 12, 4), }; @@ -669,38 +701,48 @@ static const kernel_descriptor_t* GetKernelCode(uint64_t kernel_object) { return kernel_code; } -static uint32_t arch_vgpr_count(const AgentInfo &info, const kernel_descriptor_t &kernel_code) { +static uint32_t arch_vgpr_count(const AgentInfo& info, const kernel_descriptor_t& kernel_code) { if (strcmp(info.name, "gfx90a") == 0 || strncmp(info.name, "gfx94", 5) == 0) - return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc3, AMD_COMPUTE_PGM_RSRC_THREE_ACCUM_OFFSET) + 1) * 4; + return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc3, + AMD_COMPUTE_PGM_RSRC_THREE_ACCUM_OFFSET) + + 1) * + 4; - return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT) + 1) - * (AMD_HSA_BITS_GET(kernel_code.kernel_code_properties, AMD_KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32) ? 8 : 4); + return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc1, + AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT) + + 1) * + (AMD_HSA_BITS_GET(kernel_code.kernel_code_properties, + AMD_KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32) + ? 8 + : 4); } -static uint32_t accum_vgpr_count(const AgentInfo &info, const kernel_descriptor_t &kernel_code) { - if (strcmp(info.name, "gfx908") == 0) - return arch_vgpr_count(info, kernel_code); +static uint32_t accum_vgpr_count(const AgentInfo& info, const kernel_descriptor_t& kernel_code) { + if (strcmp(info.name, "gfx908") == 0) return arch_vgpr_count(info, kernel_code); if (strcmp(info.name, "gfx90a") == 0 || strncmp(info.name, "gfx94", 5) == 0) return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc1, - AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT) + 1) * 8 - - arch_vgpr_count(info, kernel_code); + AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT) + + 1) * + 8 - + arch_vgpr_count(info, kernel_code); return 0; } -static uint32_t sgpr_count(const AgentInfo &info, const kernel_descriptor_t &kernel_code) { +static uint32_t sgpr_count(const AgentInfo& info, const kernel_descriptor_t& kernel_code) { // GFX10 and later always allocate 128 sgprs. - if (std::atoi(&info.gfxip[3]) >= 10) - return 128; + if (std::atoi(&info.gfxip[3]) >= 10) return 128; return (AMD_HSA_BITS_GET(kernel_code.compute_pgm_rsrc1, - AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT) / 2 + 1) * 16; + AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT) / + 2 + + 1) * + 16; } // Setting kernel properties void set_kernel_properties(const rocprofiler_callback_data_t* callback_data, - context_entry_t* entry) -{ + context_entry_t* entry) { const hsa_kernel_dispatch_packet_t* packet = callback_data->packet; kernel_properties_t* kernel_properties_ptr = &(entry->kernel_properties); const kernel_descriptor_t* kernel_code = (kernel_descriptor_t*)callback_data->kernel_code; @@ -718,18 +760,22 @@ void set_kernel_properties(const rocprofiler_callback_data_t* callback_data, uint64_t grid_size = packet->grid_size_x * packet->grid_size_y * packet->grid_size_z; if (grid_size > UINT32_MAX) abort(); kernel_properties_ptr->grid_size = (uint32_t)grid_size; - uint64_t workgroup_size = packet->workgroup_size_x * packet->workgroup_size_y * packet->workgroup_size_z; + uint64_t workgroup_size = + packet->workgroup_size_x * packet->workgroup_size_y * packet->workgroup_size_z; if (workgroup_size > UINT32_MAX) abort(); kernel_properties_ptr->workgroup_size = (uint32_t)workgroup_size; kernel_properties_ptr->lds_size = packet->group_segment_size; kernel_properties_ptr->scratch_size = packet->private_segment_size; const AgentInfo* agent_info = HsaRsrcFactory::Instance().GetAgentInfo(callback_data->agent); - assert (agent_info != nullptr); + assert(agent_info != nullptr); kernel_properties_ptr->arch_vgpr_count = arch_vgpr_count(*agent_info, *kernel_code); kernel_properties_ptr->accum_vgpr_count = accum_vgpr_count(*agent_info, *kernel_code); kernel_properties_ptr->sgpr_count = sgpr_count(*agent_info, *kernel_code); - kernel_properties_ptr->wave_size = AMD_HSA_BITS_GET(kernel_code->kernel_code_properties, - AMD_KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32) ? 32 : 64; + kernel_properties_ptr->wave_size = + AMD_HSA_BITS_GET(kernel_code->kernel_code_properties, + AMD_KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32) + ? 32 + : 64; kernel_properties_ptr->signal = callback_data->completion_signal; kernel_properties_ptr->object = callback_data->packet->kernel_object; } @@ -737,6 +783,7 @@ void set_kernel_properties(const rocprofiler_callback_data_t* callback_data, // Kernel disoatch callback hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* user_data, rocprofiler_group_t* group) { + if(!allow_profiling) return HSA_STATUS_SUCCESS; // TODO: return success, make atomic flag // Passed tool data callbacks_data_t* tool_data = reinterpret_cast(user_data); @@ -781,8 +828,8 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, // Open profiling context rocprofiler_t* context = NULL; - status = rocprofiler_open(callback_data->agent, features, feature_count, - &context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties); + status = rocprofiler_open(callback_data->agent, features, feature_count, &context, + 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties); check_status(status); // Check that we have only one profiling group @@ -805,7 +852,8 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, reinterpret_cast*>(&entry->valid)->store(true); if (trace_on) { - fprintf(stdout, "tool::dispatch: context_array %d tid %u\n", (int)(context_array->size()), GetTid()); + fprintf(stdout, "tool::dispatch: context_array %d tid %u\n", (int)(context_array->size()), + GetTid()); fflush(stdout); } @@ -813,8 +861,8 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, } // Kernel disoatch callback -hsa_status_t dispatch_callback_opt(const rocprofiler_callback_data_t* callback_data, void* user_data, - rocprofiler_group_t* group) { +hsa_status_t dispatch_callback_opt(const rocprofiler_callback_data_t* callback_data, + void* user_data, rocprofiler_group_t* group) { hsa_status_t status = HSA_STATUS_ERROR; hsa_agent_t agent = callback_data->agent; const unsigned gpu_id = HsaRsrcFactory::Instance().GetAgentInfo(agent)->dev_index; @@ -847,19 +895,20 @@ hsa_status_t destroy_callback(hsa_queue_t* queue, void*) { return HSA_STATUS_SUCCESS; } -static hsa_status_t info_callback(const rocprofiler_info_data_t info, void * arg) { +static hsa_status_t info_callback(const rocprofiler_info_data_t info, void* arg) { const char symb = *reinterpret_cast(arg); if (((symb == 'b') && (info.metric.expr == NULL)) || - ((symb == 'd') && (info.metric.expr != NULL))) - { + ((symb == 'd') && (info.metric.expr != NULL))) { if (info.metric.expr != NULL) { - fprintf(stdout, "\n gpu-agent%d : %s : %s\n", info.agent_index, info.metric.name, info.metric.description); + fprintf(stdout, "\n gpu-agent%d : %s : %s\n", info.agent_index, info.metric.name, + info.metric.description); fprintf(stdout, " %s = %s\n", info.metric.name, info.metric.expr); } else { fprintf(stdout, "\n gpu-agent%d : %s", info.agent_index, info.metric.name); if (info.metric.instances > 1) fprintf(stdout, "[0-%u]", info.metric.instances - 1); fprintf(stdout, " : %s\n", info.metric.description); - fprintf(stdout, " block %s has %u counters\n", info.metric.block_name, info.metric.block_counters); + fprintf(stdout, " block %s has %u counters\n", info.metric.block_name, + info.metric.block_counters); } fflush(stdout); } @@ -873,11 +922,14 @@ std::string normalize_token(const std::string& token, bool not_empty, const std: std::string error_str = "none"; if (first_pos != std::string::npos) { const size_t last_pos = token.find_last_not_of(space_chars_set); - if (last_pos == std::string::npos) error_str = "token string error: \"" + token + "\""; + if (last_pos == std::string::npos) + error_str = "token string error: \"" + token + "\""; else { const size_t end_pos = last_pos + 1; - if (end_pos <= first_pos) error_str = "token string error: \"" + token + "\""; - else norm_len = end_pos - first_pos; + if (end_pos <= first_pos) + error_str = "token string error: \"" + token + "\""; + else + norm_len = end_pos - first_pos; } } if (((first_pos != std::string::npos) && (norm_len == 0)) || @@ -887,7 +939,8 @@ std::string normalize_token(const std::string& token, bool not_empty, const std: return (norm_len != 0) ? token.substr(first_pos, norm_len) : std::string(""); } -int get_xml_array(const xml::Xml::level_t* node, const std::string& field, const std::string& delim, std::vector* vec, const char* label = NULL) { +int get_xml_array(const xml::Xml::level_t* node, const std::string& field, const std::string& delim, + std::vector* vec, const char* label = NULL) { int parse_iter = 0; const auto& opts = node->opts; auto it = opts.find(field); @@ -911,7 +964,9 @@ int get_xml_array(const xml::Xml::level_t* node, const std::string& field, const return parse_iter; } -int get_xml_array(xml::Xml* xml, const std::string& tag, const std::string& field, const std::string& delim, std::vector* vec, const char* label = NULL) { +int get_xml_array(xml::Xml* xml, const std::string& tag, const std::string& field, + const std::string& delim, std::vector* vec, + const char* label = NULL) { int parse_iter = 0; const auto nodes = xml->GetNodes(tag); auto rit = nodes.rbegin(); @@ -923,12 +978,13 @@ int get_xml_array(xml::Xml* xml, const std::string& tag, const std::string& fiel } if (rit != rend) { parse_iter = get_xml_array(*rit, field, delim, vec, label); - //fatal("Tokens array parsing error, file '" + xml->GetName() + "', " + tag + "::" + field); + // fatal("Tokens array parsing error, file '" + xml->GetName() + "', " + tag + "::" + field); } return parse_iter; } -int get_xml_array(xml::Xml* xml, const std::string& tag, const std::string& field, const std::string& delim, std::vector* vec, const char* label = NULL) { +int get_xml_array(xml::Xml* xml, const std::string& tag, const std::string& field, + const std::string& delim, std::vector* vec, const char* label = NULL) { std::vector str_vec; const int parse_iter = get_xml_array(xml, tag, field, delim, &str_vec, label); for (const std::string& str : str_vec) vec->push_back(atoi(str.c_str())); @@ -937,21 +993,18 @@ int get_xml_array(xml::Xml* xml, const std::string& tag, const std::string& fiel static inline void check_env_var(const char* var_name, uint32_t& val) { const char* str = getenv(var_name); - if (str != NULL ) val = atol(str); + if (str != NULL) val = atol(str); } static inline void check_env_var(const char* var_name, uint64_t& val) { const char* str = getenv(var_name); - if (str != NULL ) val = atoll(str); + if (str != NULL) val = atoll(str); } // HSA intercepting routines // HSA unified callback function -hsa_status_t hsa_unified_callback( - rocprofiler_hsa_cb_id_t id, - const rocprofiler_hsa_callback_data_t* data, - void* arg) -{ +hsa_status_t hsa_unified_callback(rocprofiler_hsa_cb_id_t id, + const rocprofiler_hsa_callback_data_t* data, void* arg) { printf("hsa_unified_callback(%d, %p, %p):\n", (int)id, data, arg); if (data == NULL) abort(); @@ -992,27 +1045,22 @@ hsa_status_t hsa_unified_callback( } // HSA callbacks structure -rocprofiler_hsa_callbacks_t hsa_callbacks { - hsa_unified_callback, - hsa_unified_callback, - hsa_unified_callback, - hsa_unified_callback, - NULL, - NULL -}; +rocprofiler_hsa_callbacks_t hsa_callbacks{hsa_unified_callback, + hsa_unified_callback, + hsa_unified_callback, + hsa_unified_callback, + NULL, + NULL}; // HSA kernel symbol callback -hsa_status_t hsa_ksymbol_cb(rocprofiler_hsa_cb_id_t id, - const rocprofiler_hsa_callback_data_t* data, - void* arg) -{ +hsa_status_t hsa_ksymbol_cb(rocprofiler_hsa_cb_id_t id, const rocprofiler_hsa_callback_data_t* data, + void* arg) { HsaRsrcFactory::SetKernelNameRef(data->ksymbol.object, data->ksymbol.name, data->ksymbol.unload); return HSA_STATUS_SUCCESS; } // Tool constructor -extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) -{ +extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) { ONLOAD_TRACE_BEG(); if (pthread_mutex_lock(&mutex) != 0) { @@ -1046,32 +1094,52 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) for (auto* entry : defaults_list) { const auto& opts = entry->opts; auto it = opts.find("basenames"); - if (it != opts.end()) { to_truncate_names = (it->second == "on") ? 1 : 0; } + if (it != opts.end()) { + to_truncate_names = (it->second == "on") ? 1 : 0; + } it = opts.find("timestamp"); - if (it != opts.end()) { settings->timestamp_on = (it->second == "on") ? 1 : 0; } + if (it != opts.end()) { + settings->timestamp_on = (it->second == "on") ? 1 : 0; + } it = opts.find("ctx-wait"); - if (it != opts.end()) { CTX_OUTSTANDING_WAIT = atol(it->second.c_str()); } + if (it != opts.end()) { + CTX_OUTSTANDING_WAIT = atol(it->second.c_str()); + } it = opts.find("ctx-limit"); - if (it != opts.end()) { CTX_OUTSTANDING_MAX = atol(it->second.c_str()); } + if (it != opts.end()) { + CTX_OUTSTANDING_MAX = atol(it->second.c_str()); + } it = opts.find("heartbeat"); - if (it != opts.end()) { CTX_OUTSTANDING_MON = atol(it->second.c_str()); } + if (it != opts.end()) { + CTX_OUTSTANDING_MON = atol(it->second.c_str()); + } it = opts.find("trace-size"); if (it != opts.end()) { std::string str = normalize_token(it->second, true, "option trace-size"); uint32_t multiplier = 1; switch (str.back()) { - case 'K': multiplier = 1024; break; - case 'M': multiplier = 1024 * 1024; break; + case 'K': + multiplier = 1024; + break; + case 'M': + multiplier = 1024 * 1024; + break; } if (multiplier != 1) str = str.substr(0, str.length() - 1); settings->trace_size = strtoull(str.c_str(), NULL, 0) * multiplier; } it = opts.find("trace-local"); - if (it != opts.end()) { settings->trace_local = (it->second == "on"); } + if (it != opts.end()) { + settings->trace_local = (it->second == "on"); + } it = opts.find("obj-tracking"); - if (it != opts.end()) { settings->code_obj_tracking = (it->second == "on"); } + if (it != opts.end()) { + settings->code_obj_tracking = (it->second == "on"); + } it = opts.find("memcopies"); - if (it != opts.end()) { settings->memcopy_tracking = (it->second == "on"); } + if (it != opts.end()) { + settings->memcopy_tracking = (it->second == "on"); + } } } // Enable verbose mode @@ -1110,9 +1178,12 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) if (*info_symb != 'b' && *info_symb != 'd') { fprintf(stderr, "ROCProfiler: bad info symbol '%c', ROCP_INFO env", *info_symb); } else { - if (*info_symb == 'b') printf("Basic HW counters:\n"); - else printf("Derived metrics:\n"); - hsa_status_t status = rocprofiler_iterate_info(NULL, ROCPROFILER_INFO_KIND_METRIC, info_callback, info_symb); + if (*info_symb == 'b') + printf("Basic HW counters:\n"); + else + printf("Derived metrics:\n"); + hsa_status_t status = + rocprofiler_iterate_info(NULL, ROCPROFILER_INFO_KIND_METRIC, info_callback, info_symb); check_status(status); } exit(1); @@ -1137,7 +1208,8 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) perror(errmsg.str().c_str()); abort(); } - } else result_file_handle = stdout; + } else + result_file_handle = stdout; result_file_opened = (result_prefix != NULL) && (result_file_handle != NULL); @@ -1173,8 +1245,7 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) // Skipping cpu count to get to correct gpu index const uint32_t cpu_count = HsaRsrcFactory::Instance().GetCountOfCpuAgents(); - std::transform(gpu_index_vec->begin(), gpu_index_vec->end(), - gpu_index_vec->begin(), + std::transform(gpu_index_vec->begin(), gpu_index_vec->end(), gpu_index_vec->begin(), [&](int count) { return count + cpu_count; }); // Getting kernel names @@ -1184,15 +1255,15 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) // Getting profiling range range_vec = new std::vector; const int range_parse_iter = get_xml_array(xml, "top.metric", "range", ":", range_vec, " "); - if ((range_vec->size() > 2) || (range_parse_iter > 1)) - { + if ((range_vec->size() > 2) || (range_parse_iter > 1)) { fatal("Bad range format, input file " + xml->GetName()); } if ((range_vec->size() == 1) && (range_parse_iter == 0)) { range_vec->push_back(*(range_vec->begin()) + 1); } - const bool filter_disabled = (gpu_index_vec->empty() && kernel_string_vec->empty() && range_vec->empty()); + const bool filter_disabled = + (gpu_index_vec->empty() && kernel_string_vec->empty() && range_vec->empty()); // Getting traces const auto traces_list = xml->GetNodes("top.trace"); @@ -1214,12 +1285,37 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) const uint32_t features_found = metrics_vec.size(); + // Getting Trace Period + const char* ctrl_str = getenv("ROCP_CTRL_RATE"); + if (ctrl_str != nullptr) { + uint32_t ctrl_delay = 0; + uint32_t ctrl_len = 0; + uint32_t ctrl_rate = 0; + + if (sscanf(ctrl_str, "%d:%d:%d", &ctrl_delay, &ctrl_len, &ctrl_rate) != 3 || + ctrl_len > ctrl_rate) + std::cerr << "Invalid ROCP_CTRL_RATE variable (ctrl_delay:ctrl_len:ctrl_rate)" << std::endl; + + control_dist_us = ctrl_rate - ctrl_len; + control_len_us = ctrl_len; + control_delay_us = ctrl_delay; + + if (ctrl_delay != UINT32_MAX) { + std::cout << "ROCProfiler: trace control: delay(" << ctrl_delay << "us), length(" << ctrl_len + << "us), rate(" << ctrl_rate << "us)" << std::endl; + trace_period_thread = new std::thread(trace_period_fun); + } else { + std::cout << "ROCProfiler: trace start disabled" << std::endl; + } + } else { + allow_profiling = true; + } + // Context array aloocation context_array = new context_array_t; - bool opt_mode_cond = ((features_found != 0) && - (metrics_set->empty()) && - (filter_disabled == true)); + bool opt_mode_cond = + ((features_found != 0) && (metrics_set->empty()) && (filter_disabled == true)); if (settings->opt_mode == 0) opt_mode_cond = false; if (!opt_mode_cond) settings->opt_mode = 0; if (opt_mode_cond) { @@ -1238,7 +1334,7 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) // Available GPU agents const unsigned gpu_count = HsaRsrcFactory::Instance().GetCountOfGpuAgents(); callbacks_arg_t* callbacks_arg = new callbacks_arg_t{}; - callbacks_arg->pools = new rocprofiler_pool_t* [gpu_count]; + callbacks_arg->pools = new rocprofiler_pool_t*[gpu_count]; for (unsigned gpu_id = 0; gpu_id < gpu_count; gpu_id++) { // Getting GPU device info const AgentInfo* agent_info = NULL; @@ -1281,11 +1377,12 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) callbacks_data->file_handle = result_file_handle; callbacks_data->gpu_index = (gpu_index_vec->empty()) ? NULL : gpu_index_vec; callbacks_data->kernel_string = (kernel_string_vec->empty()) ? NULL : kernel_string_vec; - callbacks_data->range = (range_vec->empty()) ? NULL : range_vec;; + callbacks_data->range = (range_vec->empty()) ? NULL : range_vec; + ; callbacks_data->filter_on = (callbacks_data->gpu_index != NULL) || - (callbacks_data->kernel_string != NULL) || - (callbacks_data->range != NULL) - ? 1 : 0; + (callbacks_data->kernel_string != NULL) || (callbacks_data->range != NULL) + ? 1 + : 0; rocprofiler_set_queue_callbacks(callbacks_ptrs, callbacks_data); } @@ -1296,7 +1393,11 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) pthread_t thread; pthread_attr_t attr; int err = pthread_attr_init(&attr); - if (err) { errno = err; perror("pthread_attr_init"); abort(); } + if (err) { + errno = err; + perror("pthread_attr_init"); + abort(); + } err = pthread_create(&thread, &attr, monitor_thr_fun, NULL); } @@ -1324,7 +1425,8 @@ void rocprofiler_unload(bool is_destr) { // Dump stored profiling output data fflush(stdout); if (result_file_opened) { - printf("\nROCPRofiler:"); fflush(stdout); + printf("\nROCPRofiler:"); + fflush(stdout); if (CTX_OUTSTANDING_WAIT == 1) dump_context_array(NULL); fclose(result_file_handle); printf(" %u contexts collected, output directory %s\n", context_collected, result_prefix);