/****************************************************************************** MIT License Copyright (c) 2018 ROCm Core Technology Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. *******************************************************************************/ /////////////////////////////////////////////////////////////////////////////// // // // Test tool used as ROC profiler library demo // // // /////////////////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include /* For SYS_xxx definitions */ #include #include #include #include #include #include #include #include #include "inc/rocprofiler.h" #include "util/hsa_rsrc_factory.h" #include "util/xml.h" #define PUBLIC_API __attribute__((visibility("default"))) #define CONSTRUCTOR_API __attribute__((constructor)) #define DESTRUCTOR_API __attribute__((destructor)) #define KERNEL_NAME_LEN_MAX 128 // Disoatch callback data type struct callbacks_data_t { rocprofiler_feature_t* features; unsigned feature_count; std::vector* set; unsigned group_index; FILE* file_handle; int filter_on; std::vector* gpu_index; std::vector* kernel_string; std::vector* range; }; // Context stored entry type struct context_entry_t { uint32_t valid; uint32_t index; hsa_agent_t agent; rocprofiler_group_t group; rocprofiler_feature_t* features; unsigned feature_count; rocprofiler_callback_data_t data; FILE* file_handle; }; // const std::string rcfile_name = "rpl_rc.xml"; // verbose mode static uint32_t verbose = 0; // Enable tracing static const bool trace_on = false; // Tool is unloaded volatile bool is_loaded = false; // Dispatch callbacks and context handlers synchronization pthread_mutex_t mutex = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; // Dispatch callback data callbacks_data_t* callbacks_data = NULL; // Stored contexts array typedef std::map context_array_t; context_array_t* context_array = NULL; typedef std::list wait_list_t; wait_list_t* wait_list = NULL; // Contexts collected count volatile uint32_t context_count = 0; volatile uint32_t context_collected = 0; // Profiling results output file name const char* result_prefix = NULL; // Global results file handle FILE* result_file_handle = NULL; // True if a result file is opened bool result_file_opened = false; // Dispatch filters // Metrics set std::vector* metrics_set = NULL; // GPU index filter std::vector* gpu_index_vec = NULL; // Kernel name filter std::vector* kernel_string_vec = NULL; // DIspatch number range filter std::vector* range_vec = NULL; // Otstanding dispatches parameters static uint32_t CTX_OUTSTANDING_MAX = 0; static uint32_t CTX_OUTSTANDING_MON = 0; // to truncate kernel names uint32_t to_truncate_names = 0; // local SQTT buffer bool is_sqtt_local = true; static inline uint32_t GetPid() { return syscall(__NR_getpid); } static inline uint32_t GetTid() { return syscall(__NR_gettid); } // Error handler void fatal(const std::string msg) { fflush(stdout); fprintf(stderr, "%s\n\n", msg.c_str()); fflush(stderr); abort(); } // Check returned HSA API status void check_status(hsa_status_t status) { if (status != HSA_STATUS_SUCCESS) { const char* error_string = NULL; rocprofiler_error_string(&error_string); fprintf(stderr, "ERROR: %s\n", error_string); abort(); } } std::string filtr_kernel_name(const std::string name) { auto rit = name.rbegin(); auto rend = name.rend(); uint32_t counter = 0; char open_token = 0; char close_token = 0; while (rit != rend) { if (counter == 0) { switch (*rit) { case ')': counter = 1; open_token = ')'; close_token = '('; break; case '>': counter = 1; open_token = '>'; close_token = '<'; break; } if (counter == 0) break; } else { if (*rit == open_token) counter++; if (*rit == close_token) counter--; } ++rit; } while (((*rit == ' ') || (*rit == ' ')) && (rit != rend)) rit++; auto rbeg = rit; while ((*rit != ' ') && (*rit != ':') && (rit != rend)) rit++; const uint32_t pos = rend - rit; const uint32_t length = rit - rbeg; return name.substr(pos, length); } void* monitor_thr_fun(void*) { while (context_array != NULL) { sleep(CTX_OUTSTANDING_MON); if (pthread_mutex_lock(&mutex) != 0) { perror("pthread_mutex_lock"); abort(); } const uint32_t inflight = context_count - context_collected; std::cerr << std::flush; std::clog << std::flush; std::cout << "ROCProfiler: count(" << context_count << "), outstanding(" << inflight << "/" << CTX_OUTSTANDING_MAX << ")" << std::endl << std::flush; if (pthread_mutex_unlock(&mutex) != 0) { perror("pthread_mutex_unlock"); abort(); } } return NULL; } uint32_t next_context_count() { if (pthread_mutex_lock(&mutex) != 0) { perror("pthread_mutex_lock"); abort(); } ++context_count; if (pthread_mutex_unlock(&mutex) != 0) { perror("pthread_mutex_unlock"); abort(); } return context_count; } // Allocate entry to store profiling context context_entry_t* alloc_context_entry() { if (CTX_OUTSTANDING_MAX != 0) { while((context_count - context_collected) > CTX_OUTSTANDING_MAX) usleep(1000); } if (pthread_mutex_lock(&mutex) != 0) { perror("pthread_mutex_lock"); abort(); } const uint32_t index = next_context_count() - 1; auto ret = context_array->insert({index, context_entry_t{}}); if (ret.second == false) { fprintf(stderr, "context_array corruption, index repeated %u\n", index); abort(); } if (pthread_mutex_unlock(&mutex) != 0) { perror("pthread_mutex_unlock"); abort(); } context_entry_t* entry = &(ret.first->second); entry->index = index; return entry; } // Allocate entry to store profiling context void dealloc_context_entry(context_entry_t* entry) { if (pthread_mutex_lock(&mutex) != 0) { perror("pthread_mutex_lock"); abort(); } assert(context_array != NULL); context_array->erase(entry->index); if (pthread_mutex_unlock(&mutex) != 0) { perror("pthread_mutex_unlock"); abort(); } } // Dump trace data to file void dump_sqtt_trace(const char* label, const uint32_t chunk, const void* data, const uint32_t& size) { if (result_prefix != NULL) { // Open SQTT file std::ostringstream oss; oss << result_prefix << "/thread_trace_" << label << "_se" << chunk << ".out"; FILE* file = fopen(oss.str().c_str(), "w"); if (file == NULL) { std::ostringstream errmsg; errmsg << "fopen error, file '" << oss.str().c_str() << "'"; perror(errmsg.str().c_str()); abort(); } // Write the buffer in terms of shorts (16 bits) const unsigned short* ptr = reinterpret_cast(data); for (uint32_t i = 0; i < (size / sizeof(short)); ++i) { fprintf(file, "%04x\n", ptr[i]); } // Close SQTT file fclose(file); } } struct trace_data_arg_t { FILE* file; const char* label; hsa_agent_t agent; }; // Trace data callback for getting trace data from GPU local mamory hsa_status_t trace_data_cb(hsa_ven_amd_aqlprofile_info_type_t info_type, hsa_ven_amd_aqlprofile_info_data_t* info_data, void* data) { hsa_status_t status = HSA_STATUS_SUCCESS; trace_data_arg_t* arg = reinterpret_cast(data); if (info_type == HSA_VEN_AMD_AQLPROFILE_INFO_SQTT_DATA) { const void* data_ptr = info_data->sqtt_data.ptr; const uint32_t data_size = info_data->sqtt_data.size; fprintf(arg->file, " SE(%u) size(%u)\n", info_data->sample_id, data_size); if (is_sqtt_local) { HsaRsrcFactory* hsa_rsrc = &HsaRsrcFactory::Instance(); const AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(arg->agent); const uint32_t mem_size = data_size; void* buffer = hsa_rsrc->AllocateSysMemory(agent_info, mem_size); if(!hsa_rsrc->Memcpy(agent_info, buffer, data_ptr, mem_size)) { fatal("SQTT data memcopy to host failed"); } dump_sqtt_trace(arg->label, info_data->sample_id, buffer, data_size); HsaRsrcFactory::FreeMemory(buffer); } else { dump_sqtt_trace(arg->label, info_data->sample_id, data_ptr, data_size); } } else status = HSA_STATUS_ERROR; return status; } // Align to specified alignment unsigned align_size(unsigned size, unsigned alignment) { return ((size + alignment - 1) & ~(alignment - 1)); } // Output profiling results for input features void output_results(const context_entry_t* entry, const char* label) { FILE* file = entry->file_handle; const rocprofiler_feature_t* features = entry->features; const unsigned feature_count = entry->feature_count; rocprofiler_t* context = entry->group.context; for (unsigned i = 0; i < feature_count; ++i) { const rocprofiler_feature_t* p = &features[i]; fprintf(file, " %s ", p->name); switch (p->data.kind) { // Output metrics results case ROCPROFILER_DATA_KIND_INT64: fprintf(file, "(%lu)\n", p->data.result_int64); break; // Output trace results case ROCPROFILER_DATA_KIND_BYTES: { if (p->data.result_bytes.copy) { uint64_t size = 0; const char* ptr = reinterpret_cast(p->data.result_bytes.ptr); const char* end = reinterpret_cast(ptr + p->data.result_bytes.size); for (unsigned i = 0; i < p->data.result_bytes.instance_count; ++i) { const uint32_t chunk_size = *reinterpret_cast(ptr); const char* chunk_data = ptr + sizeof(uint32_t); if (chunk_data >= end) fatal("SQTT data is out of the result buffer size"); dump_sqtt_trace(label, i, chunk_data, chunk_size); const uint32_t off = align_size(chunk_size, sizeof(uint32_t)); ptr = chunk_data + off; if (chunk_data >= end) fatal("SQTT data ptr is out of the result buffer size"); size += chunk_size; } fprintf(file, "size(%lu)\n", size); HsaRsrcFactory::FreeMemory(p->data.result_bytes.ptr); const_cast(p)->data.result_bytes.size = 0; } else { fprintf(file, "(\n"); trace_data_arg_t trace_data_arg{file, label, entry->agent}; hsa_status_t status = rocprofiler_iterate_trace_data(context, trace_data_cb, reinterpret_cast(&trace_data_arg)); check_status(status); fprintf(file, " )\n"); } break; } default: fprintf(stderr, "RPL-tool: undefined data kind(%u)\n", p->data.kind); abort(); } } } // Output group intermeadate profiling results, created internally for complex metrics void output_group(const context_entry_t* entry, const char* label) { const rocprofiler_group_t* group = &(entry->group); context_entry_t group_entry = *entry; for (unsigned i = 0; i < group->feature_count; ++i) { if (group->features[i]->data.kind == ROCPROFILER_DATA_KIND_INT64) { group_entry.features = group->features[i]; group_entry.feature_count = 1; output_results(&group_entry, label); } } } // Dump stored context profiling output data bool dump_context(context_entry_t* entry) { hsa_status_t status = HSA_STATUS_ERROR; if (entry->valid == 0) return true; const rocprofiler_dispatch_record_t* record = entry->data.record; if (record) { if (record->complete == 0) { return false; } } ++context_collected; const uint32_t index = entry->index; FILE* file_handle = entry->file_handle; const std::string nik_name = (to_truncate_names == 0) ? entry->data.kernel_name : filtr_kernel_name(entry->data.kernel_name); fprintf(file_handle, "dispatch[%u], queue_index(%lu), kernel_name(\"%s\")", index, entry->data.queue_index, nik_name.c_str()); if (record) fprintf(file_handle, ", time(%lu,%lu,%lu,%lu)", record->dispatch, record->begin, record->end, record->complete); fprintf(file_handle, "\n"); fflush(file_handle); if (record) { delete record; entry->data.record = NULL; } rocprofiler_group_t& group = entry->group; if (group.context != NULL) { status = rocprofiler_group_get_data(&group); check_status(status); if (verbose == 1) output_group(entry, "group0-data"); status = rocprofiler_get_metrics(group.context); check_status(status); std::ostringstream oss; oss << index << "__" << filtr_kernel_name(entry->data.kernel_name); output_results(entry, oss.str().substr(0, KERNEL_NAME_LEN_MAX).c_str()); free(const_cast(entry->data.kernel_name)); // Finishing cleanup // Deleting profiling context will delete all allocated resources rocprofiler_close(group.context); } entry->valid = 0; return true; } // Dump and clean a given context entry static inline bool dump_context_entry(context_entry_t* entry) { const bool ret = dump_context(entry); if (ret) dealloc_context_entry(entry); return ret; } // Dump waiting entries static inline void dump_wait_list() { if (pthread_mutex_lock(&mutex) != 0) { perror("pthread_mutex_lock"); abort(); } auto it = wait_list->begin(); auto end = wait_list->end(); while (it != end) { auto cur = it++; if (dump_context_entry(*cur)) { wait_list->erase(cur); } } if (pthread_mutex_unlock(&mutex) != 0) { perror("pthread_mutex_unlock"); abort(); } } // Dump all stored contexts profiling output data void dump_context_array() { if (pthread_mutex_lock(&mutex) != 0) { perror("pthread_mutex_lock"); abort(); } if (context_array) { if (!wait_list->empty()) dump_wait_list(); auto it = context_array->begin(); auto end = context_array->end(); while (it != end) { auto cur = it++; dump_context(&(cur->second)); } } if (pthread_mutex_unlock(&mutex) != 0) { perror("pthread_mutex_unlock"); abort(); } } // Profiling completion handler bool handler(rocprofiler_group_t group, void* arg) { context_entry_t* entry = reinterpret_cast(arg); if (pthread_mutex_lock(&mutex) != 0) { perror("pthread_mutex_lock"); abort(); } if (!wait_list->empty()) dump_wait_list(); if (!dump_context_entry(entry)) { wait_list->push_back(entry); } if (trace_on) { fprintf(stdout, "tool::handler: context_array %d tid %u\n", (int)(context_array->size()), GetTid()); fflush(stdout); } if (pthread_mutex_unlock(&mutex) != 0) { perror("pthread_mutex_unlock"); abort(); } return false; } bool check_filter(const rocprofiler_callback_data_t* callback_data, const callbacks_data_t* tool_data) { bool found = true; std::vector* range_ptr = tool_data->range; if (found && range_ptr) { found = false; std::vector& range = *range_ptr; if (range.size() == 1) { if (context_count >= range[0]) found = true; } else if (range.size() == 2) { if ((context_count >= range[0]) && (context_count < range[1])) found = true; } } std::vector* gpu_index = tool_data->gpu_index; if (found && gpu_index) { found = false; for (uint32_t i : *gpu_index) { if (i == callback_data->agent_index) { found = true; } } } std::vector* kernel_string = tool_data->kernel_string; if (found && kernel_string) { found = false; for (const std::string& s : *kernel_string) { if (std::string(callback_data->kernel_name).find(s) != std::string::npos) { found = true; } } } return found; } // Kernel disoatch callback hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* user_data, rocprofiler_group_t* group) { // Passed tool data callbacks_data_t* tool_data = reinterpret_cast(user_data); // HSA status hsa_status_t status = HSA_STATUS_ERROR; // Checking dispatch condition if (tool_data->filter_on == 1) { if (check_filter(callback_data, tool_data) == false) { next_context_count(); return HSA_STATUS_SUCCESS; } } // Profiling context rocprofiler_t* context = NULL; // Context entry context_entry_t* entry = alloc_context_entry(); // context properties rocprofiler_properties_t properties{}; properties.handler = (result_prefix != NULL) ? handler : NULL; properties.handler_arg = (void*)entry; rocprofiler_feature_t* features = tool_data->features; unsigned feature_count = tool_data->feature_count; if (tool_data->set != NULL) { uint32_t set_offset = 0; uint32_t next_offset = 0; const auto entry_index = entry->index; if (entry_index < (tool_data->set->size() - 1)) { set_offset = (*(tool_data->set))[entry_index]; next_offset = (*(tool_data->set))[entry_index + 1]; } else { set_offset = tool_data->set->back(); next_offset = feature_count; } features += set_offset; feature_count = next_offset - set_offset; } if (feature_count > 0) { // Open profiling context status = rocprofiler_open(callback_data->agent, features, feature_count, &context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties); check_status(status); // Check that we have only one profiling group uint32_t group_count = 0; status = rocprofiler_group_count(context, &group_count); check_status(status); assert(group_count == 1); // Get group[0] const uint32_t group_index = 0; status = rocprofiler_get_group(context, group_index, group); check_status(status); } // Fill profiling context entry entry->agent = callback_data->agent; entry->group = *group; entry->features = features; entry->feature_count = feature_count; entry->data = *callback_data; entry->data.kernel_name = strdup(callback_data->kernel_name); entry->file_handle = tool_data->file_handle; entry->valid = 1; if (trace_on) { fprintf(stdout, "tool::dispatch: context_array %d tid %u\n", (int)(context_array->size()), GetTid()); fflush(stdout); } return status; } hsa_status_t destroy_callback(hsa_queue_t* queue, void*) { if (result_file_opened == false) printf("\nROCProfiler results:\n"); dump_context_array(); return HSA_STATUS_SUCCESS; } static hsa_status_t info_callback(const rocprofiler_info_data_t info, void * arg) { const char symb = *reinterpret_cast(arg); if (((symb == 'b') && (info.metric.expr == NULL)) || ((symb == 'd') && (info.metric.expr != NULL))) { if (info.metric.expr != NULL) { fprintf(stdout, "\n gpu-agent%d : %s : %s\n", info.agent_index, info.metric.name, info.metric.description); fprintf(stdout, " %s = %s\n", info.metric.name, info.metric.expr); } else { fprintf(stdout, "\n gpu-agent%d : %s", info.agent_index, info.metric.name); if (info.metric.instances > 1) fprintf(stdout, "[0-%u]", info.metric.instances - 1); fprintf(stdout, " : %s\n", info.metric.description); fprintf(stdout, " block %s has %u counters\n", info.metric.block_name, info.metric.block_counters); } fflush(stdout); } return HSA_STATUS_SUCCESS; } std::string normalize_token(const std::string token, bool not_empty, std::string label) { const std::string space_chars_set = " \t"; const size_t first_pos = token.find_first_not_of(space_chars_set); size_t norm_len = 0; std::string error_str = "none"; if (first_pos != std::string::npos) { const size_t last_pos = token.find_last_not_of(space_chars_set); if (last_pos == std::string::npos) error_str = "token string error: \"" + token + "\""; else { const size_t end_pos = last_pos + 1; if (end_pos <= first_pos) error_str = "token string error: \"" + token + "\""; else norm_len = end_pos - first_pos; } } if (((first_pos != std::string::npos) && (norm_len == 0)) || ((first_pos == std::string::npos) && not_empty)) { fatal(label + ": " + error_str); } return (norm_len != 0) ? token.substr(first_pos, norm_len) : std::string(""); } int get_xml_array(xml::Xml* xml, const std::string& tag, const std::string& field, const std::string& delim, std::vector* vec, const char* label = NULL) { int parse_iter = 0; auto nodes = xml->GetNodes(tag); auto rit = nodes.rbegin(); auto rend = nodes.rend(); while (rit != rend) { auto& opts = (*rit)->opts; if (opts.find(field) != opts.end()) break; ++rit; } if (rit != rend) { const std::string array_string = (*rit)->opts[field]; if (label != NULL) printf("%s%s = %s\n", label, field.c_str(), array_string.c_str()); size_t pos1 = 0; const size_t string_len = array_string.length(); while (pos1 < string_len) { const size_t pos2 = array_string.find(delim, pos1); const bool found = (pos2 != std::string::npos); const size_t token_len = (pos2 != std::string::npos) ? pos2 - pos1 : string_len - pos1; const std::string token = array_string.substr(pos1, token_len); const std::string norm_str = normalize_token(token, found, "Tokens array parsing error, file '" + xml->GetName() + "', " + tag + "::" + field); if (norm_str.length() != 0) vec->push_back(norm_str); if (!found) break; pos1 = pos2 + 1; ++parse_iter; } } return parse_iter; } int get_xml_array(xml::Xml* xml, const std::string& tag, const std::string& field, const std::string& delim, std::vector* vec, const char* label = NULL) { std::vector str_vec; const int parse_iter = get_xml_array(xml, tag, field, delim, &str_vec, label); for (const std::string& str : str_vec) vec->push_back(atoi(str.c_str())); return parse_iter; } static inline void check_env_var(const char* var_name, uint32_t& val) { const char* str = getenv(var_name); if (str != NULL ) val = atol(str); } static inline void check_env_var(const char* var_name, uint64_t& val) { const char* str = getenv(var_name); if (str != NULL ) val = atoll(str); } // Tool constructor extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) { if (pthread_mutex_lock(&mutex) != 0) { perror("pthread_mutex_lock"); abort(); } if (is_loaded) return; is_loaded = true; if (pthread_mutex_unlock(&mutex) != 0) { perror("pthread_mutex_unlock"); abort(); } // Loading configuration rcfile std::string rcpath = std::string("./") + rcfile_name; xml::Xml* rcfile = xml::Xml::Create(rcpath); const char* home_dir = getenv("HOME"); if (rcfile == NULL && home_dir != NULL) { rcpath = std::string(home_dir) + "/" + rcfile_name; rcfile = xml::Xml::Create(rcpath); } const char* pkg_dir = getenv("ROCP_PACKAGE_DIR"); if (rcfile == NULL && pkg_dir != NULL) { rcpath = std::string(pkg_dir) + "/" + rcfile_name; rcfile = xml::Xml::Create(rcpath); } if (rcfile != NULL) { // Getting defaults printf("ROCProfiler: rc-file '%s'\n", rcpath.c_str()); auto defaults_list = rcfile->GetNodes("top.defaults"); for (auto* entry : defaults_list) { const auto& opts = entry->opts; auto it = opts.find("basenames"); if (it != opts.end()) { to_truncate_names = (it->second == "on") ? 1 : 0; } it = opts.find("timestamp"); if (it != opts.end()) { settings->timestamp_on = (it->second == "on") ? 1 : 0; } it = opts.find("ctx-limit"); if (it != opts.end()) { CTX_OUTSTANDING_MAX = atol(it->second.c_str()); } it = opts.find("heartbeat"); if (it != opts.end()) { CTX_OUTSTANDING_MON = atol(it->second.c_str()); } it = opts.find("sqtt-size"); if (it != opts.end()) { std::string str = normalize_token(it->second, true, "option sqtt-size"); uint32_t multiplier = 1; switch (str.back()) { case 'K': multiplier = 1024; break; case 'M': multiplier = 1024 * 1024; break; } if (multiplier != 1) str = str.substr(0, str.length() - 1); settings->sqtt_size = strtoull(str.c_str(), NULL, 0) * multiplier; } it = opts.find("sqtt-local"); if (it != opts.end()) { settings->sqtt_local = (it->second == "on"); } } } // Enable verbose mode check_env_var("ROCP_VERBOSE_MODE", verbose); // Enable kernel names truncating check_env_var("ROCP_TRUNCATE_NAMES", to_truncate_names); // Set outstanding dispatches parameter check_env_var("ROCP_OUTSTANDING_MAX", CTX_OUTSTANDING_MAX); check_env_var("ROCP_OUTSTANDING_MON", CTX_OUTSTANDING_MON); // Enable timestamping check_env_var("ROCP_TIMESTAMP_ON", settings->timestamp_on); // Set data timeout check_env_var("ROCP_DATA_TIMEOUT", settings->timeout); // Set SQTT size check_env_var("ROCP_SQTT_SIZE", settings->sqtt_size); // Set SQTT local buffer check_env_var("ROCP_SQTT_LOCAL", settings->sqtt_local); is_sqtt_local = settings->sqtt_local; // Printing out info char* info_symb = getenv("ROCP_INFO"); if (info_symb != NULL) { if (*info_symb != 'b' && *info_symb != 'd') { fprintf(stderr, "ROCProfiler: bad info symbol '%c', ROCP_INFO env", *info_symb); } else { if (*info_symb == 'b') printf("Basic HW counters:\n"); else printf("Derived metrics:\n"); hsa_status_t status = rocprofiler_iterate_info(NULL, ROCPROFILER_INFO_KIND_METRIC, info_callback, info_symb); check_status(status); } exit(1); } // Set output file result_prefix = getenv("ROCP_OUTPUT_DIR"); if (result_prefix != NULL) { DIR* dir = opendir(result_prefix); if (dir == NULL) { std::ostringstream errmsg; errmsg << "ROCProfiler: Cannot open output directory '" << result_prefix << "'"; perror(errmsg.str().c_str()); abort(); } std::ostringstream oss; oss << result_prefix << "/results.txt"; result_file_handle = fopen(oss.str().c_str(), "w"); if (result_file_handle == NULL) { std::ostringstream errmsg; errmsg << "ROCProfiler: fopen error, file '" << oss.str().c_str() << "'"; perror(errmsg.str().c_str()); abort(); } } else result_file_handle = stdout; result_file_opened = (result_prefix != NULL) && (result_file_handle != NULL); // Getting input const char* xml_name = getenv("ROCP_INPUT"); if (xml_name == NULL) fatal("ROCProfiler: input is not specified, ROCP_INPUT env"); printf("ROCProfiler: input from \"%s\"\n", xml_name); xml::Xml* xml = xml::Xml::Create(xml_name); if (xml == NULL) { fprintf(stderr, "ROCProfiler: Input file not found '%s'\n", xml_name); abort(); } // Getting metrics std::vector metrics_vec; get_xml_array(xml, "top.metric", "name", ",", &metrics_vec); // Metrics set metrics_set = new std::vector; get_xml_array(xml, "top.metric", "set", ",", metrics_set, " "); if (metrics_set->size() != 0) { uint32_t accum = 0; metrics_set->insert(metrics_set->begin(), 0); for (auto it = metrics_set->begin(); it != metrics_set->end(); ++it) { accum += *it; *it = accum; } } // Getting GPU indexes gpu_index_vec = new std::vector; get_xml_array(xml, "top.metric", "gpu_index", ",", gpu_index_vec, " "); // Getting kernel names kernel_string_vec = new std::vector; get_xml_array(xml, "top.metric", "kernel", ",", kernel_string_vec, " "); // Getting profiling range range_vec = new std::vector; const int range_parse_iter = get_xml_array(xml, "top.metric", "range", ":", range_vec, " "); if ((range_vec->size() > 2) || (range_parse_iter > 1)) { fatal("Bad range format, input file " + xml->GetName()); } if ((range_vec->size() == 1) && (range_parse_iter == 0)) { range_vec->push_back(*(range_vec->begin()) + 1); } // Getting traces auto traces_list = xml->GetNodes("top.trace"); const unsigned feature_count = metrics_vec.size() + traces_list.size(); rocprofiler_feature_t* features = new rocprofiler_feature_t[feature_count]; memset(features, 0, feature_count * sizeof(rocprofiler_feature_t)); printf(" %d metrics\n", (int)metrics_vec.size()); for (unsigned i = 0; i < metrics_vec.size(); ++i) { const std::string& name = metrics_vec[i]; printf("%s%s", (i == 0) ? " " : ", ", name.c_str()); features[i] = {}; features[i].kind = ROCPROFILER_FEATURE_KIND_METRIC; features[i].name = strdup(name.c_str()); } if (metrics_vec.size()) printf("\n"); printf(" %d traces\n", (int)traces_list.size()); unsigned index = metrics_vec.size(); for (auto* entry : traces_list) { auto params_list = xml->GetNodes("top.trace.parameters"); if (params_list.size() > 1) { fatal("ROCProfiler: Single input 'parameters' section is supported"); } std::string name = ""; bool to_copy_data = false; for (const auto& opt : entry->opts) { if (opt.first == "name") name = opt.second; else if (opt.first == "copy") to_copy_data = (opt.second == "true"); else fatal("ROCProfiler: Bad trace property '" + opt.first + "'"); } if (name == "") fatal("ROCProfiler: Bad trace properties, name is not specified"); std::map parameters_dict; parameters_dict["TARGET_CU"] = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET; parameters_dict["VM_ID_MASK"] = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK; parameters_dict["MASK"] = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK; parameters_dict["TOKEN_MASK"] = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK; parameters_dict["TOKEN_MASK2"] = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2; #ifdef AQLPROF_NEW_API parameters_dict["SE_MASK"] = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK; #endif printf(" %s (", name.c_str()); features[index] = {}; features[index].kind = ROCPROFILER_FEATURE_KIND_TRACE; features[index].name = strdup(name.c_str()); features[index].data.result_bytes.copy = to_copy_data; for (auto* params : params_list) { const unsigned parameter_count = params->opts.size(); rocprofiler_parameter_t* parameters = new rocprofiler_parameter_t[parameter_count]; unsigned p_index = 0; for (auto& v : params->opts) { const std::string parameter_name = v.first; if (parameters_dict.find(parameter_name) == parameters_dict.end()) { fprintf(stderr, "ROCProfiler: unknown trace parameter '%s'\n", parameter_name.c_str()); abort(); } const uint32_t value = strtol(v.second.c_str(), NULL, 0); printf("\n %s = 0x%x", parameter_name.c_str(), value); parameters[p_index] = {}; parameters[p_index].parameter_name = parameters_dict[parameter_name]; parameters[p_index].value = value; ++p_index; } features[index].parameters = parameters; features[index].parameter_count = parameter_count; } if (params_list.empty() == false) printf("\n "); printf(")\n"); fflush(stdout); ++index; } fflush(stdout); // Context array aloocation context_array = new context_array_t; wait_list = new wait_list_t; // Adding dispatch observer rocprofiler_queue_callbacks_t callbacks_ptrs{0}; callbacks_ptrs.dispatch = dispatch_callback; callbacks_ptrs.destroy = destroy_callback; callbacks_data = new callbacks_data_t{}; callbacks_data->features = features; callbacks_data->feature_count = feature_count; callbacks_data->set = (metrics_set->empty()) ? NULL : metrics_set; callbacks_data->group_index = 0; callbacks_data->file_handle = result_file_handle; callbacks_data->gpu_index = (gpu_index_vec->empty()) ? NULL : gpu_index_vec; callbacks_data->kernel_string = (kernel_string_vec->empty()) ? NULL : kernel_string_vec; callbacks_data->range = (range_vec->empty()) ? NULL : range_vec;; callbacks_data->filter_on = (callbacks_data->gpu_index != NULL) || (callbacks_data->kernel_string != NULL) || (callbacks_data->range != NULL) ? 1 : 0; rocprofiler_set_queue_callbacks(callbacks_ptrs, callbacks_data); xml::Xml::Destroy(xml); if (CTX_OUTSTANDING_MON != 0) { pthread_t thread; pthread_attr_t attr; int err = pthread_attr_init(&attr); if (err) { errno = err; perror("pthread_attr_init"); abort(); } err = pthread_create(&thread, &attr, monitor_thr_fun, NULL); } } // Tool destructor extern "C" PUBLIC_API void OnUnloadTool() { if (pthread_mutex_lock(&mutex) != 0) { perror("pthread_mutex_lock"); abort(); } if (!is_loaded) return; is_loaded = false; if (pthread_mutex_unlock(&mutex) != 0) { perror("pthread_mutex_unlock"); abort(); } // Unregister dispatch callback rocprofiler_remove_queue_callbacks(); // Dump stored profiling output data printf("\nROCPRofiler: %u contexts collected", context_collected); if (result_file_opened) printf(", output directory %s", result_prefix); printf("\n"); fflush(stdout); dump_context_array(); if (wait_list) { if (!wait_list->empty()) { printf("\nWaiting for pending kernels ..."); fflush(stdout); while (wait_list->size() != 0) { usleep(1000); dump_wait_list(); } printf(".done\n"); fflush(stdout); } } if (result_file_opened) fclose(result_file_handle); // Cleanup if (callbacks_data != NULL) { delete[] callbacks_data->features; delete callbacks_data; callbacks_data = NULL; } delete metrics_set; metrics_set = NULL; delete gpu_index_vec; gpu_index_vec = NULL; delete kernel_string_vec; kernel_string_vec = NULL; delete range_vec; range_vec = NULL; delete context_array; context_array = NULL; delete wait_list; wait_list = NULL; } extern "C" DESTRUCTOR_API void destructor() { if (is_loaded == true) OnUnloadTool(); }