diff --git a/projects/rocprofiler/inc/rocprofiler.h b/projects/rocprofiler/inc/rocprofiler.h index bf176928f8..7b4ffb3f87 100644 --- a/projects/rocprofiler/inc/rocprofiler.h +++ b/projects/rocprofiler/inc/rocprofiler.h @@ -215,6 +215,7 @@ hsa_status_t rocprofiler_reset(rocprofiler_t* context, // [in] profiling contex // Profiling callback data typedef struct { hsa_agent_t agent; + uint32_t agent_index; const hsa_queue_t* queue; uint64_t queue_index; uint64_t kernel_object; diff --git a/projects/rocprofiler/src/core/context.h b/projects/rocprofiler/src/core/context.h index 4f9b573d37..5fb9545151 100644 --- a/projects/rocprofiler/src/core/context.h +++ b/projects/rocprofiler/src/core/context.h @@ -335,7 +335,7 @@ class Context { const hsa_signal_value_t signal_value = hsa_signal_wait_scacquire(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, timeout, HSA_WAIT_STATE_BLOCKED); complete = (signal_value == 0); - if (!complete) printf("ROCProfiler: Signal timeout, signal(%d) timeout(%lx)\n", (int)signal_value, timeout); + if (!complete) printf("ROCProfiler: Signal timeout, signal(%d) timeout(0x%lx)\n", (int)signal_value, timeout); } for (rocprofiler_feature_t* rinfo : *(tuple.info_vector)) rinfo->data.kind = ROCPROFILER_DATA_KIND_UNINIT; callback_data_t callback_data{tuple.info_vector, tuple.info_vector->size(), NULL}; diff --git a/projects/rocprofiler/src/core/intercept_queue.h b/projects/rocprofiler/src/core/intercept_queue.h index 3d666d54aa..b49ed49294 100644 --- a/projects/rocprofiler/src/core/intercept_queue.h +++ b/projects/rocprofiler/src/core/intercept_queue.h @@ -87,6 +87,7 @@ class InterceptQueue { reinterpret_cast(packet); const char* kernel_name = GetKernelName(dispatch_packet); rocprofiler_callback_data_t data = {obj->agent_info_->dev_id, + obj->agent_info_->dev_index, obj->queue_, user_que_idx, dispatch_packet->kernel_object, diff --git a/projects/rocprofiler/src/core/rocprofiler.cpp b/projects/rocprofiler/src/core/rocprofiler.cpp index aab13df140..2656d59945 100644 --- a/projects/rocprofiler/src/core/rocprofiler.cpp +++ b/projects/rocprofiler/src/core/rocprofiler.cpp @@ -174,7 +174,7 @@ const MetricsDict* GetMetrics(const hsa_agent_t& agent) { util::Logger::mutex_t util::Logger::mutex_; util::Logger* util::Logger::instance_ = NULL; -uint64_t Context::timeout_ = 1000; +uint64_t Context::timeout_ = UINT64_MAX; } /////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/projects/rocprofiler/src/util/hsa_rsrc_factory.cpp b/projects/rocprofiler/src/util/hsa_rsrc_factory.cpp index 23c26265c1..fc64b072ef 100644 --- a/projects/rocprofiler/src/util/hsa_rsrc_factory.cpp +++ b/projects/rocprofiler/src/util/hsa_rsrc_factory.cpp @@ -25,6 +25,7 @@ POSSIBILITY OF SUCH DAMAGE. #include "util/hsa_rsrc_factory.h" #include +#include #include #include #include @@ -32,6 +33,8 @@ POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include +#include #include #include @@ -80,14 +83,13 @@ hsa_status_t HsaRsrcFactory::FindMemRegionsCallback(hsa_region_t region, void* d } // Constructor of the class -HsaRsrcFactory::HsaRsrcFactory() { +HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize_hsa) { hsa_status_t status; -#if 0 // Initialize the Hsa Runtime - printf("ROCProfiler: HSA init\n"); - status = hsa_init(); - CHECK_STATUS("Error in hsa_init", status); -#endif + if (initialize_hsa_) { + status = hsa_init(); + CHECK_STATUS("Error in hsa_init", status); + } // Discover the set of Gpu devices available on the platform status = hsa_iterate_agents(GetHsaAgentsCallback, this); CHECK_STATUS("Error Calling hsa_iterate_agents", status); @@ -111,11 +113,10 @@ HsaRsrcFactory::HsaRsrcFactory() { HsaRsrcFactory::~HsaRsrcFactory() { for (auto p : cpu_list_) delete p; for (auto p : gpu_list_) delete p; -#if 0 - printf("ROCProfiler: HSA shutdown\n"); - hsa_status_t status = hsa_shut_down(); - CHECK_STATUS("Error in hsa_shut_down", status); -#endif + if (initialize_hsa_) { + hsa_status_t status = hsa_shut_down(); + CHECK_STATUS("Error in hsa_shut_down", status); + } } hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) { @@ -371,67 +372,53 @@ bool HsaRsrcFactory::TransferData(void* dest_buff, void* src_buff, uint32_t leng // // @return bool true if successful, false otherwise // -void* HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, - const char* kernel_name, hsa_executable_t* hsa_exec, hsa_executable_symbol_t* code_desc) { - // Finalize the Hsail object into code object - hsa_status_t status; - hsa_code_object_t code_object; +bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, + const char* kernel_name, hsa_executable_t* executable, hsa_executable_symbol_t* code_desc) { + hsa_status_t status = HSA_STATUS_ERROR; // Build the code object filename std::string filename(brig_path); std::clog << "Code object filename: " << filename << std::endl; // Open the file containing code object - std::ifstream codeStream(filename.c_str(), std::ios::binary | std::ios::ate); - if (!codeStream) { - std::cerr << "Error: failed to load " << filename << std::endl; + hsa_file_t file_handle = open(filename.c_str(), O_RDONLY); + if (file_handle == -1) { + std::cerr << "Error: failed to load '" << filename << "'" << std::endl; assert(false); - return NULL; + return false; } - // Allocate memory to read in code object from file - size_t size = std::string::size_type(codeStream.tellg()); - char* code_buf = (char*)AllocateSysMemory(agent_info, size); - if (!code_buf) { - std::cerr << "Error: failed to allocate memory for code object." << std::endl; - assert(false); - return NULL; - } - - // Read the code object into allocated memory - codeStream.seekg(0, std::ios::beg); - std::copy(std::istreambuf_iterator(codeStream), std::istreambuf_iterator(), code_buf); - - // De-Serialize the code object that has been read into memory - status = hsa_code_object_deserialize(code_buf, size, NULL, &code_object); + // Create code object reader + hsa_code_object_reader_t code_obj_rdr = {0}; + status = hsa_code_object_reader_create_from_file(file_handle, &code_obj_rdr); if (status != HSA_STATUS_SUCCESS) { - std::cerr << "Failed to deserialize code object" << std::endl; - if (code_buf) hsa_memory_free(code_buf); - return NULL; + std::cerr << "Failed to create code object reader '" << filename << "'" << std::endl; + return false; } // Create executable. - status = - hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, "", hsa_exec); + status = hsa_executable_create_alt(HSA_PROFILE_FULL, + HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, executable); CHECK_STATUS("Error in creating executable object", status); // Load code object. - status = hsa_executable_load_code_object(*hsa_exec, agent_info->dev_id, code_object, ""); + status = hsa_executable_load_agent_code_object(*executable, agent_info->dev_id, + code_obj_rdr, NULL, NULL); CHECK_STATUS("Error in loading executable object", status); // Freeze executable. - status = hsa_executable_freeze(*hsa_exec, ""); + status = hsa_executable_freeze(*executable, ""); CHECK_STATUS("Error in freezing executable object", status); // Get symbol handle. hsa_executable_symbol_t kernelSymbol; - status = hsa_executable_get_symbol(*hsa_exec, NULL, kernel_name, agent_info->dev_id, 0, + status = hsa_executable_get_symbol(*executable, NULL, kernel_name, agent_info->dev_id, 0, &kernelSymbol); CHECK_STATUS("Error in looking up kernel symbol", status); // Update output parameter *code_desc = kernelSymbol; - return code_buf; + return true; } // Print the various fields of Hsa Gpu Agents diff --git a/projects/rocprofiler/src/util/hsa_rsrc_factory.h b/projects/rocprofiler/src/util/hsa_rsrc_factory.h index db86b617a4..a4b5fa3620 100644 --- a/projects/rocprofiler/src/util/hsa_rsrc_factory.h +++ b/projects/rocprofiler/src/util/hsa_rsrc_factory.h @@ -114,16 +114,16 @@ class HsaRsrcFactory { public: typedef std::recursive_mutex mutex_t; - static HsaRsrcFactory* Create() { + static HsaRsrcFactory* Create(bool initialize_hsa = true) { std::lock_guard lck(mutex_); if (instance_ == NULL) { - instance_ = new HsaRsrcFactory(); + instance_ = new HsaRsrcFactory(initialize_hsa); } return instance_; } static HsaRsrcFactory& Instance() { - if (instance_ == NULL) instance_ = Create(); + if (instance_ == NULL) instance_ = Create(false); hsa_status_t status = (instance_ != NULL) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; CHECK_STATUS("HsaRsrcFactory::Instance() failed", status); return *instance_; @@ -229,9 +229,9 @@ class HsaRsrcFactory { // @param code_desc Handle of finalized Code Descriptor that could // be used to submit for execution // - // @return code buffer, non NULL if successful, NULL otherwise + // @return true if successful, false otherwise // - void* LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, const char* kernel_name, + bool LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, const char* kernel_name, hsa_executable_t* hsa_exec, hsa_executable_symbol_t* code_desc); // Print the various fields of Hsa Gpu Agents @@ -259,11 +259,14 @@ class HsaRsrcFactory { // Constructor of the class. Will initialize the Hsa Runtime and // query the system topology to get the list of Cpu and Gpu devices - HsaRsrcFactory(); + HsaRsrcFactory(bool initialize_hsa); // Destructor of the class ~HsaRsrcFactory(); + // HSA was initialized + const bool initialize_hsa_; + // Add an instance of AgentInfo representing a Hsa Gpu agent const AgentInfo* AddAgentInfo(const hsa_agent_t agent); diff --git a/projects/rocprofiler/src/xml/xml.h b/projects/rocprofiler/src/xml/xml.h index 4e234e75d8..d0dfd0da9e 100644 --- a/projects/rocprofiler/src/xml/xml.h +++ b/projects/rocprofiler/src/xml/xml.h @@ -89,7 +89,7 @@ class Xml { AddExpr(full_tag, name, oss.str()); } - nodes_t GetNodes(std::string global_tag) { return (*map_)[global_tag]; } + nodes_t GetNodes(const std::string& global_tag) { return (*map_)[global_tag]; } template F ForEach(const F& f_i) { diff --git a/projects/rocprofiler/test/ctrl/test_hsa.cpp b/projects/rocprofiler/test/ctrl/test_hsa.cpp index 58dd38efb9..c2e4536449 100644 --- a/projects/rocprofiler/test/ctrl/test_hsa.cpp +++ b/projects/rocprofiler/test/ctrl/test_hsa.cpp @@ -43,7 +43,7 @@ HsaRsrcFactory* TestHsa::HsaInstantiate(const uint32_t agent_ind) { if (hsa_rsrc_ == NULL) { agent_id_ = agent_ind; - hsa_rsrc_ = HsaRsrcFactory::CreateInstance(); + hsa_rsrc_ = HsaRsrcFactory::Create(); // Print properties of the agents hsa_rsrc_->PrintGpuAgents("> GPU agents"); @@ -125,9 +125,8 @@ bool TestHsa::Setup() { // Load and Finalize Kernel Code Descriptor char* brig_path = (char*)brig_path_obj_.c_str(); - code_buf_ = - hsa_rsrc_->LoadAndFinalize(agent_info_, brig_path, name_.c_str(), &hsa_exec_, &kernel_code_desc_); - if (code_buf_ == NULL) { + bool suc = hsa_rsrc_->LoadAndFinalize(agent_info_, brig_path, name_.c_str(), &hsa_exec_, &kernel_code_desc_); + if (suc == false) { std::cerr << "Error in loading and finalizing Kernel" << std::endl; return false; } @@ -241,7 +240,6 @@ void TestHsa::PrintTime() { bool TestHsa::Cleanup() { hsa_executable_destroy(hsa_exec_); - hsa_memory_free(code_buf_); hsa_signal_destroy(hsa_signal_); return true; } diff --git a/projects/rocprofiler/test/ctrl/test_hsa.h b/projects/rocprofiler/test/ctrl/test_hsa.h index 300ef48685..02091110c3 100644 --- a/projects/rocprofiler/test/ctrl/test_hsa.h +++ b/projects/rocprofiler/test/ctrl/test_hsa.h @@ -47,7 +47,6 @@ class TestHsa : public TestAql { total_time_taken_ = 0; setup_time_taken_ = 0; dispatch_time_taken_ = 0; - code_buf_ = NULL; hsa_exec_ = {}; } @@ -123,8 +122,7 @@ class TestHsa : public TestAql { // Test kernel name std::string name_; - // Kernel code buffer - void* code_buf_; + // Kernel executable hsa_executable_t hsa_exec_; }; diff --git a/projects/rocprofiler/test/ctrl/tool.cpp b/projects/rocprofiler/test/ctrl/tool.cpp index b019e4883e..f70786f5e2 100644 --- a/projects/rocprofiler/test/ctrl/tool.cpp +++ b/projects/rocprofiler/test/ctrl/tool.cpp @@ -32,6 +32,9 @@ struct callbacks_data_t { unsigned feature_count; unsigned group_index; FILE* file_handle; + std::vector* gpu_index; + std::vector* kernel_string; + std::vector* range; }; // Context stored entry type @@ -54,10 +57,18 @@ typedef std::map context_array_t; context_array_t* context_array = NULL; // Contexts collected count uint32_t context_count = 0; +uint32_t context_collected = 0; // Profiling results output file name const char* result_prefix = NULL; // Global results file handle FILE* result_file_handle = NULL; +// Dispatch filters +// GPU index filter +std::vector* gpu_index_vec = NULL; +// Kernel name filter +std::vector* kernel_string_vec = NULL; +// DIspatch number range filter +std::vector* range_vec = NULL; // Check returned HSA API status void check_status(hsa_status_t status) { @@ -69,6 +80,20 @@ void check_status(hsa_status_t status) { } } +uint32_t next_context_count() { + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + exit(1); + } + const uint32_t prev_val = context_count; + context_count = prev_val + 1; + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + exit(1); + } + return prev_val; +} + // Allocate entry to store profiling context context_entry_t* alloc_context_entry() { if (pthread_mutex_lock(&mutex) != 0) { @@ -83,7 +108,6 @@ context_entry_t* alloc_context_entry() { fprintf(stderr, "context_array corruption, index repeated %u\n", index); abort(); } - ++context_count; if (pthread_mutex_unlock(&mutex) != 0) { perror("pthread_mutex_unlock"); @@ -91,7 +115,6 @@ context_entry_t* alloc_context_entry() { } context_entry_t* entry = &(ret.first->second); - entry->index = index; return entry; } @@ -220,6 +243,7 @@ void dump_context(context_entry_t* entry) { hsa_status_t status = HSA_STATUS_ERROR; if (entry->valid) { + ++context_collected; entry->valid = 0; const uint32_t index = entry->index; FILE* file_handle = entry->file_handle; @@ -285,10 +309,46 @@ void handler(rocprofiler_group_t group, void* arg) { // Kernel disoatch callback hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* user_data, rocprofiler_group_t* group) { - // HSA status - hsa_status_t status = HSA_STATUS_ERROR; // Passed tool data callbacks_data_t* tool_data = reinterpret_cast(user_data); + + // Checking dispatch condition + bool found = true; + std::vector* range_ptr = tool_data->range; + if (found && range_ptr) { + found = false; + std::vector& range = *range_ptr; + if (range.size() == 1) { + if (context_count >= range[0]) found = true; + } else if (range.size() == 2) { + if ((context_count >= range[0]) && (context_count < range[1])) found = true; + } + } + std::vector* gpu_index = tool_data->gpu_index; + if (found && gpu_index) { + found = false; + for (uint32_t i : *gpu_index) { + if (i == callback_data->agent_index) { + found = true; + } + } + } + std::vector* kernel_string = tool_data->kernel_string; + if (found && kernel_string) { + found = false; + for (const std::string& s : *kernel_string) { + if (std::string(callback_data->kernel_name).find(s) != std::string::npos) { + found = true; + } + } + } + if (found == false) { + next_context_count(); + return HSA_STATUS_SUCCESS; + } + + // HSA status + hsa_status_t status = HSA_STATUS_ERROR; // Profiling context rocprofiler_t* context = NULL; // Context entry @@ -320,6 +380,7 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, entry->data = *callback_data; entry->data.kernel_name = strdup(callback_data->kernel_name); entry->file_handle = tool_data->file_handle; + entry->index = next_context_count(); entry->valid = 1; return status; @@ -341,19 +402,48 @@ static hsa_status_t info_callback(const rocprofiler_info_data_t info, void * arg return HSA_STATUS_SUCCESS; } +void get_xml_array(xml::Xml* xml, const std::string& tag, const std::string& field, const std::string& delim, std::vector* vec, const char* label = NULL) { + auto nodes = xml->GetNodes(tag); + auto rit = nodes.rbegin(); + auto rend = nodes.rend(); + while (rit != rend) { + auto& opts = (*rit)->opts; + if (opts.find(field) != opts.end()) break; + ++rit; + } + if (rit != rend) { + const std::string array_string = (*rit)->opts[field]; + if (label != NULL) printf("%s%s = %s\n", label, field.c_str(), array_string.c_str()); + size_t pos1 = 0; + while (pos1 < array_string.length()) { + const size_t pos2 = array_string.find(delim, pos1); + const std::string token = array_string.substr(pos1, pos2 - pos1); + vec->push_back(token); + if (pos2 == std::string::npos) break; + pos1 = pos2 + 1; + } + } +} + +void get_xml_array(xml::Xml* xml, const std::string& tag, const std::string& field, const std::string& delim, std::vector* vec, const char* label = NULL) { + std::vector str_vec; + get_xml_array(xml, tag, field, delim, &str_vec, label); + for (const std::string& str : str_vec) vec->push_back(atoi(str.c_str())); +} + // Tool constructor extern "C" PUBLIC_API void OnLoadTool() { std::map parameters_dict; - parameters_dict["HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET"] = + parameters_dict["COMPUTE_UNIT_TARGET"] = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET; - parameters_dict["HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK"] = + parameters_dict["VM_ID_MASK"] = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK; - parameters_dict["HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK"] = + parameters_dict["MASK"] = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK; - parameters_dict["HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK"] = + parameters_dict["TOKEN_MASK"] = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK; - parameters_dict["HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2"] = + parameters_dict["TOKEN_MASK2"] = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2; char* info_symb = getenv("ROCP_INFO"); @@ -374,7 +464,7 @@ extern "C" PUBLIC_API void OnLoadTool() DIR* dir = opendir(result_prefix); if (dir == NULL) { std::ostringstream errmsg; - errmsg << "Cannot open output directory '" << result_prefix << "'"; + errmsg << "ROCProfiler: Cannot open output directory '" << result_prefix << "'"; perror(errmsg.str().c_str()); exit(1); } @@ -383,7 +473,7 @@ extern "C" PUBLIC_API void OnLoadTool() result_file_handle = fopen(oss.str().c_str(), "w"); if (result_file_handle == NULL) { std::ostringstream errmsg; - errmsg << "fopen error, file '" << oss.str().c_str() << "'"; + errmsg << "ROCProfiler: fopen error, file '" << oss.str().c_str() << "'"; perror(errmsg.str().c_str()); exit(1); } @@ -399,24 +489,23 @@ extern "C" PUBLIC_API void OnLoadTool() printf("ROCProfiler: input from \"%s\"\n", xml_name); xml::Xml* xml = xml::Xml::Create(xml_name); if (xml == NULL) { - fprintf(stderr, "Input file not found '%s'\n", xml_name); + fprintf(stderr, "ROCProfiler: Input file not found '%s'\n", xml_name); exit(1); } // Getting metrics - auto metrics_list = xml->GetNodes("top.metric"); std::vector metrics_vec; - for (auto* entry : metrics_list) { - const std::string entry_str = entry->opts["name"]; - size_t pos1 = 0; - while (pos1 < entry_str.length()) { - const size_t pos2 = entry_str.find(",", pos1); - const std::string metric_name = entry_str.substr(pos1, pos2 - pos1); - metrics_vec.push_back(metric_name); - if (pos2 == std::string::npos) break; - pos1 = pos2 + 1; - } - } + get_xml_array(xml, "top.metric", "name", ",", &metrics_vec); + + // Getting GPU indexes + gpu_index_vec = new std::vector; + get_xml_array(xml, "top.metric", "gpu_index", ",", gpu_index_vec, " "); + // Getting kernel names + kernel_string_vec = new std::vector; + get_xml_array(xml, "top.metric", "kernel", ",", kernel_string_vec, " "); + // Getting profiling range + range_vec = new std::vector; + get_xml_array(xml, "top.metric", "range", ":", range_vec, " "); // Getting traces auto traces_list = xml->GetNodes("top.trace"); @@ -458,7 +547,7 @@ extern "C" PUBLIC_API void OnLoadTool() for (auto& v : params->opts) { const std::string parameter_name = v.first; if (parameters_dict.find(parameter_name) == parameters_dict.end()) { - fprintf(stderr, "ROCProfiler: unknown trace parameter %s\n", parameter_name.c_str()); + fprintf(stderr, "ROCProfiler: unknown trace parameter '%s'\n", parameter_name.c_str()); exit(1); } const uint32_t value = strtol(v.second.c_str(), NULL, 0); @@ -489,6 +578,9 @@ extern "C" PUBLIC_API void OnLoadTool() callbacks_data->feature_count = feature_count; callbacks_data->group_index = 0; callbacks_data->file_handle = result_file_handle; + callbacks_data->gpu_index = (gpu_index_vec->empty()) ? NULL : gpu_index_vec; + callbacks_data->kernel_string = (kernel_string_vec->empty()) ? NULL : kernel_string_vec; + callbacks_data->range = (range_vec->empty()) ? NULL : range_vec;; rocprofiler_set_queue_callbacks(callbacks_ptrs, callbacks_data); } @@ -503,7 +595,7 @@ extern "C" PUBLIC_API void OnUnloadTool() { // Dump stored profiling output data const bool result_file_opened = (result_prefix != NULL) && (result_file_handle != NULL); - printf("\nROCPRofiler: %u contexts collected", context_count); + printf("\nROCPRofiler: %u contexts collected", context_collected); if (result_file_opened) printf(", output directory %s", result_prefix); printf("\n"); dump_context_array(); @@ -514,4 +606,7 @@ extern "C" PUBLIC_API void OnUnloadTool() { delete[] callbacks_data->features; delete callbacks_data; } + delete gpu_index_vec; + delete kernel_string_vec; + delete range_vec; } diff --git a/projects/rocprofiler/test/gfx_metrics.xml b/projects/rocprofiler/test/gfx_metrics.xml new file mode 100644 index 0000000000..ca5a8009ef --- /dev/null +++ b/projects/rocprofiler/test/gfx_metrics.xml @@ -0,0 +1,93 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/projects/rocprofiler/test/input.xml b/projects/rocprofiler/test/input.xml index 92c2f29caa..53622e07e9 100644 --- a/projects/rocprofiler/test/input.xml +++ b/projects/rocprofiler/test/input.xml @@ -1,9 +1,23 @@ - - +# List of metrics + + +# SQTT trace with parameters + diff --git a/projects/rocprofiler/test/metrics.xml b/projects/rocprofiler/test/metrics.xml index e84aa28728..5aed1b7288 100644 --- a/projects/rocprofiler/test/metrics.xml +++ b/projects/rocprofiler/test/metrics.xml @@ -1,283 +1,205 @@ +#include "gfx_metrics.xml" + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # average for (16 instances x 4 shader engines) - + # average for 16 instances + + + # sum for 16 instances + + # FETCH_SIZE, kilobytes # The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. - + + # WRITE_SIZE + # The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # average for (16 instances x 4 shader engines) - + # average for 16 instances + + + # sum for 16 instances - + + + + # FETCH_SIZE, kilobytes # The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. - + + # WRITE_SIZE + # The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. + - # GPU_BUSY, percentage + # GPUBusy, percentage # The percentage of time GPU was busy. # Wavefronts Total wavefronts., # VALUInsts The average number of vector ALU instructions executed per work-item (affected by flow control). # SALUInsts The average number of scalar ALU instructions executed per work-item (affected by flow control). # VFetchInsts The average number of vector fetch instructions from the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that fetch from video memory. # SFetchInsts The average number of scalar fetch instructions from the video memory executed per work-item (affected by flow control). # VWriteInsts The average number of vector write instructions to the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that write to video memory. # FlatVMemInsts The average number of FLAT instructions that read from or write to the video memory executed per work item (affected by flow control). Includes FLAT instructions that read from or write to scratch. # LDSInsts The average number of LDS read or LDS write instructions executed per work item (affected by flow control). Excludes FLAT instructions that read from or write to LDS. # FlatLDSInsts The average number of FLAT instructions that read or write to LDS executed per work item (affected by flow control). # GDSInsts The average number of GDS read or GDS write instructions executed per work item (affected by flow control). # VALUUtilization The percentage of active vector ALU threads in a wave. A lower number can mean either more thread divergence in a wave or that the work-group size is not a multiple of 64. Value range: 0% (bad), 100% (ideal - no thread divergence). # VALUBusy The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal). # SALUBusy The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad) to 100% (optimal). # FetchSize The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. # WriteSize The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. # L2CacheHit The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal). # MemUnitBusy The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled). This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: 0% to 100% (fetch-bound). # MemUnitStalled The percentage of GPUTime the memory unit is stalled. Try reducing the number or size of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad). # WriteUnitStalled The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad). # The percentage of GPUTime ALU units are stalled by the LDS input queue being full or the output queue being not ready. If there are LDS bank conflicts, reduce them. Otherwise, try reducing the number of LDS accesses if possible. Value range: 0% (optimal) to 100% (bad). # LDSBankConflict The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad). diff --git a/projects/rocprofiler/test/util/hsa_rsrc_factory.cpp b/projects/rocprofiler/test/util/hsa_rsrc_factory.cpp index ffe43a4296..88862801d8 100644 --- a/projects/rocprofiler/test/util/hsa_rsrc_factory.cpp +++ b/projects/rocprofiler/test/util/hsa_rsrc_factory.cpp @@ -25,6 +25,7 @@ POSSIBILITY OF SUCH DAMAGE. #include "util/hsa_rsrc_factory.h" #include +#include #include #include #include @@ -32,6 +33,8 @@ POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include +#include #include #include @@ -77,12 +80,13 @@ hsa_status_t HsaRsrcFactory::FindMemRegionsCallback(hsa_region_t region, void* d } // Constructor of the class -HsaRsrcFactory::HsaRsrcFactory() { +HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize_hsa) { + hsa_status_t status; // Initialize the Hsa Runtime - printf("HSA init\n"); - hsa_status_t status = hsa_init(); - CHECK_STATUS("Error in hsa_init", status); - + if (initialize_hsa_) { + status = hsa_init(); + CHECK_STATUS("Error in hsa_init", status); + } // Discover the set of Gpu devices available on the platform status = hsa_iterate_agents(GetHsaAgentsCallback, this); CHECK_STATUS("Error Calling hsa_iterate_agents", status); @@ -106,10 +110,10 @@ HsaRsrcFactory::HsaRsrcFactory() { HsaRsrcFactory::~HsaRsrcFactory() { for (auto p : cpu_list_) delete p; for (auto p : gpu_list_) delete p; - - printf("HSA shutdown\n"); - hsa_status_t status = hsa_shut_down(); - CHECK_STATUS("Error in hsa_shut_down", status); + if (initialize_hsa_) { + hsa_status_t status = hsa_shut_down(); + CHECK_STATUS("Error in hsa_shut_down", status); + } } hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) { @@ -365,67 +369,53 @@ bool HsaRsrcFactory::TransferData(void* dest_buff, void* src_buff, uint32_t leng // // @return bool true if successful, false otherwise // -void* HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, - const char* kernel_name, hsa_executable_t* hsa_exec, hsa_executable_symbol_t* code_desc) { - // Finalize the Hsail object into code object - hsa_status_t status; - hsa_code_object_t code_object; +bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, + const char* kernel_name, hsa_executable_t* executable, hsa_executable_symbol_t* code_desc) { + hsa_status_t status = HSA_STATUS_ERROR; // Build the code object filename std::string filename(brig_path); std::clog << "Code object filename: " << filename << std::endl; // Open the file containing code object - std::ifstream codeStream(filename.c_str(), std::ios::binary | std::ios::ate); - if (!codeStream) { - std::cerr << "Error: failed to load " << filename << std::endl; + hsa_file_t file_handle = open(filename.c_str(), O_RDONLY); + if (file_handle == -1) { + std::cerr << "Error: failed to load '" << filename << "'" << std::endl; assert(false); - return NULL; + return false; } - // Allocate memory to read in code object from file - size_t size = std::string::size_type(codeStream.tellg()); - char* code_buf = (char*)AllocateSysMemory(agent_info, size); - if (!code_buf) { - std::cerr << "Error: failed to allocate memory for code object." << std::endl; - assert(false); - return NULL; - } - - // Read the code object into allocated memory - codeStream.seekg(0, std::ios::beg); - std::copy(std::istreambuf_iterator(codeStream), std::istreambuf_iterator(), code_buf); - - // De-Serialize the code object that has been read into memory - status = hsa_code_object_deserialize(code_buf, size, NULL, &code_object); + // Create code object reader + hsa_code_object_reader_t code_obj_rdr = {0}; + status = hsa_code_object_reader_create_from_file(file_handle, &code_obj_rdr); if (status != HSA_STATUS_SUCCESS) { - std::cerr << "Failed to deserialize code object" << std::endl; - if (code_buf) hsa_memory_free(code_buf); - return NULL; + std::cerr << "Failed to create code object reader '" << filename << "'" << std::endl; + return false; } // Create executable. - status = - hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, "", hsa_exec); + status = hsa_executable_create_alt(HSA_PROFILE_FULL, + HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, executable); CHECK_STATUS("Error in creating executable object", status); // Load code object. - status = hsa_executable_load_code_object(*hsa_exec, agent_info->dev_id, code_object, ""); + status = hsa_executable_load_agent_code_object(*executable, agent_info->dev_id, + code_obj_rdr, NULL, NULL); CHECK_STATUS("Error in loading executable object", status); // Freeze executable. - status = hsa_executable_freeze(*hsa_exec, ""); + status = hsa_executable_freeze(*executable, ""); CHECK_STATUS("Error in freezing executable object", status); // Get symbol handle. hsa_executable_symbol_t kernelSymbol; - status = hsa_executable_get_symbol(*hsa_exec, NULL, kernel_name, agent_info->dev_id, 0, + status = hsa_executable_get_symbol(*executable, NULL, kernel_name, agent_info->dev_id, 0, &kernelSymbol); CHECK_STATUS("Error in looking up kernel symbol", status); // Update output parameter *code_desc = kernelSymbol; - return code_buf; + return true; } // Print the various fields of Hsa Gpu Agents diff --git a/projects/rocprofiler/test/util/hsa_rsrc_factory.h b/projects/rocprofiler/test/util/hsa_rsrc_factory.h index b8d9ff326a..d5c10879e0 100644 --- a/projects/rocprofiler/test/util/hsa_rsrc_factory.h +++ b/projects/rocprofiler/test/util/hsa_rsrc_factory.h @@ -112,20 +112,18 @@ class HsaRsrcFactory { public: typedef std::recursive_mutex mutex_t; - static HsaRsrcFactory* Create() { return NULL; } - - static HsaRsrcFactory* CreateInstance() { + static HsaRsrcFactory* Create(bool initialize_hsa = true) { std::lock_guard lck(mutex_); if (instance_ == NULL) { - instance_ = new HsaRsrcFactory(); + instance_ = new HsaRsrcFactory(initialize_hsa); } return instance_; } static HsaRsrcFactory& Instance() { - CreateInstance(); + if (instance_ == NULL) instance_ = Create(false); hsa_status_t status = (instance_ != NULL) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; - CHECK_STATUS("HsaRsrcFactory::Instance() is not found", status); + CHECK_STATUS("HsaRsrcFactory::Instance() failed", status); return *instance_; } @@ -229,9 +227,9 @@ class HsaRsrcFactory { // @param code_desc Handle of finalized Code Descriptor that could // be used to submit for execution // - // @return code buffer, non NULL if successful, NULL otherwise + // @return true if successful, false otherwise // - void* LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, const char* kernel_name, + bool LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, const char* kernel_name, hsa_executable_t* hsa_exec, hsa_executable_symbol_t* code_desc); // Print the various fields of Hsa Gpu Agents @@ -259,11 +257,14 @@ class HsaRsrcFactory { // Constructor of the class. Will initialize the Hsa Runtime and // query the system topology to get the list of Cpu and Gpu devices - HsaRsrcFactory(); + HsaRsrcFactory(bool initialize_hsa); // Destructor of the class ~HsaRsrcFactory(); + // HSA was initialized + const bool initialize_hsa_; + // Add an instance of AgentInfo representing a Hsa Gpu agent const AgentInfo* AddAgentInfo(const hsa_agent_t agent);