diff --git a/projects/rocprofiler/inc/rocprofiler.h b/projects/rocprofiler/inc/rocprofiler.h index 9936396c63..f7d5a73bb8 100644 --- a/projects/rocprofiler/inc/rocprofiler.h +++ b/projects/rocprofiler/inc/rocprofiler.h @@ -78,9 +78,8 @@ extern "C" { // Profiling feature kind typedef enum { - ROCPROFILER_FEATURE_KIND_COUNTER = 0, - ROCPROFILER_FEATURE_KIND_METRIC = 1, - ROCPROFILER_FEATURE_KIND_TRACE = 2 + ROCPROFILER_FEATURE_KIND_METRIC = 0, + ROCPROFILER_FEATURE_KIND_TRACE = 1 } rocprofiler_feature_kind_t; // Profiling feture parameter diff --git a/projects/rocprofiler/src/core/context.h b/projects/rocprofiler/src/core/context.h index 2355358a4c..9b9a59cad9 100644 --- a/projects/rocprofiler/src/core/context.h +++ b/projects/rocprofiler/src/core/context.h @@ -200,7 +200,7 @@ class Context { const rocprofiler_feature_kind_t kind = info->kind; const char* name = info->name; - if (kind != ROCPROFILER_FEATURE_KIND_TRACE) { // Processing metrics features + if (kind == ROCPROFILER_FEATURE_KIND_METRIC) { // Processing metrics features const Metric* metric = metrics_->Get(name); if (metric == NULL) EXC_RAISING(HSA_STATUS_ERROR, "input metric '" << name << "' is not found"); diff --git a/projects/rocprofiler/src/core/metrics.h b/projects/rocprofiler/src/core/metrics.h index 3ff6e31c18..b777c2899f 100644 --- a/projects/rocprofiler/src/core/metrics.h +++ b/projects/rocprofiler/src/core/metrics.h @@ -38,7 +38,7 @@ class Metric { virtual const xml::Expr* GetExpr() const = 0; private: - std::string name_; + const std::string name_; }; class BaseMetric : public Metric { @@ -107,10 +107,38 @@ class MetricsDict { } } - const Metric* Get(const std::string& name) const { + const Metric* Get(const std::string& name) { const Metric* metric = NULL; + auto it = cache_.find(name); if (it != cache_.end()) metric = it->second; + else { + const std::size_t pos = name.find(':'); + if (pos != std::string::npos) { + std::string block_name = name.substr(0, pos); + const std::string event_str = 
name.substr(pos + 1); + + uint32_t block_index = 0; + bool indexed = false; + const std::size_t pos1 = block_name.find('['); + if (pos1 != std::string::npos) { + const std::size_t pos2 = block_name.find(']'); + if (pos2 == std::string::npos) EXC_RAISING(HSA_STATUS_ERROR, "Malformed metric name '" << name << "'"); + block_name = name.substr(0, pos1); + const std::string block_index_str = name.substr(pos1 + 1, pos2 - (pos1 + 1)); + block_index = atol(block_index_str.c_str()); + indexed = true; + } + + const hsa_ven_amd_aqlprofile_id_query_t query = Translate(agent_info_, block_name); + const hsa_ven_amd_aqlprofile_block_name_t block_id = (hsa_ven_amd_aqlprofile_block_name_t)query.id; + if ((query.instance_count > 1) && (indexed == false)) EXC_RAISING(HSA_STATUS_ERROR, "Malformed indexed metric name '" << name << "'"); + const uint32_t event_id = atol(event_str.c_str()); + const counter_t counter = {name, {block_id, block_index, event_id}}; + metric = new BaseMetric(name, counter); + } + } + return metric; } @@ -119,12 +147,15 @@ class MetricsDict { const_iterator_t End() const { return cache_.end(); } private: - MetricsDict(const util::AgentInfo* agent_info) : xml_(NULL) { + MetricsDict(const util::AgentInfo* agent_info) : xml_(NULL), agent_info_(agent_info) { const char* xml_name = getenv("ROCP_METRICS"); if (xml_name != NULL) { xml_ = xml::Xml::Create(xml_name); if (xml_ == NULL) EXC_RAISING(HSA_STATUS_ERROR, "metrics .xml open error '" << xml_name << "'"); - std::cout << "ROCProfiler: importing metrics from '" << xml_name << "':" << std::endl; + xml_->AddConst("top.const.metric", "NUM_SIMDS", 64); + xml_->AddConst("top.const.metric", "NUM_SHADER_ENGINES", 4); + std::cout << "ROCProfiler: importing '" << xml_name << "':" << std::endl; + ImportMetrics(agent_info, "const"); ImportMetrics(agent_info, agent_info->gfxip); ImportMetrics(agent_info, "global"); } @@ -135,47 +166,53 @@ class MetricsDict { for (auto& entry : cache_) delete entry.second; } - void 
ImportMetrics(const util::AgentInfo* agent_info, const char* scope) { - auto scope_list = xml_->GetNodes("top." + std::string(scope) + ".metric"); + static hsa_ven_amd_aqlprofile_id_query_t Translate(const util::AgentInfo* agent_info, const std::string& block_name) { + hsa_ven_amd_aqlprofile_profile_t profile; + profile.agent = agent_info->dev_id; + hsa_ven_amd_aqlprofile_id_query_t query = {block_name.c_str(), 0, 0}; + hsa_status_t status = + util::HsaRsrcFactory::Instance().AqlProfileApi()->hsa_ven_amd_aqlprofile_get_info( + &profile, HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_ID, &query); + if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(HSA_STATUS_ERROR, "ImportMetrics: bad block name '" << block_name << "'"); + return query; + } + + void ImportMetrics(const util::AgentInfo* agent_info, const std::string& scope) { + auto scope_list = xml_->GetNodes("top." + scope + ".metric"); if (!scope_list.empty()) { std::cout << " " << scope_list.size() << " " << scope << " metrics found" << std::endl; for (auto node : scope_list) { const std::string name = node->opts["name"]; - if (cache_.find(name) != cache_.end()) - EXC_RAISING(HSA_STATUS_ERROR, "ImportMetrics: metrics redefined '" << name << "'"); - const std::string expr_str = node->opts["expr"]; + std::string descr = node->opts["descr"]; + if (descr.empty()) descr = (expr_str.empty()) ? 
name : expr_str; + if (expr_str.empty()) { const std::string block_name = node->opts["block"]; - const uint32_t event_id = atoi(node->opts["event"].c_str()); + const std::string event_str = node->opts["event"]; + const uint32_t event_id = atol(event_str.c_str()); - hsa_ven_amd_aqlprofile_profile_t profile; - profile.agent = agent_info->dev_id; - hsa_ven_amd_aqlprofile_id_query_t query = {block_name.c_str(), 0, 0}; - hsa_status_t status = - util::HsaRsrcFactory::Instance().AqlProfileApi()->hsa_ven_amd_aqlprofile_get_info( - &profile, HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_ID, &query); - if (status == HSA_STATUS_SUCCESS) { - const hsa_ven_amd_aqlprofile_block_name_t block_id = - (hsa_ven_amd_aqlprofile_block_name_t)query.id; - if (query.instance_count > 1) { - for (unsigned block_index = 0; block_index < query.instance_count; ++block_index) { - std::ostringstream os; - os << name << '[' << block_index << ']'; - const std::string full_name = os.str(); - const counter_t counter = {full_name, {block_id, block_index, event_id}}; - cache_[full_name] = new BaseMetric(full_name, counter); - } - } else { - const counter_t counter = {name, {block_id, 0, event_id}}; - cache_[name] = new BaseMetric(name, counter); + const hsa_ven_amd_aqlprofile_id_query_t query = Translate(agent_info, block_name); + const hsa_ven_amd_aqlprofile_block_name_t block_id = (hsa_ven_amd_aqlprofile_block_name_t)query.id; + if (query.instance_count > 1) { + for (unsigned block_index = 0; block_index < query.instance_count; ++block_index) { + std::ostringstream full_name; + full_name << name << '[' << block_index << ']'; + std::ostringstream alias; + alias << block_name << "[" << block_index << "]:" << event_str; + const counter_t counter = {full_name.str(), {block_id, block_index, event_id}}; + AddMetric(full_name.str(), alias.str(), counter); } - } else - AQL_EXC_RAISING(HSA_STATUS_ERROR, "ImportMetrics: bad block name '" << block_name - << "'"); + } else { + const std::string alias = block_name + ":" + 
event_str; + const counter_t counter = {name, {block_id, 0, event_id}}; + AddMetric(name, alias, counter); + } } else { xml::Expr* expr_obj = new xml::Expr(expr_str, new ExprCache(&cache_)); + std::cout << "# " << descr << std::endl; + std::cout << name << "=" << expr_obj->String() << "\n" << std::endl; counters_vec_t counters_vec; for (const std::string var : expr_obj->GetVars()) { auto it = cache_.find(var); @@ -184,12 +221,46 @@ class MetricsDict { << "' is not found"); it->second->GetCounters(counters_vec); } - cache_[name] = new ExprMetric(name, counters_vec, expr_obj); + AddMetric(name, counters_vec, expr_obj); } } } } + const Metric* AddMetric(const std::string& name, const std::string& /*alias*/, const counter_t& counter) { + const Metric* metric = NULL; + const auto ret = cache_.insert({name, NULL}); + if (ret.second) { + metric = new BaseMetric(name, counter); + ret.first->second = metric; + } else EXC_RAISING(HSA_STATUS_ERROR, "metric redefined '" << name << "'"); +#if 0 + if (alias != name) { + if (cache_.find(alias) != cache_.end()) EXC_RAISING(HSA_STATUS_ERROR, "metric alias/name interference '" << alias << "'"); + const auto ret = aliases_.insert({alias, name}); + if (!ret.second) EXC_RAISING(HSA_STATUS_ERROR, "metric alias redefined '" << alias << "'"); + } +#endif + return metric; + } + + const Metric* AddMetric(const std::string& name, const counters_vec_t& counters_vec, const xml::Expr* expr_obj) { + const Metric* metric = NULL; + const auto ret = cache_.insert({name, NULL}); + if (ret.second) { + metric = new ExprMetric(name, counters_vec, expr_obj); + ret.first->second = metric; + } else EXC_RAISING(HSA_STATUS_ERROR, "expr-metric redefined '" << name << "'"); + return metric; + } + +#if 0 + std::string UnAlias(const std::string& name) const { + auto it = aliases_.find(name); + return (it != aliases_.end()) ? 
it->second : name; + } +#endif + void Print() { for (auto& v : cache_) { const Metric* metric = v.second; @@ -203,7 +274,11 @@ class MetricsDict { } xml::Xml* xml_; + const util::AgentInfo* agent_info_; cache_t cache_; +#if 0 + std::map aliases_; +#endif static map_t* map_; static mutex_t mutex_; diff --git a/projects/rocprofiler/src/core/profile.h b/projects/rocprofiler/src/core/profile.h index 2301b76a25..6002ba2ef3 100644 --- a/projects/rocprofiler/src/core/profile.h +++ b/projects/rocprofiler/src/core/profile.h @@ -79,15 +79,16 @@ class Profile { Profile(const util::AgentInfo* agent_info) : agent_info_(agent_info) { profile_ = {}; profile_.agent = agent_info->dev_id; + completion_signal_ = {}; is_legacy_ = (strncmp(agent_info->name, "gfx8", 4) == 0); } virtual ~Profile() { - if (!info_vector_.empty()) { - info_vector_.clear(); - hsa_memory_free(profile_.command_buffer.ptr); - hsa_memory_free(profile_.output_buffer.ptr); - free(const_cast(profile_.events)); - free(const_cast(profile_.parameters)); + info_vector_.clear(); + if (profile_.command_buffer.ptr) hsa_memory_free(profile_.command_buffer.ptr); + if (profile_.output_buffer.ptr) hsa_memory_free(profile_.output_buffer.ptr); + if (profile_.events) free(const_cast(profile_.events)); + if (profile_.parameters) free(const_cast(profile_.parameters)); + if (completion_signal_.handle) { hsa_status_t status = hsa_signal_destroy(completion_signal_); if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "signal_destroy " << std::hex << status); } diff --git a/projects/rocprofiler/src/util/hsa_rsrc_factory.cpp b/projects/rocprofiler/src/util/hsa_rsrc_factory.cpp index 9b27cacbbc..03eaf3c5f1 100644 --- a/projects/rocprofiler/src/util/hsa_rsrc_factory.cpp +++ b/projects/rocprofiler/src/util/hsa_rsrc_factory.cpp @@ -26,12 +26,14 @@ POSSIBILITY OF SUCH DAMAGE. 
#include #include +#include #include #include #include #include #include +#include #include #include #include @@ -76,6 +78,7 @@ hsa_status_t HsaRsrcFactory::FindMemRegionsCallback(hsa_region_t region, void* d // Constructor of the class HsaRsrcFactory::HsaRsrcFactory() { // Initialize the Hsa Runtime + printf("HSA init\n"); hsa_status_t status = hsa_init(); CHECK_STATUS("Error in hsa_init", status); @@ -100,6 +103,10 @@ HsaRsrcFactory::HsaRsrcFactory() { // Destructor of the class HsaRsrcFactory::~HsaRsrcFactory() { + for (auto p : cpu_list_) free(const_cast(p)); + for (auto p : gpu_list_) free(const_cast(p)); + + printf("HSA shutdown\n"); hsa_status_t status = hsa_shut_down(); CHECK_STATUS("Error in hsa_shut_down", status); } @@ -162,12 +169,15 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name); strncpy(agent_info->gfxip, agent_info->name, 4); agent_info->gfxip[4] = '\0'; - agent_info->max_wave_size = 0; hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size); - agent_info->max_queue_size = 0; hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size); - agent_info->profile = hsa_profile_t(108); hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile); + agent_info->is_apu = (agent_info->profile == HSA_PROFILE_FULL) ? 
true : false; + hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT), &agent_info->cu_num); + hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU), &agent_info->waves_per_cu); + hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU), &agent_info->simds_per_cu); + hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES), &agent_info->se_num); + hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE), &agent_info->shader_arrays_per_se); // Initialize memory regions to zero agent_info->kernarg_region.handle = 0; @@ -349,8 +359,8 @@ bool HsaRsrcFactory::TransferData(void* dest_buff, void* src_buff, uint32_t leng // // @return bool true if successful, false otherwise // -bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, - char* kernel_name, hsa_executable_symbol_t* code_desc) { +void* HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, + const char* kernel_name, hsa_executable_t* hsa_exec, hsa_executable_symbol_t* code_desc) { // Finalize the Hsail object into code object hsa_status_t status; hsa_code_object_t code_object; @@ -364,52 +374,52 @@ bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* br if (!codeStream) { std::cerr << "Error: failed to load " << filename << std::endl; assert(false); - return false; + return NULL; } // Allocate memory to read in code object from file size_t size = std::string::size_type(codeStream.tellg()); - char* codeBuff = (char*)AllocateSysMemory(agent_info, size); - if (!codeBuff) { + char* code_buf = (char*)AllocateSysMemory(agent_info, size); + if (!code_buf) { std::cerr << "Error: failed to allocate memory for code object." 
<< std::endl; assert(false); - return false; + return NULL; } // Read the code object into allocated memory codeStream.seekg(0, std::ios::beg); - std::copy(std::istreambuf_iterator(codeStream), std::istreambuf_iterator(), codeBuff); + std::copy(std::istreambuf_iterator(codeStream), std::istreambuf_iterator(), code_buf); // De-Serialize the code object that has been read into memory - status = hsa_code_object_deserialize(codeBuff, size, NULL, &code_object); + status = hsa_code_object_deserialize(code_buf, size, NULL, &code_object); if (status != HSA_STATUS_SUCCESS) { std::cerr << "Failed to deserialize code object" << std::endl; - return false; + if (code_buf) hsa_memory_free(code_buf); + return NULL; } // Create executable. - hsa_executable_t hsaExecutable; status = - hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, "", &hsaExecutable); + hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, "", hsa_exec); CHECK_STATUS("Error in creating executable object", status); // Load code object. - status = hsa_executable_load_code_object(hsaExecutable, agent_info->dev_id, code_object, ""); + status = hsa_executable_load_code_object(*hsa_exec, agent_info->dev_id, code_object, ""); CHECK_STATUS("Error in loading executable object", status); // Freeze executable. - status = hsa_executable_freeze(hsaExecutable, ""); + status = hsa_executable_freeze(*hsa_exec, ""); CHECK_STATUS("Error in freezing executable object", status); // Get symbol handle. 
hsa_executable_symbol_t kernelSymbol; - status = hsa_executable_get_symbol(hsaExecutable, NULL, kernel_name, agent_info->dev_id, 0, + status = hsa_executable_get_symbol(*hsa_exec, NULL, kernel_name, agent_info->dev_id, 0, &kernelSymbol); CHECK_STATUS("Error in looking up kernel symbol", status); // Update output parameter *code_desc = kernelSymbol; - return true; + return code_buf; } // Print the various fields of Hsa Gpu Agents @@ -423,13 +433,47 @@ bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) { std::clog << "> agent[" << idx << "] :" << std::endl; std::clog << ">> Name : " << agent_info->name << std::endl; + std::clog << ">> APU : " << agent_info->is_apu << std::endl; + std::clog << ">> HSAIL profile : " << agent_info->profile << std::endl; std::clog << ">> Max Wave Size : " << agent_info->max_wave_size << std::endl; std::clog << ">> Max Queue Size : " << agent_info->max_queue_size << std::endl; std::clog << ">> Kernarg Region Id : " << agent_info->coarse_region.handle << std::endl; + std::clog << ">> CU number : " << agent_info->cu_num << std::endl; + std::clog << ">> Waves per CU : " << agent_info->waves_per_cu << std::endl; + std::clog << ">> SIMDs per CU : " << agent_info->simds_per_cu << std::endl; + std::clog << ">> SE number : " << agent_info->se_num << std::endl; + std::clog << ">> Shader Arrays per SE : " << agent_info->shader_arrays_per_se << std::endl; } return true; } +uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, void* packet) { + const uint32_t slot_size_b = 0x40; + + // advance command queue write index; NOTE(review): the separate relaxed load and store below are not an atomic reservation - presumably only safe with a single producer per queue, confirm + const uint64_t write_idx = hsa_queue_load_write_index_relaxed(queue); + hsa_queue_store_write_index_relaxed(queue, write_idx + 1); + while ((write_idx - hsa_queue_load_read_index_relaxed(queue)) >= queue->size) { + sched_yield(); + } + + uint32_t slot_idx = (uint32_t)(write_idx % queue->size); + uint32_t* queue_slot = (uint32_t*)((uintptr_t)(queue->base_address) + (slot_idx * slot_size_b)); + uint32_t* slot_data = (uint32_t*)packet; + 
+ // Copy buffered commands into the queue slot. + // Overwrite the AQL invalid header (first dword) last. + // This prevents the slot from being read until it's fully written. + memcpy(&queue_slot[1], &slot_data[1], slot_size_b - sizeof(uint32_t)); + std::atomic* header_atomic_ptr = reinterpret_cast*>(&queue_slot[0]); + header_atomic_ptr->store(slot_data[0], std::memory_order_release); + + // ring doorbell + hsa_signal_store_relaxed(queue->doorbell_signal, write_idx); + + return write_idx; +} + HsaRsrcFactory* HsaRsrcFactory::instance_ = NULL; HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_; diff --git a/projects/rocprofiler/src/util/hsa_rsrc_factory.h b/projects/rocprofiler/src/util/hsa_rsrc_factory.h index 105483b7d0..be6f27766c 100644 --- a/projects/rocprofiler/src/util/hsa_rsrc_factory.h +++ b/projects/rocprofiler/src/util/hsa_rsrc_factory.h @@ -67,6 +67,9 @@ struct AgentInfo { // Agent type - Cpu = 0, Gpu = 1 or Dsp = 2 uint32_t dev_type; + // APU flag + bool is_apu; + // Agent system index uint32_t dev_index; @@ -90,6 +93,21 @@ struct AgentInfo { // Memory region supporting kernel arguments hsa_region_t kernarg_region; + + // The number of compute units available in the agent. + uint32_t cu_num; + + // Maximum number of waves possible in a Compute Unit. 
+ uint32_t waves_per_cu; + + // Number of SIMD's per compute unit CU + uint32_t simds_per_cu; + + // Number of Shader Engines (SE) in Gpu + uint32_t se_num; + + // Number of Shader Arrays Per Shader Engines in Gpu + uint32_t shader_arrays_per_se; }; class HsaRsrcFactory { @@ -213,14 +231,17 @@ class HsaRsrcFactory { // @param code_desc Handle of finalized Code Descriptor that could // be used to submit for execution // - // @return bool true if successful, false otherwise + // @return code buffer, non NULL if successful, NULL otherwise // - bool LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, char* kernel_name, - hsa_executable_symbol_t* code_desc); + void* LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, const char* kernel_name, + hsa_executable_t* hsa_exec, hsa_executable_symbol_t* code_desc); // Print the various fields of Hsa Gpu Agents bool PrintGpuAgents(const std::string& header); + // Submit AQL packet to given queue + static uint64_t Submit(hsa_queue_t* queue, void* packet); + // Return AqlProfile API table typedef hsa_ven_amd_aqlprofile_1_00_pfn_t aqlprofile_pfn_t; const aqlprofile_pfn_t* AqlProfileApi() const { return &aqlprofile_api_; } diff --git a/projects/rocprofiler/src/xml/expr.h b/projects/rocprofiler/src/xml/expr.h index ca4784871c..db864d0738 100644 --- a/projects/rocprofiler/src/xml/expr.h +++ b/projects/rocprofiler/src/xml/expr.h @@ -24,6 +24,7 @@ class div_zero_exception_t : public exception_t { }; typedef uint64_t args_t; +static const args_t ARGS_MAX = UINT64_MAX; typedef std::map args_map_t; class Expr; @@ -177,14 +178,19 @@ class Expr { bool SubCheck() const { return (sub_count_ == 0); } unsigned FindOp() const { unsigned i = pos_; + unsigned open_n = 0; while (i < expr_.length()) { switch (Symb(i)) { case '*': case '/': case '+': case '-': + goto end; case '(': + ++open_n; + break; case ')': + if (open_n != 0) i += 1; goto end; } ++i; @@ -263,6 +269,74 @@ class var_expr_t : public bin_expr_t { const 
std::string name_; }; +class fun_expr_t : public bin_expr_t { + public: + typedef std::vector vvect_t; + fun_expr_t(const std::string& fname, const std::string& vname, const uint32_t& vnum) : fname_(fname) { + for (uint32_t i = 0; i < vnum; ++i) { + std::ostringstream var_full_name; + var_full_name << vname << "[" << i << "]"; + vvect.push_back(var_expr_t(var_full_name.str())); + } + } + const vvect_t& GetVars() const { return vvect; } + std::string Symbol() const { + const std::string var = vvect[0].Symbol(); + const std::string vname = var.substr(0, var.length() - 3); + std::ostringstream oss; + std::string str("("); + str.back() = ')'; + oss << fname_ << "(" << vname << "," << vvect.size() << ")"; + return oss.str(); + } + + private: + const std::string fname_; + vvect_t vvect; +}; +class sum_expr_t : public fun_expr_t { + public: + sum_expr_t(const std::string& vname, const uint32_t& vnum) : fun_expr_t("sum", vname, vnum) {} + args_t Eval(const args_cache_t& args) const { + args_t result = 0; + for (const auto& var : GetVars()) result += var.Eval(args); + return result; + } +}; +class avr_expr_t : public fun_expr_t { + public: + avr_expr_t(const std::string& vname, const uint32_t& vnum) : fun_expr_t("avr", vname, vnum) {} + args_t Eval(const args_cache_t& args) const { + args_t result = 0; + for (const auto& var : GetVars()) result += var.Eval(args); + return result / GetVars().size(); + } +}; +class min_expr_t : public fun_expr_t { + public: + min_expr_t(const std::string& vname, const uint32_t& vnum) : fun_expr_t("min", vname, vnum) {} + args_t Eval(const args_cache_t& args) const { + args_t result = ARGS_MAX; + for (const auto& var : GetVars()) { + args_t val = var.Eval(args); + result = (val < result) ? 
val : result; + } + return result; + } +}; +class max_expr_t : public fun_expr_t { + public: + max_expr_t(const std::string& vname, const uint32_t& vnum) : fun_expr_t("max", vname, vnum) {} + args_t Eval(const args_cache_t& args) const { + args_t result = 0; + for (const auto& var : GetVars()) { + args_t val = var.Eval(args); + result = (val > result) ? val : result; + } + return result; + } +}; + inline const bin_expr_t* bin_expr_t::CreateExpr(const bin_expr_t* arg1, const bin_expr_t* arg2, const char op) { const bin_expr_t* expr = NULL; @@ -285,11 +359,41 @@ inline const bin_expr_t* bin_expr_t::CreateExpr(const bin_expr_t* arg1, const bi inline const bin_expr_t* bin_expr_t::CreateArg(Expr* obj, const std::string str) { const bin_expr_t* arg = NULL; + const unsigned i = strspn(str.c_str(), "1234567890"); if (i == str.length()) { const unsigned value = atoi(str.c_str()); arg = new const_expr_t(value); - } else { + } + + if (arg == NULL) { + const std::size_t pos = str.find('('); + if (pos != std::string::npos) { + char* fname = NULL; + char* vname = NULL; + int vnum = 0; + int ret = sscanf(str.c_str(), "%m[a-zA-Z_](%m[0-9a-zA-Z_],%d)", &fname, &vname, &vnum); + if (ret == 3) { + const std::string fun_name(fname); + const fun_expr_t* farg = NULL; + if (fun_name == "sum") { + farg = new sum_expr_t(vname, vnum); + } else if (fun_name == "avr") { + farg = new avr_expr_t(vname, vnum); + } else if (fun_name == "min") { + farg = new min_expr_t(vname, vnum); + } else if (fun_name == "max") { + farg = new max_expr_t(vname, vnum); + } + if (farg) for (const auto& var : farg->GetVars()) obj->AddVar(var.Symbol()); + arg = farg; + } + free(fname); + free(vname); + } + } + + if (arg == NULL) { const std::string sub_expr = obj->Lookup(str); if (sub_expr.empty()) { arg = new var_expr_t(str); @@ -299,6 +403,7 @@ inline const bin_expr_t* bin_expr_t::CreateArg(Expr* obj, const std::string str) arg = expr->GetTree(); } } + return arg; } diff --git a/projects/rocprofiler/src/xml/xml.h 
b/projects/rocprofiler/src/xml/xml.h index d6da6f779b..df2640c52a 100644 --- a/projects/rocprofiler/src/xml/xml.h +++ b/projects/rocprofiler/src/xml/xml.h @@ -36,11 +36,29 @@ class Xml { return xml; } + void AddExpr(const std::string& full_tag, const std::string& name, const std::string& expr) { + const std::size_t pos = full_tag.rfind('.'); + const std::size_t pos1 = (pos == std::string::npos) ? 0 : pos + 1; + const std::string level_tag = full_tag.substr(pos1); + level_t* level = new level_t; + map_[full_tag].push_back(level); + level->tag = level_tag; + level->opts["name"] = name; + level->opts["expr"] = expr; + } + + void AddConst(const std::string& full_tag, const std::string& name, const uint64_t& val) { + std::ostringstream oss; + oss << val; + AddExpr(full_tag, name, oss.str()); + } + static void Destroy(Xml *xml) { delete xml; } std::vector GetNodes(std::string global_tag) { return map_[global_tag]; } void Print() const { + std::cout << "XML file '" << file_name_ << "':" << std::endl; for (auto& elem : map_) { for (auto node : elem.second) { if (node->opts.size()) { @@ -74,13 +92,14 @@ class Xml { while (1) { token_t token = (remainder.size()) ? 
remainder : NextToken(); remainder.clear(); + +// token_t token1 = token; +// token1.push_back('\0'); +// std::cout << "> " << &token1[0] << std::endl; + // End of file if (token.size() == 0) break; - // token_t token1 = token; - // token1.push_back('\0'); - // std::cout << "> " << &token1[0] << std::endl; - switch (state_) { case BODY_STATE: if (token[0] == '<') { @@ -146,6 +165,11 @@ class Xml { ~Xml() {} + bool SpaceCheck() const { + bool cond = ((buffer_[index_] == ' ') || (buffer_[index_] == ' ')); + return cond; + } + bool LineEndCheck() { bool found = false; if (buffer_[index_] == '\n') { @@ -162,24 +186,55 @@ class Xml { token_t NextToken() { token_t token; + bool in_string = false; + bool special_symb = false; while (1) { if (data_size_ == 0) { data_size_ = read(fd_, buffer_, buf_size_); if (data_size_ <= 0) break; } + if (token.empty()) - while ((index_ < data_size_) && ((buffer_[index_] == ' ') || LineEndCheck())) { + while ((index_ < data_size_) && (SpaceCheck() || LineEndCheck())) { ++index_; } - while ((index_ < data_size_) && (buffer_[index_] != ' ') && !LineEndCheck()) { - token.push_back(buffer_[index_++]); + while ((index_ < data_size_) && (in_string || !(SpaceCheck() || LineEndCheck()))) { + const char symb = buffer_[index_]; + bool skip_symb = false; + + switch (symb) { + case '\\': + if (special_symb) special_symb = false; + else { + special_symb = true; + skip_symb = true; + } + break; + case '"': + if (special_symb) special_symb = false; + else { + in_string = !in_string; + if (!in_string) { + buffer_[index_] = ' '; + --index_; + } + skip_symb = true; + } + break; + } + + if (!skip_symb) token.push_back(symb); + ++index_; } + if (index_ == data_size_) { index_ = 0; data_size_ = 0; - } else + } else { + if (special_symb || in_string) BadFormat(token); break; + } } return token; diff --git a/projects/rocprofiler/test/ctrl/test_aql.h b/projects/rocprofiler/test/ctrl/test_aql.h index 4f2f65d690..9a5167fe14 100644 --- 
a/projects/rocprofiler/test/ctrl/test_aql.h +++ b/projects/rocprofiler/test/ctrl/test_aql.h @@ -42,7 +42,7 @@ class TestAql { } TestAql* Test() { return test_; } - virtual AgentInfo* GetAgentInfo() { return (test_) ? test_->GetAgentInfo() : 0; } + virtual const AgentInfo* GetAgentInfo() { return (test_) ? test_->GetAgentInfo() : 0; } virtual hsa_queue_t* GetQueue() { return (test_) ? test_->GetQueue() : 0; } virtual HsaRsrcFactory* GetRsrcFactory() { return (test_) ? test_->GetRsrcFactory() : 0; } diff --git a/projects/rocprofiler/test/ctrl/test_hsa.cpp b/projects/rocprofiler/test/ctrl/test_hsa.cpp index 5a423d6622..7de33c9d92 100644 --- a/projects/rocprofiler/test/ctrl/test_hsa.cpp +++ b/projects/rocprofiler/test/ctrl/test_hsa.cpp @@ -34,7 +34,7 @@ OF THE POSSIBILITY OF SUCH DAMAGE. #include "util/hsa_rsrc_factory.h" HsaRsrcFactory* TestHsa::hsa_rsrc_ = NULL; -AgentInfo* TestHsa::agent_info_ = NULL; +const AgentInfo* TestHsa::agent_info_ = NULL; hsa_queue_t* TestHsa::hsa_queue_ = NULL; uint32_t TestHsa::agent_id_ = 0; @@ -43,7 +43,7 @@ HsaRsrcFactory* TestHsa::HsaInstantiate(const uint32_t agent_ind) { if (hsa_rsrc_ == NULL) { agent_id_ = agent_ind; - hsa_rsrc_ = HsaRsrcFactory::Create(); + hsa_rsrc_ = HsaRsrcFactory::CreateInstance(); // Print properties of the agents hsa_rsrc_->PrintGpuAgents("> GPU agents"); diff --git a/projects/rocprofiler/test/ctrl/test_hsa.h b/projects/rocprofiler/test/ctrl/test_hsa.h index 2b3b29d72a..300ef48685 100644 --- a/projects/rocprofiler/test/ctrl/test_hsa.h +++ b/projects/rocprofiler/test/ctrl/test_hsa.h @@ -52,7 +52,7 @@ class TestHsa : public TestAql { } // Get methods for Agent Info, HAS queue, HSA Resourcse Manager - AgentInfo* GetAgentInfo() { return agent_info_; } + const AgentInfo* GetAgentInfo() { return agent_info_; } hsa_queue_t* GetQueue() { return hsa_queue_; } HsaRsrcFactory* GetRsrcFactory() { return hsa_rsrc_; } @@ -115,7 +115,7 @@ class TestHsa : public TestAql { static uint32_t agent_id_; // Handle to an Hsa Gpu 
Agent - static AgentInfo* agent_info_; + static const AgentInfo* agent_info_; // Handle to an Hsa Queue static hsa_queue_t* hsa_queue_; diff --git a/projects/rocprofiler/test/ctrl/tool.cpp b/projects/rocprofiler/test/ctrl/tool.cpp index cfc616782b..d372f13ae0 100644 --- a/projects/rocprofiler/test/ctrl/tool.cpp +++ b/projects/rocprofiler/test/ctrl/tool.cpp @@ -371,6 +371,7 @@ CONSTRUCTOR_API void constructor() fprintf(stderr, "Input file not found '%s'\n", xml_name); exit(1); } + xml->Print(); // Getting metrics auto metrics_list = xml->GetNodes("top.metric"); diff --git a/projects/rocprofiler/test/metrics.xml b/projects/rocprofiler/test/metrics.xml index 6bd37d507d..982524805e 100644 --- a/projects/rocprofiler/test/metrics.xml +++ b/projects/rocprofiler/test/metrics.xml @@ -8,11 +8,23 @@ + + + + + + + + + + + + @@ -22,37 +34,32 @@ + + + + + # average for (16 instances x 4 shader engines) - - # sun for 16 instances - - - - + + # sum for 16 instances + + + + + # FETCH_SIZE, kilobytes # The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. - + @@ -63,13 +70,25 @@ - - + + + + + + + + + + + + + + @@ -86,60 +105,178 @@ # average for (16 instances x 4 shader engines) - + # sum for 16 instances - - - - - - + + + + + + # FETCH_SIZE, kilobytes # The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. - + # GPU_BUSY, percentage # The percentage of time GPU was busy. - + - # MEM_BUSY, percentage - # The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled). This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: 0% to 100% (fetch-bound). 
- + # Wavefronts Total wavefronts., + - # VWRITE_INSTS - # The average number of vector write instructions to the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that write to video memory. - + # VALUInsts The average number of vector ALU instructions executed per work-item (affected by flow control). + - # SFETCH_INSTS - # The average number of scalar fetch instructions from the video memory executed per work-item (affected by flow control). - + # SALUInsts The average number of scalar ALU instructions executed per work-item (affected by flow control). + + + # VFetchInsts The average number of vector fetch instructions from the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that fetch from video memory. + - # VALU_INSTS - # The average number of vector ALU instructions executed per work-item (affected by flow control). - + # SFetchInsts The average number of scalar fetch instructions from the video memory executed per work-item (affected by flow control). + + + # VWriteInsts The average number of vector write instructions to the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that write to video memory. + + + # FlatVMemInsts The average number of FLAT instructions that read from or write to the video memory executed per work item (affected by flow control). Includes FLAT instructions that read from or write to scratch. + + + # LDSInsts The average number of LDS read or LDS write instructions executed per work item (affected by flow control). Excludes FLAT instructions that read from or write to LDS. + + + # FlatLDSInsts The average number of FLAT instructions that read or write to LDS executed per work item (affected by flow control). + + + # GDSInsts The average number of GDS read or GDS write instructions executed per work item (affected by flow control). + + + # VALUUtilization The percentage of active vector ALU threads in a wave. 
A lower number can mean either more thread divergence in a wave or that the work-group size is not a multiple of 64. Value range: 0% (bad), 100% (ideal - no thread divergence). + + + # VALUBusy The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal). + + + # SALUBusy The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad) to 100% (optimal). + + + # FetchSize The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. + + + # WriteSize The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. + + + # L2CacheHit The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal). + + + # MemUnitBusy The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled). This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: 0% to 100% (fetch-bound). + + + # MemUnitStalled The percentage of GPUTime the memory unit is stalled. Try reducing the number or size of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad). + + + # WriteUnitStalled The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad). + + + # ALUStalledByLDS The percentage of GPUTime ALU units are stalled by the LDS input queue being full or the output queue being not ready. If there are LDS bank conflicts, reduce them. Otherwise, try reducing the number of LDS accesses if possible. Value range: 0% (optimal) to 100% (bad). + + + # LDSBankConflict The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad). + - # L2CACHE_HIT, percentage - # The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache.
Value range: 0% (no hit) to 100% (optimal). - diff --git a/projects/rocprofiler/test/run.sh b/projects/rocprofiler/test/run.sh index 9a3aed3744..e02cca3ad9 100755 --- a/projects/rocprofiler/test/run.sh +++ b/projects/rocprofiler/test/run.sh @@ -21,7 +21,7 @@ export ROCP_METRICS=metrics.xml export ROCP_INPUT=input.xml # output directory for the tool library, for metrics results file 'results.txt' # and SQTT trace files 'thread_trace.se.out' -#export ROCP_OUTPUT_DIR=./ +export ROCP_OUTPUT_DIR=./RESULTS if [ -n "$1" ] ; then tbin="$*" diff --git a/projects/rocprofiler/test/util/hsa_rsrc_factory.cpp b/projects/rocprofiler/test/util/hsa_rsrc_factory.cpp index 6e5573b1b2..50427a84ed 100644 --- a/projects/rocprofiler/test/util/hsa_rsrc_factory.cpp +++ b/projects/rocprofiler/test/util/hsa_rsrc_factory.cpp @@ -26,6 +26,7 @@ POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include @@ -39,8 +40,17 @@ POSSIBILITY OF SUCH DAMAGE. #include #include +// Callback function to enumerate the agents available in the system +hsa_status_t HsaRsrcFactory::GetHsaAgentsCallback(hsa_agent_t agent, void* data) { + hsa_status_t status = HSA_STATUS_ERROR; + HsaRsrcFactory* hsa_rsrc = reinterpret_cast(data); + const AgentInfo* agent_info = hsa_rsrc->AddAgentInfo(agent); + if (agent_info != NULL) status = HSA_STATUS_SUCCESS; + return status; +} + // Callback function to find and bind kernarg region of an agent -static hsa_status_t FindMemRegionsCallback(hsa_region_t region, void* data) { +hsa_status_t HsaRsrcFactory::FindMemRegionsCallback(hsa_region_t region, void* data) { hsa_region_global_flag_t flags; hsa_region_segment_t segment_id; @@ -62,53 +72,6 @@ static hsa_status_t FindMemRegionsCallback(hsa_region_t region, void* data) { return HSA_STATUS_SUCCESS; } -// Callback function to get the number of agents -static hsa_status_t GetHsaAgentsCallback(hsa_agent_t agent, void* data) { - // Copy handle of agent and increment number of agents reported - HsaRsrcFactory* rsrcFactory =
reinterpret_cast(data); - - // Determine if device is a Gpu agent - hsa_status_t status; - hsa_device_type_t type; - status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type); - CHECK_STATUS("Error Calling hsa_agent_get_info", status); - if (type == HSA_DEVICE_TYPE_DSP) { - return HSA_STATUS_SUCCESS; - } - - if (type == HSA_DEVICE_TYPE_CPU) { - AgentInfo* agent_info = reinterpret_cast(malloc(sizeof(AgentInfo))); - agent_info->dev_id = agent; - agent_info->dev_type = HSA_DEVICE_TYPE_CPU; - rsrcFactory->AddAgentInfo(agent_info, false); - return HSA_STATUS_SUCCESS; - } - - // Device is a Gpu agent, build an instance of AgentInfo - AgentInfo* agent_info = reinterpret_cast(malloc(sizeof(AgentInfo))); - agent_info->dev_id = agent; - agent_info->dev_type = HSA_DEVICE_TYPE_GPU; - hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name); - agent_info->max_wave_size = 0; - hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size); - agent_info->max_queue_size = 0; - hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size); - agent_info->profile = hsa_profile_t(108); - hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile); - - // Initialize memory regions to zero - agent_info->kernarg_region.handle = 0; - agent_info->coarse_region.handle = 0; - - // Find and Bind Memory regions of the Gpu agent - hsa_agent_iterate_regions(agent, FindMemRegionsCallback, agent_info); - - // Save the instance of AgentInfo - rsrcFactory->AddAgentInfo(agent_info, true); - - return HSA_STATUS_SUCCESS; -} - // Constructor of the class HsaRsrcFactory::HsaRsrcFactory() { // Initialize the Hsa Runtime @@ -128,12 +91,17 @@ HsaRsrcFactory::HsaRsrcFactory() { status = hsa_system_get_extension_table(HSA_EXTENSION_AMD_AQLPROFILE, 1, 0, &aqlprofile_api_); #endif CHECK_STATUS("aqlprofile API table load failed", status); + + // Get Loader API table + loader_api_ = {0}; + status = 
hsa_system_get_extension_table(HSA_EXTENSION_AMD_LOADER, 1, 0, &loader_api_); + CHECK_STATUS("loader API table query failed", status); } // Destructor of the class HsaRsrcFactory::~HsaRsrcFactory() { - for (auto p : cpu_list_) free(p); - for (auto p : gpu_list_) free(p); + for (auto p : cpu_list_) delete p; + for (auto p : gpu_list_) delete p; printf("HSA shutdown\n"); hsa_status_t status = hsa_shut_down(); @@ -173,6 +141,68 @@ hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) { return HSA_STATUS_SUCCESS; } +// Add system agent info +const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { + // Determine if device is a Gpu agent + hsa_status_t status; + AgentInfo* agent_info = NULL; + + hsa_device_type_t type; + status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type); + CHECK_STATUS("Error Calling hsa_agent_get_info", status); + + if (type == HSA_DEVICE_TYPE_CPU) { + agent_info = new AgentInfo{}; + agent_info->dev_id = agent; + agent_info->dev_type = HSA_DEVICE_TYPE_CPU; + agent_info->dev_index = cpu_list_.size(); + cpu_list_.push_back(agent_info); + } + + if (type == HSA_DEVICE_TYPE_GPU) { + agent_info = new AgentInfo{}; + agent_info->dev_id = agent; + agent_info->dev_type = HSA_DEVICE_TYPE_GPU; + hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name); + strncpy(agent_info->gfxip, agent_info->name, 4); + agent_info->gfxip[4] = '\0'; + hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size); + hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size); + hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile); + agent_info->is_apu = (agent_info->profile == HSA_PROFILE_FULL) ?
true : false; + hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT), &agent_info->cu_num); + hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU), &agent_info->waves_per_cu); + hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU), &agent_info->simds_per_cu); + hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES), &agent_info->se_num); + hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE), &agent_info->shader_arrays_per_se); + + // Initialize memory regions to zero + agent_info->kernarg_region.handle = 0; + agent_info->coarse_region.handle = 0; + + // Find and Bind Memory regions of the Gpu agent + hsa_agent_iterate_regions(agent, FindMemRegionsCallback, agent_info); + + // Set GPU index + agent_info->dev_index = gpu_list_.size(); + gpu_list_.push_back(agent_info); + } + + if (agent_info) agent_map_[agent.handle] = agent_info; + + return agent_info; +} + +// Return system agent info +const AgentInfo* HsaRsrcFactory::GetAgentInfo(const hsa_agent_t agent) { + const AgentInfo* agent_info = NULL; + auto it = agent_map_.find(agent.handle); + if (it != agent_map_.end()) { + agent_info = it->second; + } + return agent_info; +} + // Get the count of Hsa Gpu Agents available on the platform // // @return uint32_t Number of Gpu agents on platform @@ -193,7 +223,7 @@ uint32_t HsaRsrcFactory::GetCountOfCpuAgents() { return uint32_t(cpu_list_.size( // // @return bool true if successful, false otherwise // -bool HsaRsrcFactory::GetGpuAgentInfo(uint32_t idx, AgentInfo** agent_info) { +bool HsaRsrcFactory::GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) { // Determine if request is valid uint32_t size = uint32_t(gpu_list_.size()); if (idx >= size) { @@ -202,6 +232,7 @@ bool HsaRsrcFactory::GetGpuAgentInfo(uint32_t idx, AgentInfo** agent_info) { // Copy AgentInfo from specified index *agent_info = gpu_list_[idx]; + return true; } @@ -213,7 +244,7 @@
bool HsaRsrcFactory::GetGpuAgentInfo(uint32_t idx, AgentInfo** agent_info) { // // @return bool true if successful, false otherwise // -bool HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, AgentInfo** agent_info) { +bool HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) { // Determine if request is valid uint32_t size = uint32_t(cpu_list_.size()); if (idx >= size) { @@ -236,7 +267,8 @@ bool HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, AgentInfo** agent_info) { // // @return bool true if successful, false otherwise // -bool HsaRsrcFactory::CreateQueue(AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue) { +bool HsaRsrcFactory::CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, + hsa_queue_t** queue) { hsa_status_t status; status = hsa_queue_create(agent_info->dev_id, num_pkts, HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT32_MAX, UINT32_MAX, queue); @@ -324,7 +356,7 @@ bool HsaRsrcFactory::TransferData(void* dest_buff, void* src_buff, uint32_t leng // // @return bool true if successful, false otherwise // -void* HsaRsrcFactory::LoadAndFinalize(AgentInfo* agent_info, const char* brig_path, +void* HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, const char* kernel_name, hsa_executable_t* hsa_exec, hsa_executable_symbol_t* code_desc) { // Finalize the Hsail object into code object hsa_status_t status; @@ -387,32 +419,27 @@ void* HsaRsrcFactory::LoadAndFinalize(AgentInfo* agent_info, const char* brig_pa return code_buf; } -// Add an instance of AgentInfo representing a Hsa Gpu agent -void HsaRsrcFactory::AddAgentInfo(AgentInfo* agent_info, bool gpu) { - // Add input to Gpu list - if (gpu) { - gpu_list_.push_back(agent_info); - return; - } - - // Add input to Cpu list - cpu_list_.push_back(agent_info); -} - // Print the various fields of Hsa Gpu Agents bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) { std::clog << header << " :" << std::endl; - AgentInfo* agent_info; + const AgentInfo* 
agent_info; int size = uint32_t(gpu_list_.size()); for (int idx = 0; idx < size; idx++) { agent_info = gpu_list_[idx]; std::clog << "> agent[" << idx << "] :" << std::endl; std::clog << ">> Name : " << agent_info->name << std::endl; + std::clog << ">> APU : " << agent_info->is_apu << std::endl; + std::clog << ">> HSAIL profile : " << agent_info->profile << std::endl; std::clog << ">> Max Wave Size : " << agent_info->max_wave_size << std::endl; std::clog << ">> Max Queue Size : " << agent_info->max_queue_size << std::endl; std::clog << ">> Kernarg Region Id : " << agent_info->coarse_region.handle << std::endl; + std::clog << ">> CU number : " << agent_info->cu_num << std::endl; + std::clog << ">> Waves per CU : " << agent_info->waves_per_cu << std::endl; + std::clog << ">> SIMDs per CU : " << agent_info->simds_per_cu << std::endl; + std::clog << ">> SE number : " << agent_info->se_num << std::endl; + std::clog << ">> Shader Arrays per SE : " << agent_info->shader_arrays_per_se << std::endl; } return true; } diff --git a/projects/rocprofiler/test/util/hsa_rsrc_factory.h b/projects/rocprofiler/test/util/hsa_rsrc_factory.h index c32b83f3b5..b8d9ff326a 100644 --- a/projects/rocprofiler/test/util/hsa_rsrc_factory.h +++ b/projects/rocprofiler/test/util/hsa_rsrc_factory.h @@ -28,6 +28,7 @@ POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -35,6 +36,7 @@ POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include @@ -52,6 +54,7 @@ POSSIBILITY OF SUCH DAMAGE. static const unsigned MEM_PAGE_BYTES = 0x1000; static const unsigned MEM_PAGE_MASK = MEM_PAGE_BYTES - 1; +typedef decltype(hsa_agent_t::handle) hsa_agent_handle_t; // Encapsulates information about a Hsa Agent such as its // handle, name, max queue size, max wavefront size, etc. 
@@ -62,6 +65,15 @@ struct AgentInfo { // Agent type - Cpu = 0, Gpu = 1 or Dsp = 2 uint32_t dev_type; + // APU flag + bool is_apu; + + // Agent system index + uint32_t dev_index; + + // GFXIP name + char gfxip[64]; + // Name of Agent whose length is less than 64 char name[64]; @@ -79,31 +91,52 @@ struct AgentInfo { // Memory region supporting kernel arguments hsa_region_t kernarg_region; + + // The number of compute units available in the agent. + uint32_t cu_num; + + // Maximum number of waves possible in a Compute Unit. + uint32_t waves_per_cu; + + // Number of SIMDs per compute unit (CU) + uint32_t simds_per_cu; + + // Number of Shader Engines (SE) in Gpu + uint32_t se_num; + + // Number of Shader Arrays per Shader Engine in Gpu + uint32_t shader_arrays_per_se; }; class HsaRsrcFactory { public: typedef std::recursive_mutex mutex_t; - static HsaRsrcFactory* Create() { + static HsaRsrcFactory* Create() { return NULL; } + + static HsaRsrcFactory* CreateInstance() { std::lock_guard lck(mutex_); - if (HsaRsrcFactory::instance_ == NULL) { - HsaRsrcFactory::instance_ = new HsaRsrcFactory(); + if (instance_ == NULL) { + instance_ = new HsaRsrcFactory(); } return instance_; } + static HsaRsrcFactory& Instance() { + CreateInstance(); + hsa_status_t status = (instance_ != NULL) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; + CHECK_STATUS("HsaRsrcFactory::Instance() is not found", status); + return *instance_; + } + static void Destroy() { std::lock_guard lck(mutex_); if (instance_) delete instance_; instance_ = NULL; } - static HsaRsrcFactory& Instance() { - hsa_status_t status = (instance_ != NULL) ?
HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; - CHECK_STATUS("HsaRsrcFactory::Instance()", status); - return *instance_; - } + // Return system agent info + const AgentInfo* GetAgentInfo(const hsa_agent_t agent); // Get the count of Hsa Gpu Agents available on the platform // @@ -125,7 +158,7 @@ class HsaRsrcFactory { // // @return bool true if successful, false otherwise // - bool GetGpuAgentInfo(uint32_t idx, AgentInfo** agent_info); + bool GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info); // Get the AgentInfo handle of a Cpu device // @@ -135,7 +168,7 @@ class HsaRsrcFactory { // // @return bool true if successful, false otherwise // - bool GetCpuAgentInfo(uint32_t idx, AgentInfo** agent_info); + bool GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info); // Create a Queue object and return its handle. The queue object is expected // to support user requested number of Aql dispatch packets. @@ -148,7 +181,7 @@ class HsaRsrcFactory { // // @return bool true if successful, false otherwise // - bool CreateQueue(AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue); + bool CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue); // Create a Signal object and return its handle. 
// @@ -198,12 +231,9 @@ class HsaRsrcFactory { // // @return code buffer, non NULL if successful, NULL otherwise // - void* LoadAndFinalize(AgentInfo* agent_info, const char* brig_path, const char* kernel_name, + void* LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, const char* kernel_name, hsa_executable_t* hsa_exec, hsa_executable_symbol_t* code_desc); - // Add an instance of AgentInfo representing a Hsa Gpu agent - void AddAgentInfo(AgentInfo* agent_info, bool gpu); - // Print the various fields of Hsa Gpu Agents bool PrintGpuAgents(const std::string& header); @@ -214,7 +244,16 @@ class HsaRsrcFactory { typedef hsa_ven_amd_aqlprofile_1_00_pfn_t aqlprofile_pfn_t; const aqlprofile_pfn_t* AqlProfileApi() const { return &aqlprofile_api_; } + // Return Loader API table + const hsa_ven_amd_loader_1_00_pfn_t* LoaderApi() const { return &loader_api_; } + private: + // System agents iterating callback + static hsa_status_t GetHsaAgentsCallback(hsa_agent_t agent, void* data); + + // Callback function to find and bind kernarg region of an agent + static hsa_status_t FindMemRegionsCallback(hsa_region_t region, void* data); + // Load AQL profile HSA extension library directly static hsa_status_t LoadAqlProfileLib(aqlprofile_pfn_t* api); @@ -225,17 +264,26 @@ class HsaRsrcFactory { // Destructor of the class ~HsaRsrcFactory(); + // Add an instance of AgentInfo representing a Hsa Gpu agent + const AgentInfo* AddAgentInfo(const hsa_agent_t agent); + static HsaRsrcFactory* instance_; static mutex_t mutex_; // Used to maintain a list of Hsa Gpu Agent Info - std::vector gpu_list_; + std::vector gpu_list_; // Used to maintain a list of Hsa Cpu Agent Info - std::vector cpu_list_; + std::vector cpu_list_; + + // System agents map + std::map agent_map_; // AqlProfile API table aqlprofile_pfn_t aqlprofile_api_; + + // Loader API table + hsa_ven_amd_loader_1_00_pfn_t loader_api_; }; #endif // TEST_UTIL_HSA_RSRC_FACTORY_H_ diff --git 
a/projects/rocprofiler/test/util/xml.h b/projects/rocprofiler/test/util/xml.h index 890c687a43..e01896cc00 100644 --- a/projects/rocprofiler/test/util/xml.h +++ b/projects/rocprofiler/test/util/xml.h @@ -41,6 +41,7 @@ class Xml { std::vector GetNodes(std::string global_tag) { return map_[global_tag]; } void Print() const { + std::cout << "XML file '" << file_name_ << "':" << std::endl; for (auto& elem : map_) { for (auto node : elem.second) { if (node->opts.size()) { @@ -74,13 +75,14 @@ class Xml { while (1) { token_t token = (remainder.size()) ? remainder : NextToken(); remainder.clear(); + +// token_t token1 = token; +// token1.push_back('\0'); +// std::cout << "> " << &token1[0] << std::endl; + // End of file if (token.size() == 0) break; - // token_t token1 = token; - // token1.push_back('\0'); - // std::cout << "> " << &token1[0] << std::endl; - switch (state_) { case BODY_STATE: if (token[0] == '<') { @@ -146,6 +148,11 @@ class Xml { ~Xml() {} + bool SpaceCheck() const { + bool cond = ((buffer_[index_] == ' ') || (buffer_[index_] == '\t')); + return cond; + } + bool LineEndCheck() { bool found = false; if (buffer_[index_] == '\n') { @@ -162,24 +169,55 @@ class Xml { token_t NextToken() { token_t token; + bool in_string = false; + bool special_symb = false; while (1) { if (data_size_ == 0) { data_size_ = read(fd_, buffer_, buf_size_); if (data_size_ <= 0) break; } + if (token.empty()) - while ((index_ < data_size_) && ((buffer_[index_] == ' ') || LineEndCheck())) { + while ((index_ < data_size_) && (SpaceCheck() || LineEndCheck())) { ++index_; } - while ((index_ < data_size_) && (buffer_[index_] != ' ') && !LineEndCheck()) { - token.push_back(buffer_[index_++]); + while ((index_ < data_size_) && (in_string || !(SpaceCheck() || LineEndCheck()))) { + const char symb = buffer_[index_]; + bool skip_symb = false; + + switch (symb) { + case '\\': + if (special_symb) special_symb = false; + else { + special_symb = true; + skip_symb = true; + } + break; + case '"': +
if (special_symb) special_symb = false; + else { + in_string = !in_string; + if (!in_string) { + buffer_[index_] = ' '; + --index_; + } + skip_symb = true; + } + break; + } + + if (!skip_symb) token.push_back(symb); + ++index_; } + if (index_ == data_size_) { index_ = 0; data_size_ = 0; - } else + } else { + if (special_symb || in_string) BadFormat(token); break; + } } return token;