resources releasing; metrics

[ROCm/rocprofiler commit: 79510b8c6c]
2018-02-01 14:52:21 -06:00
@@ -78,9 +78,8 @@ extern "C" {

 // Profiling feature kind
 typedef enum {
-  ROCPROFILER_FEATURE_KIND_COUNTER = 0,
-  ROCPROFILER_FEATURE_KIND_METRIC = 1,
-  ROCPROFILER_FEATURE_KIND_TRACE = 2
+  ROCPROFILER_FEATURE_KIND_METRIC = 0,
+  ROCPROFILER_FEATURE_KIND_TRACE = 1
 } rocprofiler_feature_kind_t;

 // Profiling feture parameter
@@ -200,7 +200,7 @@ class Context {
      const rocprofiler_feature_kind_t kind = info->kind;
      const char* name = info->name;

-      if (kind != ROCPROFILER_FEATURE_KIND_TRACE) {  // Processing metrics features
+      if (kind == ROCPROFILER_FEATURE_KIND_METRIC) {  // Processing metrics features
        const Metric* metric = metrics_->Get(name);
        if (metric == NULL)
          EXC_RAISING(HSA_STATUS_ERROR, "input metric '" << name << "' is not found");
@@ -38,7 +38,7 @@ class Metric {
  virtual const xml::Expr* GetExpr() const = 0;

 private:
-  std::string name_;
+  const std::string name_;
 };

 class BaseMetric : public Metric {
@@ -107,10 +107,38 @@ class MetricsDict {
    }
  }

-  const Metric* Get(const std::string& name) const {
+  const Metric* Get(const std::string& name) {
    const Metric* metric = NULL;
+    // Imported metrics are served from cache_; a raw "BLOCK[idx]:event" name is translated on first use.
    auto it = cache_.find(name);
    if (it != cache_.end()) metric = it->second;
+    else {
+      const std::size_t pos = name.find(':');
+      if (pos != std::string::npos) {
+        std::string block_name = name.substr(0, pos);
+        const std::string event_str = name.substr(pos + 1);
+        // An optional "[idx]" suffix on the block part selects the block instance.
+        uint32_t block_index = 0;
+        bool indexed = false;
+        const std::size_t pos1 = block_name.find('[');
+        if (pos1 != std::string::npos) {
+          const std::size_t pos2 = block_name.find(']');
+          if ((pos2 == std::string::npos) || (pos2 < pos1)) EXC_RAISING(HSA_STATUS_ERROR, "Malformed metric name '" << name << "'");
+          block_name = name.substr(0, pos1);
+          const std::string block_index_str = name.substr(pos1 + 1, pos2 - (pos1 + 1));
+          block_index = atol(block_index_str.c_str());
+          indexed = true;
+        }
+        // Multi-instance blocks need an explicit index; the new metric is stored in cache_ so the dictionary owns it.
+        const hsa_ven_amd_aqlprofile_id_query_t query = Translate(agent_info_, block_name);
+        const hsa_ven_amd_aqlprofile_block_name_t block_id = (hsa_ven_amd_aqlprofile_block_name_t)query.id;
+        if ((query.instance_count > 1) && (indexed == false)) EXC_RAISING(HSA_STATUS_ERROR, "Malformed indexed metric name '" << name << "'");
+        const uint32_t event_id = atol(event_str.c_str());
+        const counter_t counter = {name, {block_id, block_index, event_id}};
+        metric = AddMetric(name, name, counter);
+      }
+    }
+    // NULL: name is neither cached nor "BLOCK:event". NOTE(review): Get() may insert into cache_ - confirm callers serialize.
    return metric;
  }

@@ -119,12 +147,15 @@ class MetricsDict {
  const_iterator_t End() const { return cache_.end(); }

 private:
-  MetricsDict(const util::AgentInfo* agent_info) : xml_(NULL) {
+  MetricsDict(const util::AgentInfo* agent_info) : xml_(NULL), agent_info_(agent_info) {
    const char* xml_name = getenv("ROCP_METRICS");
    if (xml_name != NULL) {
      xml_ = xml::Xml::Create(xml_name);
      if (xml_ == NULL) EXC_RAISING(HSA_STATUS_ERROR, "metrics .xml open error '" << xml_name << "'");
-      std::cout << "ROCProfiler: importing metrics from '" << xml_name << "':" << std::endl;
+      xml_->AddConst("top.const.metric", "NUM_SIMDS", 64);
+      xml_->AddConst("top.const.metric", "NUM_SHADER_ENGINES", 4);
+      std::cout << "ROCProfiler: importing '" << xml_name << "':" << std::endl;
+      ImportMetrics(agent_info, "const");
      ImportMetrics(agent_info, agent_info->gfxip);
      ImportMetrics(agent_info, "global");
    }
@@ -135,47 +166,53 @@ class MetricsDict {
    for (auto& entry : cache_) delete entry.second;
  }

-  void ImportMetrics(const util::AgentInfo* agent_info, const char* scope) {
-    auto scope_list = xml_->GetNodes("top." + std::string(scope) + ".metric");
+  // Queries the aqlprofile API for the block named 'block_name' on the given agent; raises if the query fails.
+  static hsa_ven_amd_aqlprofile_id_query_t Translate(const util::AgentInfo* agent_info, const std::string& block_name) {
+    hsa_ven_amd_aqlprofile_profile_t profile = {};  // zero all fields; only 'agent' is set for this query
+    profile.agent = agent_info->dev_id;
+    hsa_ven_amd_aqlprofile_id_query_t query = {block_name.c_str(), 0, 0};
+    const hsa_status_t status = util::HsaRsrcFactory::Instance().AqlProfileApi()->hsa_ven_amd_aqlprofile_get_info(
+        &profile, HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_ID, &query);
+    if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(HSA_STATUS_ERROR, "ImportMetrics: bad block name '" << block_name << "'");
+    return query;
+  }
+
+  void ImportMetrics(const util::AgentInfo* agent_info, const std::string& scope) {
+    auto scope_list = xml_->GetNodes("top." + scope + ".metric");
    if (!scope_list.empty()) {
      std::cout << "  " << scope_list.size() << " " << scope << " metrics found" << std::endl;

      for (auto node : scope_list) {
        const std::string name = node->opts["name"];
-        if (cache_.find(name) != cache_.end())
-          EXC_RAISING(HSA_STATUS_ERROR, "ImportMetrics: metrics redefined '" << name << "'");
-
        const std::string expr_str = node->opts["expr"];
+        std::string descr = node->opts["descr"];
+        if (descr.empty()) descr = (expr_str.empty()) ? name : expr_str;
+
        if (expr_str.empty()) {
          const std::string block_name = node->opts["block"];
-          const uint32_t event_id = atoi(node->opts["event"].c_str());
+          const std::string event_str = node->opts["event"];
+          const uint32_t event_id = atol(event_str.c_str());

-          hsa_ven_amd_aqlprofile_profile_t profile;
-          profile.agent = agent_info->dev_id;
-          hsa_ven_amd_aqlprofile_id_query_t query = {block_name.c_str(), 0, 0};
-          hsa_status_t status =
-              util::HsaRsrcFactory::Instance().AqlProfileApi()->hsa_ven_amd_aqlprofile_get_info(
-                  &profile, HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_ID, &query);
-          if (status == HSA_STATUS_SUCCESS) {
-            const hsa_ven_amd_aqlprofile_block_name_t block_id =
-                (hsa_ven_amd_aqlprofile_block_name_t)query.id;
-            if (query.instance_count > 1) {
-              for (unsigned block_index = 0; block_index < query.instance_count; ++block_index) {
-                std::ostringstream os;
-                os << name << '[' << block_index << ']';
-                const std::string full_name = os.str();
-                const counter_t counter = {full_name, {block_id, block_index, event_id}};
-                cache_[full_name] = new BaseMetric(full_name, counter);
-              }
-            } else {
-              const counter_t counter = {name, {block_id, 0, event_id}};
-              cache_[name] = new BaseMetric(name, counter);
+          const hsa_ven_amd_aqlprofile_id_query_t query = Translate(agent_info, block_name);
+          const hsa_ven_amd_aqlprofile_block_name_t block_id = (hsa_ven_amd_aqlprofile_block_name_t)query.id;
+          if (query.instance_count > 1) {
+            for (unsigned block_index = 0; block_index < query.instance_count; ++block_index) {
+              std::ostringstream full_name;
+              full_name << name << '[' << block_index << ']';
+              std::ostringstream alias;
+              alias << block_name << "[" << block_index << "]:" << event_str;
+              const counter_t counter = {full_name.str(), {block_id, block_index, event_id}};
+              AddMetric(full_name.str(), alias.str(), counter);
            }
-          } else
-            AQL_EXC_RAISING(HSA_STATUS_ERROR, "ImportMetrics: bad block name '" << block_name
-                                                                                << "'");
+          } else {
+            const std::string alias = block_name + ":" + event_str;
+            const counter_t counter = {name, {block_id, 0, event_id}};
+            AddMetric(name, alias, counter);
+          }
        } else {
          xml::Expr* expr_obj = new xml::Expr(expr_str, new ExprCache(&cache_));
+          std::cout << "# " << descr << std::endl;
+          std::cout << name << "=" << expr_obj->String() << "\n" << std::endl;
          counters_vec_t counters_vec;
          for (const std::string var : expr_obj->GetVars()) {
            auto it = cache_.find(var);
@@ -184,12 +221,46 @@ class MetricsDict {
                                                           << "' is not found");
            it->second->GetCounters(counters_vec);
          }
-          cache_[name] = new ExprMetric(name, counters_vec, expr_obj);
+          AddMetric(name, counters_vec, expr_obj);
        }
      }
    }
  }

+  // Registers a metric backed by one counter under 'name'; raises if the name is already present.
+  const Metric* AddMetric(const std::string& name, const std::string& /*alias*/, const counter_t& counter) {
+    // Claim the cache slot first, so a duplicate name is rejected before anything is allocated.
+    const auto slot = cache_.insert({name, NULL});
+    if (!slot.second) EXC_RAISING(HSA_STATUS_ERROR, "metric redefined '" << name << "'");
+    const Metric* created = new BaseMetric(name, counter);
+    slot.first->second = created;
+#if 0
+    if (alias != name) {
+      if (cache_.find(alias) != cache_.end()) EXC_RAISING(HSA_STATUS_ERROR, "metric alias/name interference '" << alias << "'");
+      const auto ret = aliases_.insert({alias, name});
+      if (!ret.second) EXC_RAISING(HSA_STATUS_ERROR, "metric alias redefined '" << alias << "'");
+    }
+#endif
+    return created;
+  }
+
+  // Registers an expression metric built from 'counters_vec' and 'expr_obj' under 'name'.
+  // Raises if a metric with the same name is already present in the cache.
+  const Metric* AddMetric(const std::string& name, const counters_vec_t& counters_vec, const xml::Expr* expr_obj) {
+    const auto slot = cache_.insert({name, NULL});
+    if (!slot.second) EXC_RAISING(HSA_STATUS_ERROR, "expr-metric redefined '" << name << "'");
+    const Metric* created = new ExprMetric(name, counters_vec, expr_obj);
+    slot.first->second = created;
+    return created;
+  }
+
+#if 0
+  std::string UnAlias(const std::string& name) const {
+    auto it = aliases_.find(name);
+    return (it != aliases_.end()) ? it->second : name;
+  }
+#endif
+
  void Print() {
    for (auto& v : cache_) {
      const Metric* metric = v.second;
@@ -203,7 +274,11 @@ class MetricsDict {
  }

  xml::Xml* xml_;
+  const util::AgentInfo* agent_info_;
  cache_t cache_;
+#if 0
+  std::map<std::string, std::string> aliases_;
+#endif

  static map_t* map_;
  static mutex_t mutex_;
@@ -79,15 +79,16 @@ class Profile {
  Profile(const util::AgentInfo* agent_info) : agent_info_(agent_info) {
    profile_ = {};
    profile_.agent = agent_info->dev_id;
+    completion_signal_ = {};
    is_legacy_ = (strncmp(agent_info->name, "gfx8", 4) == 0);
  }
  virtual ~Profile() {
-    if (!info_vector_.empty()) {
-      info_vector_.clear();
-      hsa_memory_free(profile_.command_buffer.ptr);
-      hsa_memory_free(profile_.output_buffer.ptr);
-      free(const_cast<event_t*>(profile_.events));
-      free(const_cast<parameter_t*>(profile_.parameters));
+    info_vector_.clear();
+    if (profile_.command_buffer.ptr) hsa_memory_free(profile_.command_buffer.ptr);
+    if (profile_.output_buffer.ptr) hsa_memory_free(profile_.output_buffer.ptr);
+    if (profile_.events) free(const_cast<event_t*>(profile_.events));
+    if (profile_.parameters) free(const_cast<parameter_t*>(profile_.parameters));
+    if (completion_signal_.handle) {
      hsa_status_t status = hsa_signal_destroy(completion_signal_);
      if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "signal_destroy " << std::hex << status);
    }
@@ -26,12 +26,14 @@ POSSIBILITY OF SUCH DAMAGE.

 #include <dlfcn.h>
 #include <hsa.h>
+#include <hsa_ext_amd.h>
 #include <hsa_ext_finalize.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>

+#include <atomic>
 #include <cassert>
 #include <fstream>
 #include <iostream>
@@ -76,6 +78,7 @@ hsa_status_t HsaRsrcFactory::FindMemRegionsCallback(hsa_region_t region, void* d
 // Constructor of the class
 HsaRsrcFactory::HsaRsrcFactory() {
  // Initialize the Hsa Runtime
+  printf("HSA init\n");
  hsa_status_t status = hsa_init();
  CHECK_STATUS("Error in hsa_init", status);

@@ -100,6 +103,10 @@ HsaRsrcFactory::HsaRsrcFactory() {

 // Destructor of the class
 HsaRsrcFactory::~HsaRsrcFactory() {
+  for (auto p : cpu_list_) free(const_cast<AgentInfo*>(p));
+  for (auto p : gpu_list_) free(const_cast<AgentInfo*>(p));
+
+  printf("HSA shutdown\n");
  hsa_status_t status = hsa_shut_down();
  CHECK_STATUS("Error in hsa_shut_down", status);
 }
@@ -162,12 +169,15 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) {
    hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name);
    strncpy(agent_info->gfxip, agent_info->name, 4);
    agent_info->gfxip[4] = '\0';
-    agent_info->max_wave_size = 0;
    hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size);
-    agent_info->max_queue_size = 0;
    hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size);
-    agent_info->profile = hsa_profile_t(108);
    hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile);
+    agent_info->is_apu = (agent_info->profile == HSA_PROFILE_FULL) ? true : false;
+    hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT), &agent_info->cu_num);
+    hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU), &agent_info->waves_per_cu);
+    hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU), &agent_info->simds_per_cu);
+    hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES), &agent_info->se_num);
+    hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE), &agent_info->shader_arrays_per_se);

    // Initialize memory regions to zero
    agent_info->kernarg_region.handle = 0;
@@ -349,8 +359,8 @@ bool HsaRsrcFactory::TransferData(void* dest_buff, void* src_buff, uint32_t leng
 //
 // @return bool true if successful, false otherwise
 //
-bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path,
-                                     char* kernel_name, hsa_executable_symbol_t* code_desc) {
+void* HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path,
+                                      const char* kernel_name, hsa_executable_t* hsa_exec, hsa_executable_symbol_t* code_desc) {
  // Finalize the Hsail object into code object
  hsa_status_t status;
  hsa_code_object_t code_object;
@@ -364,52 +374,52 @@ bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* br
  if (!codeStream) {
    std::cerr << "Error: failed to load " << filename << std::endl;
    assert(false);
-    return false;
+    return NULL;
  }

  // Allocate memory to read in code object from file
  size_t size = std::string::size_type(codeStream.tellg());
-  char* codeBuff = (char*)AllocateSysMemory(agent_info, size);
-  if (!codeBuff) {
+  char* code_buf = (char*)AllocateSysMemory(agent_info, size);
+  if (!code_buf) {
    std::cerr << "Error: failed to allocate memory for code object." << std::endl;
    assert(false);
-    return false;
+    return NULL;
  }

  // Read the code object into allocated memory
  codeStream.seekg(0, std::ios::beg);
-  std::copy(std::istreambuf_iterator<char>(codeStream), std::istreambuf_iterator<char>(), codeBuff);
+  std::copy(std::istreambuf_iterator<char>(codeStream), std::istreambuf_iterator<char>(), code_buf);

  // De-Serialize the code object that has been read into memory
-  status = hsa_code_object_deserialize(codeBuff, size, NULL, &code_object);
+  status = hsa_code_object_deserialize(code_buf, size, NULL, &code_object);
  if (status != HSA_STATUS_SUCCESS) {
    std::cerr << "Failed to deserialize code object" << std::endl;
-    return false;
+    if (code_buf) hsa_memory_free(code_buf);
+    return NULL;
  }

  // Create executable.
-  hsa_executable_t hsaExecutable;
  status =
-      hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, "", &hsaExecutable);
+      hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, "", hsa_exec);
  CHECK_STATUS("Error in creating executable object", status);

  // Load code object.
-  status = hsa_executable_load_code_object(hsaExecutable, agent_info->dev_id, code_object, "");
+  status = hsa_executable_load_code_object(*hsa_exec, agent_info->dev_id, code_object, "");
  CHECK_STATUS("Error in loading executable object", status);

  // Freeze executable.
-  status = hsa_executable_freeze(hsaExecutable, "");
+  status = hsa_executable_freeze(*hsa_exec, "");
  CHECK_STATUS("Error in freezing executable object", status);

  // Get symbol handle.
  hsa_executable_symbol_t kernelSymbol;
-  status = hsa_executable_get_symbol(hsaExecutable, NULL, kernel_name, agent_info->dev_id, 0,
+  status = hsa_executable_get_symbol(*hsa_exec, NULL, kernel_name, agent_info->dev_id, 0,
                                     &kernelSymbol);
  CHECK_STATUS("Error in looking up kernel symbol", status);

  // Update output parameter
  *code_desc = kernelSymbol;
-  return true;
+  return code_buf;
 }

 // Print the various fields of Hsa Gpu Agents
@@ -423,13 +433,47 @@ bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) {

    std::clog << "> agent[" << idx << "] :" << std::endl;
    std::clog << ">> Name : " << agent_info->name << std::endl;
+    std::clog << ">> APU : " << agent_info->is_apu << std::endl;
+    std::clog << ">> HSAIL profile : " << agent_info->profile << std::endl;
    std::clog << ">> Max Wave Size : " << agent_info->max_wave_size << std::endl;
    std::clog << ">> Max Queue Size : " << agent_info->max_queue_size << std::endl;
    std::clog << ">> Kernarg Region Id : " << agent_info->coarse_region.handle << std::endl;
+    std::clog << ">> CU number : " << agent_info->cu_num << std::endl;
+    std::clog << ">> Waves per CU : " << agent_info->waves_per_cu << std::endl;
+    std::clog << ">> SIMDs per CU : " << agent_info->simds_per_cu << std::endl;
+    std::clog << ">> SE number : " << agent_info->se_num << std::endl;
+    std::clog << ">> Shader Arrays per SE : " << agent_info->shader_arrays_per_se << std::endl;
  }
  return true;
 }

+uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, void* packet) {
+  const uint32_t slot_size_b = 0x40;  // stride of one queue slot, in bytes
+  // Copies one packet into the next free slot of 'queue', rings the doorbell and returns the write index used.
+  // Advance the command queue: reserve the slot with a single atomic add, so that
+  // concurrent producers on the same queue can never be handed the same write index.
+  const uint64_t write_idx = hsa_queue_add_write_index_relaxed(queue, 1);
+  while ((write_idx - hsa_queue_load_read_index_relaxed(queue)) >= queue->size) {
+    sched_yield();
+  }
+
+  uint32_t slot_idx = (uint32_t)(write_idx % queue->size);
+  uint32_t* queue_slot = (uint32_t*)((uintptr_t)(queue->base_address) + (slot_idx * slot_size_b));
+  uint32_t* slot_data = (uint32_t*)packet;
+
+  // Copy buffered commands into the queue slot.
+  // Overwrite the AQL invalid header (first dword) last.
+  // This prevents the slot from being read until it's fully written.
+  memcpy(&queue_slot[1], &slot_data[1], slot_size_b - sizeof(uint32_t));
+  std::atomic<uint32_t>* header_atomic_ptr = reinterpret_cast<std::atomic<uint32_t>*>(&queue_slot[0]);
+  header_atomic_ptr->store(slot_data[0], std::memory_order_release);
+
+  // Ring the doorbell
+  hsa_signal_store_relaxed(queue->doorbell_signal, write_idx);
+
+  return write_idx;
+}
+
 HsaRsrcFactory* HsaRsrcFactory::instance_ = NULL;
 HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_;

@@ -67,6 +67,9 @@ struct AgentInfo {
  // Agent type - Cpu = 0, Gpu = 1 or Dsp = 2
  uint32_t dev_type;

+  // APU flag
+  bool is_apu;
+
  // Agent system index
  uint32_t dev_index;

@@ -90,6 +93,21 @@ struct AgentInfo {

  // Memory region supporting kernel arguments
  hsa_region_t kernarg_region;
+
+  // The number of compute unit available in the agent.
+  uint32_t cu_num;
+
+  // Maximum number of waves possible in a Compute Unit.
+  uint32_t waves_per_cu;
+
+  // Number of SIMD's per compute unit CU
+  uint32_t simds_per_cu;
+
+  // Number of Shader Engines (SE) in Gpu
+  uint32_t se_num;
+
+  // Number of Shader Arrays Per Shader Engines in Gpu
+  uint32_t shader_arrays_per_se;
 };

 class HsaRsrcFactory {
@@ -213,14 +231,17 @@ class HsaRsrcFactory {
  // @param code_desc Handle of finalized Code Descriptor that could
  // be used to submit for execution
  //
-  // @return bool true if successful, false otherwise
+  // @return code buffer, non NULL if successful, NULL otherwise
  //
-  bool LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, char* kernel_name,
-                       hsa_executable_symbol_t* code_desc);
+  void* LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, const char* kernel_name,
+                        hsa_executable_t* hsa_exec, hsa_executable_symbol_t* code_desc);

  // Print the various fields of Hsa Gpu Agents
  bool PrintGpuAgents(const std::string& header);

+  // Submit AQL packet to given queue
+  static uint64_t Submit(hsa_queue_t* queue, void* packet);
+
  // Return AqlProfile API table
  typedef hsa_ven_amd_aqlprofile_1_00_pfn_t aqlprofile_pfn_t;
  const aqlprofile_pfn_t* AqlProfileApi() const { return &aqlprofile_api_; }
@@ -24,6 +24,7 @@ class div_zero_exception_t : public exception_t {
 };

 typedef uint64_t args_t;
+static const args_t ARGS_MAX = UINT64_MAX;
 typedef std::map<std::string, args_t> args_map_t;
 class Expr;

@@ -177,14 +178,19 @@ class Expr {
  bool SubCheck() const { return (sub_count_ == 0); }
  unsigned FindOp() const {
    unsigned i = pos_;
+    unsigned open_n = 0;
    while (i < expr_.length()) {
      switch (Symb(i)) {
        case '*':
        case '/':
        case '+':
        case '-':
+          goto end;
        case '(':
+          ++open_n;
+          break;
        case ')':
+          if (open_n != 0) i += 1;
          goto end;
      }
      ++i;
@@ -263,6 +269,74 @@ class var_expr_t : public bin_expr_t {
  const std::string name_;
 };

+// Expression node over the indexed variables NAME[0] .. NAME[N-1]; derived classes supply Eval().
+class fun_expr_t : public bin_expr_t {
+ public:
+  typedef std::vector<var_expr_t> vvect_t;
+  // fname: function name printed by Symbol(); vname: variable base name; vnum: number of indexed instances.
+  fun_expr_t(const std::string& fname, const std::string& vname, const uint32_t& vnum)
+      : fname_(fname), vname_(vname) {
+    for (uint32_t i = 0; i < vnum; ++i) {
+      std::ostringstream var_full_name;
+      var_full_name << vname << "[" << i << "]";
+      vvect.push_back(var_expr_t(var_full_name.str()));
+    }
+  }
+  const vvect_t& GetVars() const { return vvect; }
+  // Renders "fname(vname,N)" from the stored base name, so no element access is needed when N == 0.
+  std::string Symbol() const {
+    std::ostringstream oss;
+    oss << fname_ << "(" << vname_ << "," << vvect.size() << ")";
+    return oss.str();
+  }
+ private:
+  const std::string fname_;
+  const std::string vname_;
+  vvect_t vvect;
+};
+class sum_expr_t : public fun_expr_t {  // sum(NAME,N): NAME[0] + ... + NAME[N-1]
+ public:
+  sum_expr_t(const std::string& vname, const uint32_t& vnum) : fun_expr_t("sum", vname, vnum) {}
+  args_t Eval(const args_cache_t& args) const {
+    args_t total = 0;
+    for (const var_expr_t& term : GetVars()) total += term.Eval(args);
+    return total;
+  }
+};
+class avr_expr_t : public fun_expr_t {  // avr(NAME,N): integer average of NAME[0] .. NAME[N-1]
+ public:
+  avr_expr_t(const std::string& vname, const uint32_t& vnum) : fun_expr_t("avr", vname, vnum) {}
+  args_t Eval(const args_cache_t& args) const {
+    args_t result = 0;
+    for (const auto& var : GetVars()) result += var.Eval(args);
+    return GetVars().empty() ? 0 : (result / GetVars().size());  // N == 0 yields 0 instead of dividing by zero
+  }
+};
+class min_expr_t : public fun_expr_t {  // min(NAME,N): smallest of NAME[0] .. NAME[N-1]
+ public:
+  min_expr_t(const std::string& vname, const uint32_t& vnum) : fun_expr_t("min", vname, vnum) {}
+  args_t Eval(const args_cache_t& args) const {
+    args_t lowest = ARGS_MAX;  // starting value; also the result when there are no instances
+    for (const var_expr_t& item : GetVars()) {
+      const args_t sample = item.Eval(args);
+      if (sample < lowest) lowest = sample;
+    }
+    return lowest;
+  }
+};
+class max_expr_t : public fun_expr_t {  // max(NAME,N): largest of NAME[0] .. NAME[N-1]
+ public:
+  max_expr_t(const std::string& vname, const uint32_t& vnum) : fun_expr_t("max", vname, vnum) {}
+  args_t Eval(const args_cache_t& args) const {
+    args_t highest = 0;  // starting value; also the result when there are no instances
+    for (const var_expr_t& item : GetVars()) {
+      const args_t sample = item.Eval(args);
+      if (sample > highest) highest = sample;
+    }
+    return highest;
+  }
+};
+
 inline const bin_expr_t* bin_expr_t::CreateExpr(const bin_expr_t* arg1, const bin_expr_t* arg2,
                                                const char op) {
  const bin_expr_t* expr = NULL;
@@ -285,11 +359,41 @@ inline const bin_expr_t* bin_expr_t::CreateExpr(const bin_expr_t* arg1, const bi

 inline const bin_expr_t* bin_expr_t::CreateArg(Expr* obj, const std::string str) {
  const bin_expr_t* arg = NULL;
+
  const unsigned i = strspn(str.c_str(), "1234567890");
  if (i == str.length()) {
    const unsigned value = atoi(str.c_str());
    arg = new const_expr_t(value);
-  } else {
+  }
+
+  if (arg == NULL) {
+    const std::size_t pos = str.find('(');
+    if (pos != std::string::npos) {
+      char* fname = NULL;
+      char* vname = NULL;
+      int vnum = 0;
+      int ret = sscanf(str.c_str(), "%m[a-zA-Z_](%m[0-9a-zA-Z_],%d)", &fname, &vname, &vnum);
+      if (ret == 3) {
+        const std::string fun_name(fname);
+        const fun_expr_t* farg = NULL;
+        if (fun_name == "sum") {
+          farg = new sum_expr_t(vname, vnum);
+        } else if (fun_name == "avr") {
+          farg = new avr_expr_t(vname, vnum);
+        } else if (fun_name == "min") {
+          farg = new min_expr_t(vname, vnum);
+        } else if (fun_name == "max") {
+          farg = new max_expr_t(vname, vnum);
+        }
+        if (farg) for (const auto& var : farg->GetVars()) obj->AddVar(var.Symbol());
+        arg = farg;
+      }
+      free(fname);
+      free(vname);
+    }
+  }
+
+  if (arg == NULL) {
    const std::string sub_expr = obj->Lookup(str);
    if (sub_expr.empty()) {
      arg = new var_expr_t(str);
@@ -299,6 +403,7 @@ inline const bin_expr_t* bin_expr_t::CreateArg(Expr* obj, const std::string str)
      arg = expr->GetTree();
    }
  }
+
  return arg;
 }

@@ -36,11 +36,29 @@ class Xml {
    return xml;
  }

+  // Appends a new node under 'full_tag' with "name" and "expr" options, so GetNodes(full_tag) returns it.
+  // The node's own tag is the part of 'full_tag' after the last '.', or all of it when there is no '.'.
+  void AddExpr(const std::string& full_tag, const std::string& name, const std::string& expr) {
+    const std::size_t dot = full_tag.rfind('.');
+    level_t* node = new level_t;
+    map_[full_tag].push_back(node);
+    node->tag = (dot == std::string::npos) ? full_tag : full_tag.substr(dot + 1);
+    node->opts["name"] = name;
+    node->opts["expr"] = expr;
+  }
+
+  void AddConst(const std::string& full_tag, const std::string& name, const uint64_t& val) {  // adds 'val' as an expr node
+    std::ostringstream digits;
+    digits << val;
+    AddExpr(full_tag, name, digits.str());
+  }
+
  static void Destroy(Xml *xml) { delete xml; }

  std::vector<level_t*> GetNodes(std::string global_tag) { return map_[global_tag]; }

  void Print() const {
+    std::cout << "XML file '" << file_name_ << "':" << std::endl;
    for (auto& elem : map_) {
      for (auto node : elem.second) {
        if (node->opts.size()) {
@@ -74,13 +92,14 @@ class Xml {
    while (1) {
      token_t token = (remainder.size()) ? remainder : NextToken();
      remainder.clear();
+
+//      token_t token1 = token;
+//      token1.push_back('\0');
+//      std::cout << "> " << &token1[0] << std::endl;
+
      // End of file
      if (token.size() == 0) break;

-      //      token_t token1 = token;
-      //      token1.push_back('\0');
-      //      std::cout << "> " << &token1[0] << std::endl;
-
      switch (state_) {
        case BODY_STATE:
          if (token[0] == '<') {
@@ -146,6 +165,11 @@ class Xml {

  ~Xml() {}

+  bool SpaceCheck() const {  // true when the current buffer symbol is a space or a tab
+    const char symb = buffer_[index_];
+    return (symb == ' ') || (symb == '\t');
+  }
+
  bool LineEndCheck() {
    bool found = false;
    if (buffer_[index_] == '\n') {
@@ -162,24 +186,55 @@ class Xml {

  token_t NextToken() {
    token_t token;
+    bool in_string = false;
+    bool special_symb = false;

    while (1) {
      if (data_size_ == 0) {
        data_size_ = read(fd_, buffer_, buf_size_);
        if (data_size_ <= 0) break;
      }
+
      if (token.empty())
-        while ((index_ < data_size_) && ((buffer_[index_] == ' ') || LineEndCheck())) {
+        while ((index_ < data_size_) && (SpaceCheck() || LineEndCheck())) {
          ++index_;
        }
-      while ((index_ < data_size_) && (buffer_[index_] != ' ') && !LineEndCheck()) {
-        token.push_back(buffer_[index_++]);
+      while ((index_ < data_size_) && (in_string || !(SpaceCheck() || LineEndCheck()))) {
+        const char symb = buffer_[index_];
+        bool skip_symb = false;
+
+        switch (symb) {
+          case '\\':
+            if (special_symb) special_symb = false;
+            else {
+              special_symb = true;
+              skip_symb = true;
+            }
+            break;
+          case '"':
+            if (special_symb) special_symb = false;
+            else {
+              in_string = !in_string;
+              if (!in_string) {
+                buffer_[index_] = ' ';
+                --index_;
+              }
+              skip_symb = true;
+            }
+            break;
+        }
+
+        if (!skip_symb) token.push_back(symb);
+        ++index_;
      }
+
      if (index_ == data_size_) {
        index_ = 0;
        data_size_ = 0;
-      } else
+      } else {
+        if (special_symb || in_string) BadFormat(token);
        break;
+      }
    }

    return token;
@@ -42,7 +42,7 @@ class TestAql {
  }

  TestAql* Test() { return test_; }
-  virtual AgentInfo* GetAgentInfo() { return (test_) ? test_->GetAgentInfo() : 0; }
+  virtual const AgentInfo* GetAgentInfo() { return (test_) ? test_->GetAgentInfo() : 0; }
  virtual hsa_queue_t* GetQueue() { return (test_) ? test_->GetQueue() : 0; }
  virtual HsaRsrcFactory* GetRsrcFactory() { return (test_) ? test_->GetRsrcFactory() : 0; }

@@ -34,7 +34,7 @@ OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "util/hsa_rsrc_factory.h"

 HsaRsrcFactory* TestHsa::hsa_rsrc_ = NULL;
-AgentInfo* TestHsa::agent_info_ = NULL;
+const AgentInfo* TestHsa::agent_info_ = NULL;
 hsa_queue_t* TestHsa::hsa_queue_ = NULL;
 uint32_t TestHsa::agent_id_ = 0;

@@ -43,7 +43,7 @@ HsaRsrcFactory* TestHsa::HsaInstantiate(const uint32_t agent_ind) {
  if (hsa_rsrc_ == NULL) {
    agent_id_ = agent_ind;

-    hsa_rsrc_ = HsaRsrcFactory::Create();
+    hsa_rsrc_ = HsaRsrcFactory::CreateInstance();

    // Print properties of the agents
    hsa_rsrc_->PrintGpuAgents("> GPU agents");
@@ -52,7 +52,7 @@ class TestHsa : public TestAql {
  }

  // Get methods for Agent Info, HAS queue, HSA Resourcse Manager
-  AgentInfo* GetAgentInfo() { return agent_info_; }
+  const AgentInfo* GetAgentInfo() { return agent_info_; }
  hsa_queue_t* GetQueue() { return hsa_queue_; }
  HsaRsrcFactory* GetRsrcFactory() { return hsa_rsrc_; }

@@ -115,7 +115,7 @@ class TestHsa : public TestAql {
  static uint32_t agent_id_;

  // Handle to an Hsa Gpu Agent
-  static AgentInfo* agent_info_;
+  static const AgentInfo* agent_info_;

  // Handle to an Hsa Queue
  static hsa_queue_t* hsa_queue_;
@@ -371,6 +371,7 @@ CONSTRUCTOR_API void constructor()
    fprintf(stderr, "Input file not found '%s'\n", xml_name);
    exit(1);
  }
+  xml->Print();

  // Getting metrics
  auto metrics_list = xml->GetNodes("top.metric");
@@ -8,11 +8,23 @@
  <metric name=SQ_INSTS_VALU block=SQ event=26 ></metric>
  <metric name=SQ_INSTS_VMEM_WR block=SQ event=27 ></metric>
  <metric name=SQ_INSTS_VMEM_RD block=SQ event=28 ></metric>
+  <metric name=SQ_INSTS_SALU block=SQ event=30 ></metric>
  <metric name=SQ_INSTS_SMEM block=SQ event=31 ></metric>
+  <metric name=SQ_INSTS_FLAT block=SQ event=32 ></metric>
  <metric name=SQ_INSTS_FLAT_LDS_ONLY block=SQ event=33 ></metric>
  <metric name=SQ_INSTS_LDS block=SQ event=34 ></metric>
+  <metric name=SQ_INSTS_GDS block=SQ event=35 ></metric>
  <metric name=SQ_WAVE_READY block=SQ event=47 ></metric>

+  <metric name=SQ_WAIT_INST_LDS block=SQ event=61 descr="Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. (per-simd, nondeterministic)"></metric>
+  <metric name=SQ_ACTIVE_INST_VALU block=SQ event=69 descr="Number of cycles the SQ instruction arbiter is working on a VALU instruction. (per-simd, nondeterministic)"></metric>
+  <metric name=SQ_INST_CYCLES_SALU block=SQ event=86 descr="Number of cycles needed to execute non-memory read scalar operations. (per-simd, emulated)"></metric>
+
+  <metric name=SQ_THREAD_CYCLES_VALU block=SQ event=89 ></metric>
+  <metric name=SQ_THREAD_CYCLES_VALU_MAX block=SQ event=90 ></metric>
+
+  <metric name=SQ_LDS_BANK_CONFLICT block=SQ event=97 descr="Number of cycles LDS is stalled by bank conflicts. (emulated)"></metric>
+
  <metric name=TA_BUSY block=TA event=15 ></metric>
  <metric name=TA_FLAT_READ_WAVEFRONTS block=TA event=101 ></metric>
  <metric name=TA_FLAT_WRITE_WAVEFRONTS block=TA event=102 ></metric>
@@ -22,37 +34,32 @@
  <metric name=TCC_HIT block=TCC event=18 ></metric>
  <metric name=TCC_MISS block=TCC event=19 ></metric>
  <metric name=TCC_WRITEBACK block=TCC event=22 ></metric>
+  <metric name=TCC_EA_WRREQ block=TCC event=26 ></metric>
+  <metric name=TCC_EA_WRREQ_64B block=TCC event=27 ></metric>
+  <metric name=TCC_EA_WRREQ_STALL block=TCC event=30 ></metric>
  <metric name=TCC_MC_RDREQ block=TCC event=35 ></metric>

+  <metric name=TCP_TA_DATA_STALL_CYCLES block=TCP event=3 descr="TCP stalls TA data interface. Now Windowed."></metric>
+
  <metric name=CPC_ALWAYS_COUNT block=CPC event=0 ></metric>
  <metric name=CPC_ME1_STALL_WAIT_ON_RCIU_READ block=CPC event=8 ></metric>

  # average for (16 instances x 4 shader engines)
-  <metric
-    name=TA_BUSY_avr
-    expr=(TA_BUSY[0]+TA_BUSY[1]+TA_BUSY[2]+TA_BUSY[3]+TA_BUSY[4]+TA_BUSY[5]+TA_BUSY[6]+TA_BUSY[7]+TA_BUSY[8]+TA_BUSY[9]+TA_BUSY[10]+TA_BUSY[11]+TA_BUSY[12]+TA_BUSY[13]+TA_BUSY[14]+TA_BUSY[15])/(16*4)
-  ></metric>
-  # sun for 16 instances
-  <metric
-    name=TA_FLAT_WRITE_WAVEFRONTS_sum
-    expr=TA_FLAT_WRITE_WAVEFRONTS[0]+TA_FLAT_WRITE_WAVEFRONTS[1]+TA_FLAT_WRITE_WAVEFRONTS[2]+TA_FLAT_WRITE_WAVEFRONTS[3]+TA_FLAT_WRITE_WAVEFRONTS[4]+TA_FLAT_WRITE_WAVEFRONTS[5]+TA_FLAT_WRITE_WAVEFRONTS[6]+TA_FLAT_WRITE_WAVEFRONTS[7]+TA_FLAT_WRITE_WAVEFRONTS[8]+TA_FLAT_WRITE_WAVEFRONTS[9]+TA_FLAT_WRITE_WAVEFRONTS[10]+TA_FLAT_WRITE_WAVEFRONTS[11]+TA_FLAT_WRITE_WAVEFRONTS[12]+TA_FLAT_WRITE_WAVEFRONTS[13]+TA_FLAT_WRITE_WAVEFRONTS[14]+TA_FLAT_WRITE_WAVEFRONTS[15]
-  ></metric>
-  <metric
-    name=TCC_HIT_sum
-    expr=TCC_HIT[0]+TCC_HIT[1]+TCC_HIT[2]+TCC_HIT[3]+TCC_HIT[4]+TCC_HIT[5]+TCC_HIT[6]+TCC_HIT[7]+TCC_HIT[8]+TCC_HIT[9]+TCC_HIT[10]+TCC_HIT[11]+TCC_HIT[12]+TCC_HIT[13]+TCC_HIT[14]+TCC_HIT[15]
-  ></metric>
-  <metric
-    name=TCC_MISS_sum
-    expr=TCC_MISS[0]+TCC_MISS[1]+TCC_MISS[2]+TCC_MISS[3]+TCC_MISS[4]+TCC_MISS[5]+TCC_MISS[6]+TCC_MISS[7]+TCC_MISS[8]+TCC_MISS[9]+TCC_MISS[10]+TCC_MISS[11]+TCC_MISS[12]+TCC_MISS[13]+TCC_MISS[14]+TCC_MISS[15]
-  ></metric>
-  <metric
-    name=TCC_MC_RDREQ_sum
-    expr=TCC_MC_RDREQ[0]+TCC_MC_RDREQ[1]+TCC_MC_RDREQ[2]+TCC_MC_RDREQ[3]+TCC_MC_RDREQ[4]+TCC_MC_RDREQ[5]+TCC_MC_RDREQ[6]+TCC_MC_RDREQ[7]+TCC_MC_RDREQ[8]+TCC_MC_RDREQ[9]+TCC_MC_RDREQ[10]+TCC_MC_RDREQ[11]+TCC_MC_RDREQ[12]+TCC_MC_RDREQ[13]+TCC_MC_RDREQ[14]+TCC_MC_RDREQ[15]
-  ></metric>
+  <metric name="TA_BUSY_avr" expr=avr(TA_BUSY,16)/4 ></metric>
+  # sum for 16 instances
+  <metric name="TA_FLAT_READ_WAVEFRONTS_sum" expr=sum(TA_FLAT_READ_WAVEFRONTS,16) ></metric>
+  <metric name="TA_FLAT_WRITE_WAVEFRONTS_sum" expr=sum(TA_FLAT_WRITE_WAVEFRONTS,16) ></metric>
+  <metric name="TCC_HIT_sum" expr=sum(TCC_HIT,16) ></metric>
+  <metric name="TCC_MISS_sum" expr=sum(TCC_MISS,16) ></metric>
+  <metric name="TCC_MC_RDREQ_sum" expr=sum(TCC_MC_RDREQ,16) ></metric>

  # FETCH_SIZE, kilobytes
  # The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account.
-  <metric name=FETCH_SIZE expr=(TCC_MC_RDREQ_sum*32)/1024 ></metric>
+  <metric
+    name="FETCH_SIZE"
+    expr=(TCC_MC_RDREQ_sum*32)/1024
+    descr="The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."
+  ></metric>
 </gfx8>

 <gfx9>
@@ -63,13 +70,25 @@
  <metric name=SQ_WAVES block=SQ event=4 ></metric>
  <metric name=SQ_ITEMS block=SQ event=14 ></metric>
  <metric name=SQ_INSTS_VALU block=SQ event=26 ></metric>
-  <metric name=SQ_INSTS_VMEM_WR block=SQ event=30 ></metric>
-  <metric name=SQ_INSTS_VMEM_RD block=SQ event=31 ></metric>
+  <metric name=SQ_INSTS_VMEM_WR block=SQ event=27 ></metric>
+  <metric name=SQ_INSTS_VMEM_RD block=SQ event=28 ></metric>
+  <metric name=SQ_INSTS_SALU block=SQ event=30 ></metric>
  <metric name=SQ_INSTS_SMEM block=SQ event=31 ></metric>
+  <metric name=SQ_INSTS_FLAT block=SQ event=32 ></metric>
  <metric name=SQ_INSTS_FLAT_LDS_ONLY block=SQ event=33 ></metric>
  <metric name=SQ_INSTS_LDS block=SQ event=34 ></metric>
+  <metric name=SQ_INSTS_GDS block=SQ event=35 ></metric>
  <metric name=SQ_WAVE_READY block=SQ event=47 ></metric>

+  <metric name=SQ_WAIT_INST_LDS block=SQ event=63 descr="Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. (per-simd, nondeterministic)"></metric>
+  <metric name=SQ_ACTIVE_INST_VALU block=SQ event=72 descr="regspec 71? Number of cycles the SQ instruction arbiter is working on a VALU instruction. (per-simd, nondeterministic)"></metric>
+  <metric name=SQ_INST_CYCLES_SALU block=SQ event=84 descr="Number of cycles needed to execute non-memory read scalar operations. (per-simd, emulated)"></metric>
+
+  <metric name=SQ_THREAD_CYCLES_VALU block=SQ event=85 ></metric>
+  <metric name=SQ_THREAD_CYCLES_VALU_MAX block=SQ event=86 ></metric>
+
+  <metric name=SQ_LDS_BANK_CONFLICT block=SQ event=93 descr="Number of cycles LDS is stalled by bank conflicts. (emulated)"></metric>
+
  <metric name=TA_BUSY block=TA event=15 ></metric>
  <metric name=TA_FLAT_READ_WAVEFRONTS block=TA event=101 ></metric>
  <metric name=TA_FLAT_WRITE_WAVEFRONTS block=TA event=102 ></metric>
@@ -86,60 +105,178 @@
  <metric name=CPC_ME1_STALL_WAIT_ON_RCIU_READ block=CPC event=8 ></metric>

  # average for (16 instances x 4 shader engines)
-  <metric
-    name=TA_BUSY_avr
-    expr=(TA_BUSY[0]+TA_BUSY[1]+TA_BUSY[2]+TA_BUSY[3]+TA_BUSY[4]+TA_BUSY[5]+TA_BUSY[6]+TA_BUSY[7]+TA_BUSY[8]+TA_BUSY[9]+TA_BUSY[10]+TA_BUSY[11]+TA_BUSY[12]+TA_BUSY[13]+TA_BUSY[14]+TA_BUSY[15])/(16*4)
-  ></metric>
+  <metric name="TA_BUSY_avr" expr=avr(TA_BUSY,16)/4 ></metric>
  # sum for 16 instances
-  <metric
-    name=TA_FLAT_WRITE_WAVEFRONTS_sum
-    expr=TA_FLAT_WRITE_WAVEFRONTS[0]+TA_FLAT_WRITE_WAVEFRONTS[1]+TA_FLAT_WRITE_WAVEFRONTS[2]+TA_FLAT_WRITE_WAVEFRONTS[3]+TA_FLAT_WRITE_WAVEFRONTS[4]+TA_FLAT_WRITE_WAVEFRONTS[5]+TA_FLAT_WRITE_WAVEFRONTS[6]+TA_FLAT_WRITE_WAVEFRONTS[7]+TA_FLAT_WRITE_WAVEFRONTS[8]+TA_FLAT_WRITE_WAVEFRONTS[9]+TA_FLAT_WRITE_WAVEFRONTS[10]+TA_FLAT_WRITE_WAVEFRONTS[11]+TA_FLAT_WRITE_WAVEFRONTS[12]+TA_FLAT_WRITE_WAVEFRONTS[13]+TA_FLAT_WRITE_WAVEFRONTS[14]+TA_FLAT_WRITE_WAVEFRONTS[15]
-  ></metric>
-
-  <metric
-    name=TCC_HIT_sum
-    expr=TCC_HIT[0]+TCC_HIT[1]+TCC_HIT[2]+TCC_HIT[3]+TCC_HIT[4]+TCC_HIT[5]+TCC_HIT[6]+TCC_HIT[7]+TCC_HIT[8]+TCC_HIT[9]+TCC_HIT[10]+TCC_HIT[11]+TCC_HIT[12]+TCC_HIT[13]+TCC_HIT[14]+TCC_HIT[15]
-  ></metric>
-  <metric
-    name=TCC_MISS_sum
-    expr=TCC_MISS[0]+TCC_MISS[1]+TCC_MISS[2]+TCC_MISS[3]+TCC_MISS[4]+TCC_MISS[5]+TCC_MISS[6]+TCC_MISS[7]+TCC_MISS[8]+TCC_MISS[9]+TCC_MISS[10]+TCC_MISS[11]+TCC_MISS[12]+TCC_MISS[13]+TCC_MISS[14]+TCC_MISS[15]
-  ></metric>
-  <metric
-    name=TCC_EA_RDREQ_sum
-    expr=TCC_EA_RDREQ[0]+TCC_EA_RDREQ[1]+TCC_EA_RDREQ[2]+TCC_EA_RDREQ[3]+TCC_EA_RDREQ[4]+TCC_EA_RDREQ[5]+TCC_EA_RDREQ[6]+TCC_EA_RDREQ[7]+TCC_EA_RDREQ[8]+TCC_EA_RDREQ[9]+TCC_EA_RDREQ[10]+TCC_EA_RDREQ[11]+TCC_EA_RDREQ[12]+TCC_EA_RDREQ[13]+TCC_EA_RDREQ[14]+TCC_EA_RDREQ[15]
-  ></metric>
-  <metric
-    name=TCC_EA_RDREQ_32B_sum
-    expr=TCC_EA_RDREQ_32B[0]+TCC_EA_RDREQ_32B[1]+TCC_EA_RDREQ_32B[2]+TCC_EA_RDREQ_32B[3]+TCC_EA_RDREQ_32B[4]+TCC_EA_RDREQ_32B[5]+TCC_EA_RDREQ_32B[6]+TCC_EA_RDREQ_32B[7]+TCC_EA_RDREQ_32B[8]+TCC_EA_RDREQ_32B[9]+TCC_EA_RDREQ_32B[10]+TCC_EA_RDREQ_32B[11]+TCC_EA_RDREQ_32B[12]+TCC_EA_RDREQ_32B[13]+TCC_EA_RDREQ_32B[14]+TCC_EA_RDREQ_32B[15]
-  ></metric>
+  <metric name="TA_FLAT_READ_WAVEFRONTS_sum" expr=sum(TA_FLAT_READ_WAVEFRONTS,16) ></metric>
+  <metric name="TA_FLAT_WRITE_WAVEFRONTS_sum" expr=sum(TA_FLAT_WRITE_WAVEFRONTS,16) ></metric>
+  <metric name=TCC_HIT_sum expr=sum(TCC_HIT,16) ></metric>
+  <metric name=TCC_MISS_sum expr=sum(TCC_MISS,16) ></metric>
+  <metric name=TCC_EA_RDREQ_sum expr=sum(TCC_EA_RDREQ,16) ></metric>
+  <metric name=TCC_EA_RDREQ_32B_sum expr=sum(TCC_EA_RDREQ_32B,16) ></metric>

  # FETCH_SIZE, kilobytes
  # The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account.
-  <metric name=FETCH_SIZE expr=((TCC_EA_RDREQ_sum-TCC_EA_RDREQ_32B_sum)*64+TCC_EA_RDREQ_32B_sum*32)/1024 ></metric>
+  <metric
+    name="FETCH_SIZE"
+    expr=((TCC_EA_RDREQ_sum-TCC_EA_RDREQ_32B_sum)*64+TCC_EA_RDREQ_32B_sum*32)/1024
+    descr="The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."
+  ></metric>
 </gfx9>

 <global>
  # GPU_BUSY, percentage
  # The percentage of time GPU was busy.
-  <metric name=GPU_BUSY expr=100*GRBM_GUI_ACTIVE/GRBM_COUNT ></metric>
+  <metric
+    name=GPU_BUSY
+    expr=100*GRBM_GUI_ACTIVE/GRBM_COUNT
+    descr="The percentage of time GPU was busy."
+  ></metric>

-  # MEM_BUSY, percentage
-  # The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled). This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: 0% to 100% (fetch-bound).
-  <metric name=MEM_BUSY expr=100*TA_BUSY_avr/GRBM_GUI_ACTIVE ></metric>
+  # Wavefronts      Total wavefronts.
+  <metric
+    name="Wavefronts"
+    expr=SQ_WAVES
+    descr="Total wavefronts."
+  ></metric>

-  # VWRITE_INSTS
-  # The average number of vector write instructions to the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that write to video memory.
-  <metric name=VWRITE_INSTS expr=(SQ_INSTS_VMEM_WR-TA_FLAT_WRITE_WAVEFRONTS_sum)/SQ_WAVES ></metric>
+  # VALUInsts       The average number of vector ALU instructions executed per work-item (affected by flow control).
+  <metric
+    name="VALUInsts"
+    expr=SQ_INSTS_VALU/SQ_WAVES
+    descr="The average number of vector ALU instructions executed per work-item (affected by flow control)."
+  ></metric>

-  # SFETCH_INSTS
-  # The average number of scalar fetch instructions from the video memory executed per work-item (affected by flow control).
-  <metric name=SFETCH_INSTS expr=SQ_INSTS_SMEM/SQ_WAVES ></metric>
+  # SALUInsts       The average number of scalar ALU instructions executed per work-item (affected by flow control).
+  <metric
+    name="SALUInsts"
+    expr=SQ_INSTS_SALU/SQ_WAVES
+    descr="The average number of scalar ALU instructions executed per work-item (affected by flow control)."
+  ></metric>
+  
+  # VFetchInsts     The average number of vector fetch instructions from the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that fetch from video memory.
+  <metric
+    name="VFetchInsts"
+    expr=(SQ_INSTS_VMEM_RD-TA_FLAT_READ_WAVEFRONTS_sum)/SQ_WAVES
+    descr="The average number of vector fetch instructions from the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that fetch from video memory."
+  ></metric>

-  # VALU_INSTS
-  # The average number of vector ALU instructions executed per work-item (affected by flow control).
-  <metric name=VALU_INSTS expr=SQ_INSTS_VALU/SQ_WAVES ></metric>
+  # SFetchInsts     The average number of scalar fetch instructions from the video memory executed per work-item (affected by flow control).
+  <metric
+    name="SFetchInsts"
+    expr=SQ_INSTS_SMEM/SQ_WAVES
+    descr="The average number of scalar fetch instructions from the video memory executed per work-item (affected by flow control)."
+  ></metric>
+
+  # VWriteInsts     The average number of vector write instructions to the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that write to video memory.
+  <metric
+    name=VWriteInsts
+    expr=(SQ_INSTS_VMEM_WR-TA_FLAT_WRITE_WAVEFRONTS_sum)/SQ_WAVES
+    descr="The average number of vector write instructions to the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that write to video memory."
+  ></metric>
+
+  # FlatVMemInsts   The average number of FLAT instructions that read from or write to the video memory executed per work item (affected by flow control). Includes FLAT instructions that read from or write to scratch.
+  <metric
+    name="FlatVMemInsts"
+    expr=(SQ_INSTS_FLAT-SQ_INSTS_FLAT_LDS_ONLY)/SQ_WAVES
+    descr="The average number of FLAT instructions that read from or write to the video memory executed per work item (affected by flow control). Includes FLAT instructions that read from or write to scratch."
+  ></metric>
+
+  # LDSInsts        The average number of LDS read or LDS write instructions executed per work item (affected by flow control).  Excludes FLAT instructions that read from or write to LDS.
+  <metric
+    name="LDSInsts"
+    expr=(SQ_INSTS_LDS-SQ_INSTS_FLAT_LDS_ONLY)/SQ_WAVES
+    descr="The average number of LDS read or LDS write instructions executed per work item (affected by flow control).  Excludes FLAT instructions that read from or write to LDS."
+  ></metric>
+
+  # FlatLDSInsts    The average number of FLAT instructions that read or write to LDS executed per work item (affected by flow control).
+  <metric
+    name="FlatLDSInsts"
+    expr=SQ_INSTS_FLAT_LDS_ONLY/SQ_WAVES
+    descr="The average number of FLAT instructions that read or write to LDS executed per work item (affected by flow control)."
+  ></metric>
+
+  # GDSInsts        The average number of GDS read or GDS write instructions executed per work item (affected by flow control).
+  <metric
+    name="GDSInsts"
+    expr=SQ_INSTS_GDS/SQ_WAVES
+    descr="The average number of GDS read or GDS write instructions executed per work item (affected by flow control)."
+  ></metric>
+
+  # VALUUtilization The percentage of active vector ALU threads in a wave. A lower number can mean either more thread divergence in a wave or that the work-group size is not a multiple of 64. Value range: 0% (bad), 100% (ideal - no thread divergence).
+  <metric
+    name="VALUUtilization"
+    expr=100*SQ_THREAD_CYCLES_VALU/(SQ_ACTIVE_INST_VALU*64)
+    descr="The percentage of active vector ALU threads in a wave. A lower number can mean either more thread divergence in a wave or that the work-group size is not a multiple of 64. Value range: 0% (bad), 100% (ideal - no thread divergence)."
+  ></metric>
+
+  # VALUBusy        The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal).
+  <metric
+    name="VALUBusy"
+    expr=100*SQ_ACTIVE_INST_VALU*4/NUM_SIMDS/GRBM_GUI_ACTIVE
+    descr="The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal)."
+  ></metric>
+
+  # SALUBusy        The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad) to 100% (optimal).
+  <metric
+    name="SALUBusy"
+    expr=100*SQ_INST_CYCLES_SALU*4/(NUM_SIMDS/NUM_SHADER_ENGINES)/GRBM_GUI_ACTIVE
+    descr="The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad) to 100% (optimal)."
+  ></metric>
+
+  # FetchSize       The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account.
+  <metric
+    name="FetchSize"
+    expr=FETCH_SIZE
+    descr="The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."
+  ></metric>
+
+  # WriteSize       The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account.
+  <metric
+    name="WriteSize"
+    expr=((sum(TCC_EA_WRREQ,16)-sum(TCC_EA_WRREQ_64B,16))*32+sum(TCC_EA_WRREQ_64B,16)*64)/1024
+    descr="The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."
+  ></metric>
+
+  # L2CacheHit      The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal).
+  <metric
+    name="L2CacheHit"
+    expr=100*sum(TCC_HIT,16)/(sum(TCC_HIT,16)+sum(TCC_MISS,16))
+    descr="The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal)."
+  ></metric>
+
+  # MemUnitBusy     The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled). This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: 0% to 100% (fetch-bound).
+  <metric
+    name="MemUnitBusy"
+    expr=100*max(TA_BUSY,16)/GRBM_GUI_ACTIVE/NUM_SHADER_ENGINES
+    descr="The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled). This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: 0% to 100% (fetch-bound)."
+  ></metric>
+
+  # MemUnitStalled  The percentage of GPUTime the memory unit is stalled. Try reducing the number or size of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad).
+  <metric
+    name="MemUnitStalled"
+    expr=100*max(TCP_TA_DATA_STALL_CYCLES,16)/GRBM_GUI_ACTIVE/NUM_SHADER_ENGINES
+    descr="The percentage of GPUTime the memory unit is stalled. Try reducing the number or size of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad)."
+  ></metric>
+
+  # WriteUnitStalled The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad).
+  <metric
+    name="WriteUnitStalled"
+    expr=100*max(TCC_EA_WRREQ_STALL,16)/GRBM_GUI_ACTIVE
+    descr="The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad)."
+  ></metric>
+
+  # ALUStalledByLDS The percentage of GPUTime ALU units are stalled by the LDS input queue being full or the output queue being not ready. If there are LDS bank conflicts, reduce them. Otherwise, try reducing the number of LDS accesses if possible. Value range: 0% (optimal) to 100% (bad).
+  <metric
+    name="ALUStalledByLDS"
+    expr=100*SQ_WAIT_INST_LDS/SQ_WAVES/GRBM_GUI_ACTIVE/NUM_SHADER_ENGINES
+    descr="The percentage of GPUTime ALU units are stalled by the LDS input queue being full or the output queue being not ready. If there are LDS bank conflicts, reduce them. Otherwise, try reducing the number of LDS accesses if possible. Value range: 0% (optimal) to 100% (bad)."
+  ></metric>
+
+  # LDSBankConflict The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad).
+  <metric
+    name="LDSBankConflict"
+    expr=100*SQ_LDS_BANK_CONFLICT/GRBM_GUI_ACTIVE/NUM_SIMDS
+    descr="The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad)."
+  ></metric>

-  # L2CACHE_HIT, percentage
-  # The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal).
-  <metric name=L2CACHE_HIT expr=100*TCC_HIT_sum/(TCC_HIT_sum+TCC_MISS_sum) ></metric>
 </global>
@@ -21,7 +21,7 @@ export ROCP_METRICS=metrics.xml
 export ROCP_INPUT=input.xml
 # output directory for the tool library, for metrics results file 'results.txt'
 # and SQTT trace files 'thread_trace.se<n>.out'
-#export ROCP_OUTPUT_DIR=./
+export ROCP_OUTPUT_DIR=./RESULTS

 if [ -n "$1" ] ; then
  tbin="$*"
@@ -26,6 +26,7 @@ POSSIBILITY OF SUCH DAMAGE.

 #include <dlfcn.h>
 #include <hsa.h>
+#include <hsa_ext_amd.h>
 #include <hsa_ext_finalize.h>
 #include <stdint.h>
 #include <stdio.h>
@@ -39,8 +40,17 @@ POSSIBILITY OF SUCH DAMAGE.
 #include <string>
 #include <vector>

+// Callback for hsa_iterate_agents: registers every agent available in the
+// system with the factory passed through 'data'.
+//
+// @param agent Agent handle supplied by the HSA runtime for this iteration
+//
+// @param data Opaque pointer, expected to be the HsaRsrcFactory instance
+//
+// @return hsa_status_t HSA_STATUS_SUCCESS to continue the iteration, an error
+// status to stop it
+//
+hsa_status_t HsaRsrcFactory::GetHsaAgentsCallback(hsa_agent_t agent, void* data) {
+  HsaRsrcFactory* hsa_rsrc = reinterpret_cast<HsaRsrcFactory*>(data);
+  if (hsa_rsrc->AddAgentInfo(agent) != NULL) return HSA_STATUS_SUCCESS;
+
+  // AddAgentInfo() registers only CPU and GPU agents and returns NULL for any
+  // other device type (e.g. DSP). Such agents are skipped rather than reported
+  // as an error, because a non-success status stops the agent iteration.
+  hsa_device_type_t type;
+  const hsa_status_t status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type);
+  if (status != HSA_STATUS_SUCCESS) return status;
+
+  const bool tracked = (type == HSA_DEVICE_TYPE_CPU) || (type == HSA_DEVICE_TYPE_GPU);
+  return tracked ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS;
+}
+
 // Callback function to find and bind kernarg region of an agent
-static hsa_status_t FindMemRegionsCallback(hsa_region_t region, void* data) {
+hsa_status_t HsaRsrcFactory::FindMemRegionsCallback(hsa_region_t region, void* data) {
  hsa_region_global_flag_t flags;
  hsa_region_segment_t segment_id;

@@ -62,53 +72,6 @@ static hsa_status_t FindMemRegionsCallback(hsa_region_t region, void* data) {
  return HSA_STATUS_SUCCESS;
 }

-// Callback function to get the number of agents
-static hsa_status_t GetHsaAgentsCallback(hsa_agent_t agent, void* data) {
-  // Copy handle of agent and increment number of agents reported
-  HsaRsrcFactory* rsrcFactory = reinterpret_cast<HsaRsrcFactory*>(data);
-
-  // Determine if device is a Gpu agent
-  hsa_status_t status;
-  hsa_device_type_t type;
-  status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type);
-  CHECK_STATUS("Error Calling hsa_agent_get_info", status);
-  if (type == HSA_DEVICE_TYPE_DSP) {
-    return HSA_STATUS_SUCCESS;
-  }
-
-  if (type == HSA_DEVICE_TYPE_CPU) {
-    AgentInfo* agent_info = reinterpret_cast<AgentInfo*>(malloc(sizeof(AgentInfo)));
-    agent_info->dev_id = agent;
-    agent_info->dev_type = HSA_DEVICE_TYPE_CPU;
-    rsrcFactory->AddAgentInfo(agent_info, false);
-    return HSA_STATUS_SUCCESS;
-  }
-
-  // Device is a Gpu agent, build an instance of AgentInfo
-  AgentInfo* agent_info = reinterpret_cast<AgentInfo*>(malloc(sizeof(AgentInfo)));
-  agent_info->dev_id = agent;
-  agent_info->dev_type = HSA_DEVICE_TYPE_GPU;
-  hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name);
-  agent_info->max_wave_size = 0;
-  hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size);
-  agent_info->max_queue_size = 0;
-  hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size);
-  agent_info->profile = hsa_profile_t(108);
-  hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile);
-
-  // Initialize memory regions to zero
-  agent_info->kernarg_region.handle = 0;
-  agent_info->coarse_region.handle = 0;
-
-  // Find and Bind Memory regions of the Gpu agent
-  hsa_agent_iterate_regions(agent, FindMemRegionsCallback, agent_info);
-
-  // Save the instance of AgentInfo
-  rsrcFactory->AddAgentInfo(agent_info, true);
-
-  return HSA_STATUS_SUCCESS;
-}
-
 // Constructor of the class
 HsaRsrcFactory::HsaRsrcFactory() {
  // Initialize the Hsa Runtime
@@ -128,12 +91,17 @@ HsaRsrcFactory::HsaRsrcFactory() {
  status = hsa_system_get_extension_table(HSA_EXTENSION_AMD_AQLPROFILE, 1, 0, &aqlprofile_api_);
 #endif
  CHECK_STATUS("aqlprofile API table load failed", status);
+
+  // Get Loader API table
+  loader_api_ = {0};
+  status = hsa_system_get_extension_table(HSA_EXTENSION_AMD_LOADER, 1, 0, &loader_api_);
+  CHECK_STATUS("loader API table query failed", status);
 }

 // Destructor of the class
 HsaRsrcFactory::~HsaRsrcFactory() {
-  for (auto p : cpu_list_) free(p);
-  for (auto p : gpu_list_) free(p);
+  for (auto p : cpu_list_) free(const_cast<AgentInfo*>(p));
+  for (auto p : gpu_list_) free(const_cast<AgentInfo*>(p));

  printf("HSA shutdown\n");
  hsa_status_t status = hsa_shut_down();
@@ -173,6 +141,68 @@ hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) {
  return HSA_STATUS_SUCCESS;
 }

+// Add system agent info
+//
+// Queries the device type of 'agent' and, for CPU and GPU agents, builds a
+// zero-initialized AgentInfo record, appends it to cpu_list_ / gpu_list_ and
+// indexes it by the agent handle in agent_map_. For GPU agents the name, GFXIP
+// prefix, wave/queue limits, profile, compute topology counters and memory
+// regions are filled in as well.
+//
+// @param agent HSA agent handle to register
+//
+// @return const AgentInfo* The registered record, or NULL if the agent is
+// neither a CPU nor a GPU, or if the record could not be allocated
+//
+const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) {
+  // Determine the device type of the agent
+  hsa_device_type_t type;
+  hsa_status_t status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type);
+  CHECK_STATUS("Error Calling hsa_agent_get_info", status);
+
+  // Only CPU and GPU agents are tracked
+  if ((type != HSA_DEVICE_TYPE_CPU) && (type != HSA_DEVICE_TYPE_GPU)) return NULL;
+
+  // The records are released with free() in ~HsaRsrcFactory(), so they must
+  // come from the malloc family rather than from 'new'. calloc() zero-fills
+  // the record, so every field not set below reads as zero.
+  AgentInfo* agent_info = reinterpret_cast<AgentInfo*>(calloc(1, sizeof(AgentInfo)));
+  if (agent_info == NULL) return NULL;
+  agent_info->dev_id = agent;
+
+  if (type == HSA_DEVICE_TYPE_CPU) {
+    agent_info->dev_type = HSA_DEVICE_TYPE_CPU;
+
+    // Set CPU index
+    agent_info->dev_index = cpu_list_.size();
+    cpu_list_.push_back(agent_info);
+  } else {
+    agent_info->dev_type = HSA_DEVICE_TYPE_GPU;
+    hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name);
+    // GFXIP name is the first four characters of the agent name
+    strncpy(agent_info->gfxip, agent_info->name, 4);
+    agent_info->gfxip[4] = '\0';
+    hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size);
+    hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size);
+    hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile);
+    agent_info->is_apu = (agent_info->profile == HSA_PROFILE_FULL);
+    hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT), &agent_info->cu_num);
+    hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU), &agent_info->waves_per_cu);
+    hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU), &agent_info->simds_per_cu);
+    hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES), &agent_info->se_num);
+    hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE), &agent_info->shader_arrays_per_se);
+
+    // Initialize memory regions to zero
+    agent_info->kernarg_region.handle = 0;
+    agent_info->coarse_region.handle = 0;
+
+    // Find and Bind Memory regions of the Gpu agent
+    hsa_agent_iterate_regions(agent, FindMemRegionsCallback, agent_info);
+
+    // Set GPU index
+    agent_info->dev_index = gpu_list_.size();
+    gpu_list_.push_back(agent_info);
+  }
+
+  // Make the record reachable by agent handle
+  agent_map_[agent.handle] = agent_info;
+
+  return agent_info;
+}
+
+// Return system agent info
+//
+// Looks the agent up by its handle in agent_map_.
+//
+// @param agent HSA agent handle to look up
+//
+// @return const AgentInfo* The record stored for the handle, NULL if there is none
+//
+const AgentInfo* HsaRsrcFactory::GetAgentInfo(const hsa_agent_t agent) {
+  const auto entry = agent_map_.find(agent.handle);
+  if (entry == agent_map_.end()) return NULL;
+  return entry->second;
+}
+
 // Get the count of Hsa Gpu Agents available on the platform
 //
 // @return uint32_t Number of Gpu agents on platform
@@ -193,7 +223,7 @@ uint32_t HsaRsrcFactory::GetCountOfCpuAgents() { return uint32_t(cpu_list_.size(
 //
 // @return bool true if successful, false otherwise
 //
-bool HsaRsrcFactory::GetGpuAgentInfo(uint32_t idx, AgentInfo** agent_info) {
+bool HsaRsrcFactory::GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) {
  // Determine if request is valid
  uint32_t size = uint32_t(gpu_list_.size());
  if (idx >= size) {
@@ -202,6 +232,7 @@ bool HsaRsrcFactory::GetGpuAgentInfo(uint32_t idx, AgentInfo** agent_info) {

  // Copy AgentInfo from specified index
  *agent_info = gpu_list_[idx];
+
  return true;
 }

@@ -213,7 +244,7 @@ bool HsaRsrcFactory::GetGpuAgentInfo(uint32_t idx, AgentInfo** agent_info) {
 //
 // @return bool true if successful, false otherwise
 //
-bool HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, AgentInfo** agent_info) {
+bool HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) {
  // Determine if request is valid
  uint32_t size = uint32_t(cpu_list_.size());
  if (idx >= size) {
@@ -236,7 +267,8 @@ bool HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, AgentInfo** agent_info) {
 //
 // @return bool true if successful, false otherwise
 //
-bool HsaRsrcFactory::CreateQueue(AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue) {
+bool HsaRsrcFactory::CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts,
+                                 hsa_queue_t** queue) {
  hsa_status_t status;
  status = hsa_queue_create(agent_info->dev_id, num_pkts, HSA_QUEUE_TYPE_MULTI, NULL, NULL,
                            UINT32_MAX, UINT32_MAX, queue);
@@ -324,7 +356,7 @@ bool HsaRsrcFactory::TransferData(void* dest_buff, void* src_buff, uint32_t leng
 //
 // @return bool true if successful, false otherwise
 //
-void* HsaRsrcFactory::LoadAndFinalize(AgentInfo* agent_info, const char* brig_path,
+void* HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path,
                                      const char* kernel_name, hsa_executable_t* hsa_exec, hsa_executable_symbol_t* code_desc) {
  // Finalize the Hsail object into code object
  hsa_status_t status;
@@ -387,32 +419,27 @@ void* HsaRsrcFactory::LoadAndFinalize(AgentInfo* agent_info, const char* brig_pa
  return code_buf;
 }

-// Add an instance of AgentInfo representing a Hsa Gpu agent
-void HsaRsrcFactory::AddAgentInfo(AgentInfo* agent_info, bool gpu) {
-  // Add input to Gpu list
-  if (gpu) {
-    gpu_list_.push_back(agent_info);
-    return;
-  }
-
-  // Add input to Cpu list
-  cpu_list_.push_back(agent_info);
-}
-
 // Print the various fields of Hsa Gpu Agents
 bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) {
  std::clog << header << " :" << std::endl;

-  AgentInfo* agent_info;
+  const AgentInfo* agent_info;
  int size = uint32_t(gpu_list_.size());
  for (int idx = 0; idx < size; idx++) {
    agent_info = gpu_list_[idx];

    std::clog << "> agent[" << idx << "] :" << std::endl;
    std::clog << ">> Name : " << agent_info->name << std::endl;
+    std::clog << ">> APU : " << agent_info->is_apu << std::endl;
+    std::clog << ">> HSAIL profile : " << agent_info->profile << std::endl;
    std::clog << ">> Max Wave Size : " << agent_info->max_wave_size << std::endl;
    std::clog << ">> Max Queue Size : " << agent_info->max_queue_size << std::endl;
    std::clog << ">> Kernarg Region Id : " << agent_info->coarse_region.handle << std::endl;
+    std::clog << ">> CU number : " << agent_info->cu_num << std::endl;
+    std::clog << ">> Waves per CU : " << agent_info->waves_per_cu << std::endl;
+    std::clog << ">> SIMDs per CU : " << agent_info->simds_per_cu << std::endl;
+    std::clog << ">> SE number : " << agent_info->se_num << std::endl;
+    std::clog << ">> Shader Arrays per SE : " << agent_info->shader_arrays_per_se << std::endl;
  }
  return true;
 }
@@ -28,6 +28,7 @@ POSSIBILITY OF SUCH DAMAGE.
 #include <hsa.h>
 #include <hsa_ext_finalize.h>
 #include <hsa_ven_amd_aqlprofile.h>
+#include <hsa_ven_amd_loader.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -35,6 +36,7 @@ POSSIBILITY OF SUCH DAMAGE.

 #include <iostream>
 #include <mutex>
+#include <map>
 #include <string>
 #include <vector>

@@ -52,6 +54,7 @@ POSSIBILITY OF SUCH DAMAGE.

 static const unsigned MEM_PAGE_BYTES = 0x1000;
 static const unsigned MEM_PAGE_MASK = MEM_PAGE_BYTES - 1;
+typedef decltype(hsa_agent_t::handle) hsa_agent_handle_t;

 // Encapsulates information about a Hsa Agent such as its
 // handle, name, max queue size, max wavefront size, etc.
@@ -62,6 +65,15 @@ struct AgentInfo {
  // Agent type - Cpu = 0, Gpu = 1 or Dsp = 2
  uint32_t dev_type;

+  // APU flag
+  bool is_apu;
+
+  // Agent system index
+  uint32_t dev_index;
+
+  // GFXIP name
+  char gfxip[64];
+
  // Name of Agent whose length is less than 64
  char name[64];

@@ -79,31 +91,52 @@ struct AgentInfo {

  // Memory region supporting kernel arguments
  hsa_region_t kernarg_region;
+
+  // The number of compute units available in the agent.
+  uint32_t cu_num;
+
+  // Maximum number of waves possible in a Compute Unit.
+  uint32_t waves_per_cu;
+
+  // Number of SIMDs per compute unit (CU)
+  uint32_t simds_per_cu;
+
+  // Number of Shader Engines (SE) in Gpu
+  uint32_t se_num;
+
+  // Number of Shader Arrays per Shader Engine in Gpu
+  uint32_t shader_arrays_per_se;
 };

 class HsaRsrcFactory {
 public:
  typedef std::recursive_mutex mutex_t;

-  static HsaRsrcFactory* Create() {
+  static HsaRsrcFactory* Create() { return NULL; }
+
+  static HsaRsrcFactory* CreateInstance() {
    std::lock_guard<mutex_t> lck(mutex_);
-    if (HsaRsrcFactory::instance_ == NULL) {
-      HsaRsrcFactory::instance_ = new HsaRsrcFactory();
+    if (instance_ == NULL) {
+      instance_ = new HsaRsrcFactory();
    }
    return instance_;
  }

+  // Return a reference to the factory instance; CreateInstance() is called
+  // first, so the instance is created here if it does not exist yet.
+  static HsaRsrcFactory& Instance() {
+    CreateInstance();
+    // Report through CHECK_STATUS if the instance is still missing
+    hsa_status_t status = HSA_STATUS_ERROR;
+    if (instance_ != NULL) status = HSA_STATUS_SUCCESS;
+    CHECK_STATUS("HsaRsrcFactory::Instance() is not found", status);
+    return *instance_;
+  }
+
  static void Destroy() {
    std::lock_guard<mutex_t> lck(mutex_);
    if (instance_) delete instance_;
    instance_ = NULL;
  }

-  static HsaRsrcFactory& Instance() {
-    hsa_status_t status = (instance_ != NULL) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
-    CHECK_STATUS("HsaRsrcFactory::Instance()", status);
-    return *instance_;
-  }
+  // Return system agent info
+  const AgentInfo* GetAgentInfo(const hsa_agent_t agent);

  // Get the count of Hsa Gpu Agents available on the platform
  //
@@ -125,7 +158,7 @@ class HsaRsrcFactory {
  //
  // @return bool true if successful, false otherwise
  //
-  bool GetGpuAgentInfo(uint32_t idx, AgentInfo** agent_info);
+  bool GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info);

  // Get the AgentInfo handle of a Cpu device
  //
@@ -135,7 +168,7 @@ class HsaRsrcFactory {
  //
  // @return bool true if successful, false otherwise
  //
-  bool GetCpuAgentInfo(uint32_t idx, AgentInfo** agent_info);
+  bool GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info);

  // Create a Queue object and return its handle. The queue object is expected
  // to support user requested number of Aql dispatch packets.
@@ -148,7 +181,7 @@ class HsaRsrcFactory {
  //
  // @return bool true if successful, false otherwise
  //
-  bool CreateQueue(AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue);
+  bool CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue);

  // Create a Signal object and return its handle.
  //
@@ -198,12 +231,9 @@ class HsaRsrcFactory {
  //
  // @return code buffer, non NULL if successful, NULL otherwise
  //
-  void* LoadAndFinalize(AgentInfo* agent_info, const char* brig_path, const char* kernel_name,
+  void* LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, const char* kernel_name,
                        hsa_executable_t* hsa_exec, hsa_executable_symbol_t* code_desc);

-  // Add an instance of AgentInfo representing a Hsa Gpu agent
-  void AddAgentInfo(AgentInfo* agent_info, bool gpu);
-
  // Print the various fields of Hsa Gpu Agents
  bool PrintGpuAgents(const std::string& header);

@@ -214,7 +244,16 @@ class HsaRsrcFactory {
  typedef hsa_ven_amd_aqlprofile_1_00_pfn_t aqlprofile_pfn_t;
  const aqlprofile_pfn_t* AqlProfileApi() const { return &aqlprofile_api_; }

+  // Return Loader API table
+  const hsa_ven_amd_loader_1_00_pfn_t* LoaderApi() const { return &loader_api_; }
+
 private:
+  // System agents iterating callback
+  static hsa_status_t GetHsaAgentsCallback(hsa_agent_t agent, void* data);
+
+  // Callback function to find and bind kernarg region of an agent
+  static hsa_status_t FindMemRegionsCallback(hsa_region_t region, void* data);
+
  // Load AQL profile HSA extension library directly
  static hsa_status_t LoadAqlProfileLib(aqlprofile_pfn_t* api);

@@ -225,17 +264,26 @@ class HsaRsrcFactory {
  // Destructor of the class
  ~HsaRsrcFactory();

+  // Add an instance of AgentInfo representing a Hsa Gpu agent
+  const AgentInfo* AddAgentInfo(const hsa_agent_t agent);
+
  static HsaRsrcFactory* instance_;
  static mutex_t mutex_;

  // Used to maintain a list of Hsa Gpu Agent Info
-  std::vector<AgentInfo*> gpu_list_;
+  std::vector<const AgentInfo*> gpu_list_;

  // Used to maintain a list of Hsa Cpu Agent Info
-  std::vector<AgentInfo*> cpu_list_;
+  std::vector<const AgentInfo*> cpu_list_;
+
+  // System agents map
+  std::map<hsa_agent_handle_t, const AgentInfo*> agent_map_;

  // AqlProfile API table
  aqlprofile_pfn_t aqlprofile_api_;
+
+  // Loader API table
+  hsa_ven_amd_loader_1_00_pfn_t loader_api_;
 };

 #endif  // TEST_UTIL_HSA_RSRC_FACTORY_H_
@@ -41,6 +41,7 @@ class Xml {
  std::vector<level_t*> GetNodes(std::string global_tag) { return map_[global_tag]; }

  void Print() const {
+    std::cout << "XML file '" << file_name_ << "':" << std::endl;
    for (auto& elem : map_) {
      for (auto node : elem.second) {
        if (node->opts.size()) {
@@ -74,13 +75,14 @@ class Xml {
    while (1) {
      token_t token = (remainder.size()) ? remainder : NextToken();
      remainder.clear();
+
+//      token_t token1 = token;
+//      token1.push_back('\0');
+//      std::cout << "> " << &token1[0] << std::endl;
+
      // End of file
      if (token.size() == 0) break;

-      //      token_t token1 = token;
-      //      token1.push_back('\0');
-      //      std::cout << "> " << &token1[0] << std::endl;
-
      switch (state_) {
        case BODY_STATE:
          if (token[0] == '<') {
@@ -146,6 +148,11 @@ class Xml {

  ~Xml() {}

+  // True if the character at the current buffer position is a blank (space or horizontal tab)
+  bool SpaceCheck() const {
+    return (buffer_[index_] == ' ') || (buffer_[index_] == '\t');
+  }
+
  bool LineEndCheck() {
    bool found = false;
    if (buffer_[index_] == '\n') {
@@ -162,24 +169,55 @@ class Xml {

  token_t NextToken() {
    token_t token;
+    bool in_string = false;
+    bool special_symb = false;

    while (1) {
      if (data_size_ == 0) {
        data_size_ = read(fd_, buffer_, buf_size_);
        if (data_size_ <= 0) break;
      }
+
      if (token.empty())
-        while ((index_ < data_size_) && ((buffer_[index_] == ' ') || LineEndCheck())) {
+        while ((index_ < data_size_) && (SpaceCheck() || LineEndCheck())) {
          ++index_;
        }
-      while ((index_ < data_size_) && (buffer_[index_] != ' ') && !LineEndCheck()) {
-        token.push_back(buffer_[index_++]);
+      while ((index_ < data_size_) && (in_string || !(SpaceCheck() || LineEndCheck()))) {
+        const char symb = buffer_[index_];
+        bool skip_symb = false;
+
+        switch (symb) {
+          case '\\':
+            if (special_symb) special_symb = false;
+            else {
+              special_symb = true;
+              skip_symb = true;
+            }
+            break;
+          case '"':
+            if (special_symb) special_symb = false;
+            else {
+              in_string = !in_string;
+              if (!in_string) {
+                buffer_[index_] = ' ';
+                --index_;
+              }
+              skip_symb = true;
+            }
+            break;
+        }
+
+        if (!skip_symb) token.push_back(symb);
+        ++index_;
      }
+
      if (index_ == data_size_) {
        index_ = 0;
        data_size_ = 0;
-      } else
+      } else {
+        if (special_symb || in_string) BadFormat(token);
        break;
+      }
    }

    return token;