profiling control options

[ROCm/rocprofiler commit: 1c8f767da4]
2018-02-27 13:32:11 -06:00
@@ -215,6 +215,7 @@ hsa_status_t rocprofiler_reset(rocprofiler_t* context,  // [in] profiling contex
 // Profiling callback data
 typedef struct {
  hsa_agent_t agent;
+  uint32_t agent_index;
  const hsa_queue_t* queue;
  uint64_t queue_index;
  uint64_t kernel_object;
@@ -335,7 +335,7 @@ class Context {
        const hsa_signal_value_t signal_value = hsa_signal_wait_scacquire(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, timeout,
                                  HSA_WAIT_STATE_BLOCKED);
        complete = (signal_value == 0);
-        if (!complete) printf("ROCProfiler: Signal timeout, signal(%d) timeout(%lx)\n", (int)signal_value, timeout);
+        if (!complete) printf("ROCProfiler: Signal timeout, signal(%d) timeout(0x%lx)\n", (int)signal_value, timeout);
      }
      for (rocprofiler_feature_t* rinfo : *(tuple.info_vector)) rinfo->data.kind = ROCPROFILER_DATA_KIND_UNINIT;
      callback_data_t callback_data{tuple.info_vector, tuple.info_vector->size(), NULL};
@@ -87,6 +87,7 @@ class InterceptQueue {
            reinterpret_cast<const hsa_kernel_dispatch_packet_t*>(packet);
        const char* kernel_name = GetKernelName(dispatch_packet);
        rocprofiler_callback_data_t data = {obj->agent_info_->dev_id,
+                                            obj->agent_info_->dev_index,
                                            obj->queue_,
                                            user_que_idx,
                                            dispatch_packet->kernel_object,
@@ -174,7 +174,7 @@ const MetricsDict* GetMetrics(const hsa_agent_t& agent) {

 util::Logger::mutex_t util::Logger::mutex_;
 util::Logger* util::Logger::instance_ = NULL;
-uint64_t Context::timeout_ = 1000;
+uint64_t Context::timeout_ = UINT64_MAX;
 }

 ///////////////////////////////////////////////////////////////////////////////////////////////////
@@ -25,6 +25,7 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "util/hsa_rsrc_factory.h"

 #include <dlfcn.h>
+#include <fcntl.h>
 #include <hsa.h>
 #include <hsa_ext_amd.h>
 #include <hsa_ext_finalize.h>
@@ -32,6 +33,8 @@ POSSIBILITY OF SUCH DAMAGE.
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/types.h>

 #include <atomic>
 #include <cassert>
@@ -80,14 +83,13 @@ hsa_status_t HsaRsrcFactory::FindMemRegionsCallback(hsa_region_t region, void* d
 }

 // Constructor of the class
-HsaRsrcFactory::HsaRsrcFactory() {
+HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize_hsa) {
  hsa_status_t status;
-#if 0
  // Initialize the Hsa Runtime
-  printf("ROCProfiler: HSA init\n");
-  status = hsa_init();
-  CHECK_STATUS("Error in hsa_init", status);
-#endif
+  if (initialize_hsa_) {
+    status = hsa_init();
+    CHECK_STATUS("Error in hsa_init", status);
+  }
  // Discover the set of Gpu devices available on the platform
  status = hsa_iterate_agents(GetHsaAgentsCallback, this);
  CHECK_STATUS("Error Calling hsa_iterate_agents", status);
@@ -111,11 +113,10 @@ HsaRsrcFactory::HsaRsrcFactory() {
 HsaRsrcFactory::~HsaRsrcFactory() {
  for (auto p : cpu_list_) delete p;
  for (auto p : gpu_list_) delete p;
-#if 0
-  printf("ROCProfiler: HSA shutdown\n");
-  hsa_status_t status = hsa_shut_down();
-  CHECK_STATUS("Error in hsa_shut_down", status);
-#endif
+  if (initialize_hsa_) {
+    hsa_status_t status = hsa_shut_down();
+    CHECK_STATUS("Error in hsa_shut_down", status);
+  }
 }

 hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) {
@@ -371,67 +372,53 @@ bool HsaRsrcFactory::TransferData(void* dest_buff, void* src_buff, uint32_t leng
 //
 // @return bool true if successful, false otherwise
 //
-void* HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path,
-                                      const char* kernel_name, hsa_executable_t* hsa_exec, hsa_executable_symbol_t* code_desc) {
-  // Finalize the Hsail object into code object
-  hsa_status_t status;
-  hsa_code_object_t code_object;
+bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path,
+                                      const char* kernel_name, hsa_executable_t* executable, hsa_executable_symbol_t* code_desc) {
+  hsa_status_t status = HSA_STATUS_ERROR;

  // Build the code object filename
  std::string filename(brig_path);
  std::clog << "Code object filename: " << filename << std::endl;

  // Open the file containing code object
-  std::ifstream codeStream(filename.c_str(), std::ios::binary | std::ios::ate);
-  if (!codeStream) {
-    std::cerr << "Error: failed to load " << filename << std::endl;
+  hsa_file_t file_handle = open(filename.c_str(), O_RDONLY);
+  if (file_handle == -1) {
+    std::cerr << "Error: failed to load '" << filename << "'" << std::endl;
    assert(false);
-    return NULL;
+    return false;
  }

-  // Allocate memory to read in code object from file
-  size_t size = std::string::size_type(codeStream.tellg());
-  char* code_buf = (char*)AllocateSysMemory(agent_info, size);
-  if (!code_buf) {
-    std::cerr << "Error: failed to allocate memory for code object." << std::endl;
-    assert(false);
-    return NULL;
-  }
-
-  // Read the code object into allocated memory
-  codeStream.seekg(0, std::ios::beg);
-  std::copy(std::istreambuf_iterator<char>(codeStream), std::istreambuf_iterator<char>(), code_buf);
-
-  // De-Serialize the code object that has been read into memory
-  status = hsa_code_object_deserialize(code_buf, size, NULL, &code_object);
+  // Create code object reader
+  hsa_code_object_reader_t code_obj_rdr = {0};
+  status = hsa_code_object_reader_create_from_file(file_handle, &code_obj_rdr);
  if (status != HSA_STATUS_SUCCESS) {
-    std::cerr << "Failed to deserialize code object" << std::endl;
-    if (code_buf) hsa_memory_free(code_buf);
-    return NULL;
+    std::cerr << "Failed to create code object reader '" << filename << "'" << std::endl;
+    return false;
  }

  // Create executable.
-  status =
-      hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, "", hsa_exec);
+  status = hsa_executable_create_alt(HSA_PROFILE_FULL,
+    HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, executable);
  CHECK_STATUS("Error in creating executable object", status);

  // Load code object.
-  status = hsa_executable_load_code_object(*hsa_exec, agent_info->dev_id, code_object, "");
+  status = hsa_executable_load_agent_code_object(*executable, agent_info->dev_id,
+    code_obj_rdr, NULL, NULL);
  CHECK_STATUS("Error in loading executable object", status);

  // Freeze executable.
-  status = hsa_executable_freeze(*hsa_exec, "");
+  status = hsa_executable_freeze(*executable, "");
  CHECK_STATUS("Error in freezing executable object", status);

  // Get symbol handle.
  hsa_executable_symbol_t kernelSymbol;
-  status = hsa_executable_get_symbol(*hsa_exec, NULL, kernel_name, agent_info->dev_id, 0,
+  status = hsa_executable_get_symbol(*executable, NULL, kernel_name, agent_info->dev_id, 0,
                                     &kernelSymbol);
  CHECK_STATUS("Error in looking up kernel symbol", status);

  // Update output parameter
  *code_desc = kernelSymbol;
-  return code_buf;
+  return true;
 }

 // Print the various fields of Hsa Gpu Agents
@@ -114,16 +114,16 @@ class HsaRsrcFactory {
 public:
  typedef std::recursive_mutex mutex_t;

-  static HsaRsrcFactory* Create() {
+  static HsaRsrcFactory* Create(bool initialize_hsa = true) {
    std::lock_guard<mutex_t> lck(mutex_);
    if (instance_ == NULL) {
-      instance_ = new HsaRsrcFactory();
+      instance_ = new HsaRsrcFactory(initialize_hsa);
    }
    return instance_;
  }

  static HsaRsrcFactory& Instance() {
-    if (instance_ == NULL) instance_ = Create();
+    if (instance_ == NULL) instance_ = Create(false);
    hsa_status_t status = (instance_ != NULL) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
    CHECK_STATUS("HsaRsrcFactory::Instance() failed", status);
    return *instance_;
@@ -229,9 +229,9 @@ class HsaRsrcFactory {
  // @param code_desc Handle of finalized Code Descriptor that could
  // be used to submit for execution
  //
-  // @return code buffer, non NULL if successful, NULL otherwise
+  // @return true if successful, false otherwise
  //
-  void* LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, const char* kernel_name,
+  bool LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, const char* kernel_name,
                        hsa_executable_t* hsa_exec, hsa_executable_symbol_t* code_desc);

  // Print the various fields of Hsa Gpu Agents
@@ -259,11 +259,14 @@ class HsaRsrcFactory {

  // Constructor of the class. Will initialize the Hsa Runtime and
  // query the system topology to get the list of Cpu and Gpu devices
-  HsaRsrcFactory();
+  HsaRsrcFactory(bool initialize_hsa);

  // Destructor of the class
  ~HsaRsrcFactory();

+  // True if this factory initialized the HSA runtime and must shut it down
+  const bool initialize_hsa_;
+
  // Add an instance of AgentInfo representing a Hsa Gpu agent
  const AgentInfo* AddAgentInfo(const hsa_agent_t agent);

@@ -89,7 +89,7 @@ class Xml {
    AddExpr(full_tag, name, oss.str());
  }

-  nodes_t GetNodes(std::string global_tag) { return (*map_)[global_tag]; }
+  nodes_t GetNodes(const std::string& global_tag) { return (*map_)[global_tag]; }

  template <class F>
  F ForEach(const F& f_i) {
@@ -43,7 +43,7 @@ HsaRsrcFactory* TestHsa::HsaInstantiate(const uint32_t agent_ind) {
  if (hsa_rsrc_ == NULL) {
    agent_id_ = agent_ind;

-    hsa_rsrc_ = HsaRsrcFactory::CreateInstance();
+    hsa_rsrc_ = HsaRsrcFactory::Create();

    // Print properties of the agents
    hsa_rsrc_->PrintGpuAgents("> GPU agents");
@@ -125,9 +125,8 @@ bool TestHsa::Setup() {

  // Load and Finalize Kernel Code Descriptor
  char* brig_path = (char*)brig_path_obj_.c_str();
-  code_buf_ =
-      hsa_rsrc_->LoadAndFinalize(agent_info_, brig_path, name_.c_str(), &hsa_exec_, &kernel_code_desc_);
-  if (code_buf_ == NULL) {
+  bool suc =  hsa_rsrc_->LoadAndFinalize(agent_info_, brig_path, name_.c_str(), &hsa_exec_, &kernel_code_desc_);
+  if (suc == false) {
    std::cerr << "Error in loading and finalizing Kernel" << std::endl;
    return false;
  }
@@ -241,7 +240,6 @@ void TestHsa::PrintTime() {

 bool TestHsa::Cleanup() {
  hsa_executable_destroy(hsa_exec_);
-  hsa_memory_free(code_buf_);
  hsa_signal_destroy(hsa_signal_);
  return true;
 }
@@ -47,7 +47,6 @@ class TestHsa : public TestAql {
    total_time_taken_ = 0;
    setup_time_taken_ = 0;
    dispatch_time_taken_ = 0;
-    code_buf_ = NULL;
    hsa_exec_ = {};
  }

@@ -123,8 +122,7 @@ class TestHsa : public TestAql {
  // Test kernel name
  std::string name_;

-  // Kernel code buffer
-  void* code_buf_;
+  // Kernel executable
  hsa_executable_t hsa_exec_;
 };

@@ -32,6 +32,9 @@ struct callbacks_data_t {
  unsigned feature_count;
  unsigned group_index;
  FILE* file_handle;
+  std::vector<uint32_t>* gpu_index;
+  std::vector<std::string>* kernel_string;
+  std::vector<uint32_t>* range;
 };

 // Context stored entry type
@@ -54,10 +57,18 @@ typedef std::map<uint32_t, context_entry_t> context_array_t;
 context_array_t* context_array = NULL;
 // Contexts collected count
 uint32_t context_count = 0;
+uint32_t context_collected = 0;
 // Profiling results output file name
 const char* result_prefix = NULL;
 // Global results file handle
 FILE* result_file_handle = NULL;
+// Dispatch filters
+//  GPU index filter
+std::vector<uint32_t>* gpu_index_vec = NULL;
+//  Kernel name filter
+std::vector<std::string>* kernel_string_vec = NULL;
+//  Dispatch number range filter
+std::vector<uint32_t>* range_vec = NULL;

 // Check returned HSA API status
 void check_status(hsa_status_t status) {
@@ -69,6 +80,20 @@ void check_status(hsa_status_t status) {
  }
 }

+uint32_t next_context_count() {
+  if (pthread_mutex_lock(&mutex) != 0) {
+    perror("pthread_mutex_lock");
+    exit(1);
+  }
+  const uint32_t prev_val = context_count;
+  context_count = prev_val + 1;
+  if (pthread_mutex_unlock(&mutex) != 0) {
+    perror("pthread_mutex_unlock");
+    exit(1);
+  }
+  return prev_val;
+}
+
 // Allocate entry to store profiling context
 context_entry_t* alloc_context_entry() {
  if (pthread_mutex_lock(&mutex) != 0) {
@@ -83,7 +108,6 @@ context_entry_t* alloc_context_entry() {
    fprintf(stderr, "context_array corruption, index repeated %u\n", index);
    abort();
  }
-  ++context_count;

  if (pthread_mutex_unlock(&mutex) != 0) {
    perror("pthread_mutex_unlock");
@@ -91,7 +115,6 @@ context_entry_t* alloc_context_entry() {
  }

  context_entry_t* entry = &(ret.first->second);
-  entry->index = index;
  return entry;
 }

@@ -220,6 +243,7 @@ void dump_context(context_entry_t* entry) {
  hsa_status_t status = HSA_STATUS_ERROR;

  if (entry->valid) {
+    ++context_collected;
    entry->valid = 0;
    const uint32_t index = entry->index;
    FILE* file_handle = entry->file_handle;
@@ -285,10 +309,46 @@ void handler(rocprofiler_group_t group, void* arg) {
 // Kernel disoatch callback
 hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* user_data,
                               rocprofiler_group_t* group) {
-  // HSA status
-  hsa_status_t status = HSA_STATUS_ERROR;
  // Passed tool data
  callbacks_data_t* tool_data = reinterpret_cast<callbacks_data_t*>(user_data);
+
+  // Checking dispatch condition
+  bool found = true;
+  std::vector<uint32_t>* range_ptr = tool_data->range;
+  if (found && range_ptr) {
+    found = false;
+    std::vector<uint32_t>& range = *range_ptr;
+    if (range.size() == 1) {
+      if (context_count >= range[0]) found = true;
+    } else if (range.size() == 2) {
+      if ((context_count >= range[0]) && (context_count < range[1])) found = true;
+    }
+  }
+  std::vector<uint32_t>* gpu_index = tool_data->gpu_index;
+  if (found && gpu_index) {
+    found = false;
+    for (uint32_t i : *gpu_index) {
+      if (i == callback_data->agent_index) {
+        found = true;
+      }
+    }
+  }
+  std::vector<std::string>* kernel_string  = tool_data->kernel_string;
+  if (found && kernel_string) {
+    found = false;
+    for (const std::string& s : *kernel_string) {
+      if (std::string(callback_data->kernel_name).find(s) != std::string::npos) {
+        found = true;
+      }
+    }
+  }
+  if (found == false) {
+    next_context_count();
+    return HSA_STATUS_SUCCESS;
+  }
+
+  // HSA status
+  hsa_status_t status = HSA_STATUS_ERROR;
  // Profiling context
  rocprofiler_t* context = NULL;
  // Context entry
@@ -320,6 +380,7 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data,
  entry->data = *callback_data;
  entry->data.kernel_name = strdup(callback_data->kernel_name);
  entry->file_handle = tool_data->file_handle;
+  entry->index = next_context_count();
  entry->valid = 1;

  return status;
@@ -341,19 +402,48 @@ static hsa_status_t info_callback(const rocprofiler_info_data_t info, void * arg
  return HSA_STATUS_SUCCESS;
 }

+void get_xml_array(xml::Xml* xml, const std::string& tag, const std::string& field, const std::string& delim, std::vector<std::string>* vec, const char* label = NULL) {
+  auto nodes = xml->GetNodes(tag);
+  auto rit = nodes.rbegin();
+  auto rend = nodes.rend();
+  while (rit != rend) {
+    auto& opts = (*rit)->opts;
+    if (opts.find(field) != opts.end()) break;
+    ++rit;
+  }
+  if (rit != rend) {
+    const std::string array_string = (*rit)->opts[field];
+    if (label != NULL) printf("%s%s = %s\n", label, field.c_str(), array_string.c_str());
+    size_t pos1 = 0;
+    while (pos1 < array_string.length()) {
+      const size_t pos2 = array_string.find(delim, pos1);
+      const std::string token = array_string.substr(pos1, pos2 - pos1);
+      vec->push_back(token);
+      if (pos2 == std::string::npos) break;
+      pos1 = pos2 + 1;
+    }
+  }
+}
+
+void get_xml_array(xml::Xml* xml, const std::string& tag, const std::string& field, const std::string& delim, std::vector<uint32_t>* vec, const char* label = NULL) {
+  std::vector<std::string> str_vec;
+  get_xml_array(xml, tag, field, delim, &str_vec, label);
+  for (const std::string& str : str_vec) vec->push_back(atoi(str.c_str()));
+}
+
 // Tool constructor
 extern "C" PUBLIC_API void OnLoadTool()
 {
  std::map<std::string, hsa_ven_amd_aqlprofile_parameter_name_t> parameters_dict;
-  parameters_dict["HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET"] =
+  parameters_dict["COMPUTE_UNIT_TARGET"] =
      HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET;
-  parameters_dict["HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK"] =
+  parameters_dict["VM_ID_MASK"] =
      HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK;
-  parameters_dict["HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK"] =
+  parameters_dict["MASK"] =
      HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK;
-  parameters_dict["HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK"] =
+  parameters_dict["TOKEN_MASK"] =
      HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK;
-  parameters_dict["HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2"] =
+  parameters_dict["TOKEN_MASK2"] =
      HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2;

  char* info_symb = getenv("ROCP_INFO");
@@ -374,7 +464,7 @@ extern "C" PUBLIC_API void OnLoadTool()
    DIR* dir = opendir(result_prefix);
    if (dir == NULL) {
      std::ostringstream errmsg;
-      errmsg << "Cannot open output directory '" << result_prefix << "'";
+      errmsg << "ROCProfiler: Cannot open output directory '" << result_prefix << "'";
      perror(errmsg.str().c_str());
      exit(1);
    }
@@ -383,7 +473,7 @@ extern "C" PUBLIC_API void OnLoadTool()
    result_file_handle = fopen(oss.str().c_str(), "w");
    if (result_file_handle == NULL) {
      std::ostringstream errmsg;
-      errmsg << "fopen error, file '" << oss.str().c_str() << "'";
+      errmsg << "ROCProfiler: fopen error, file '" << oss.str().c_str() << "'";
      perror(errmsg.str().c_str());
      exit(1);
    }
@@ -399,24 +489,23 @@ extern "C" PUBLIC_API void OnLoadTool()
  printf("ROCProfiler: input from \"%s\"\n", xml_name);
  xml::Xml* xml = xml::Xml::Create(xml_name);
  if (xml == NULL) {
-    fprintf(stderr, "Input file not found '%s'\n", xml_name);
+    fprintf(stderr, "ROCProfiler: Input file not found '%s'\n", xml_name);
    exit(1);
  }

  // Getting metrics
-  auto metrics_list = xml->GetNodes("top.metric");
  std::vector<std::string> metrics_vec;
-  for (auto* entry : metrics_list) {
-    const std::string entry_str = entry->opts["name"];
-    size_t pos1 = 0;
-    while (pos1 < entry_str.length()) {
-      const size_t pos2 = entry_str.find(",", pos1);
-      const std::string metric_name = entry_str.substr(pos1, pos2 - pos1);
-      metrics_vec.push_back(metric_name);
-      if (pos2 == std::string::npos) break;
-      pos1 = pos2 + 1;
-    }
-  }
+  get_xml_array(xml, "top.metric", "name", ",", &metrics_vec);
+
+  // Getting GPU indexes
+  gpu_index_vec = new std::vector<uint32_t>;
+  get_xml_array(xml, "top.metric", "gpu_index", ",", gpu_index_vec, "  ");
+  // Getting kernel names
+  kernel_string_vec = new std::vector<std::string>;
+  get_xml_array(xml, "top.metric", "kernel", ",", kernel_string_vec, "  ");
+  // Getting profiling range
+  range_vec = new std::vector<uint32_t>;
+  get_xml_array(xml, "top.metric", "range", ":", range_vec, "  ");

  // Getting traces
  auto traces_list = xml->GetNodes("top.trace");
@@ -458,7 +547,7 @@ extern "C" PUBLIC_API void OnLoadTool()
      for (auto& v : params->opts) {
        const std::string parameter_name = v.first;
        if (parameters_dict.find(parameter_name) == parameters_dict.end()) {
-          fprintf(stderr, "ROCProfiler: unknown trace parameter %s\n", parameter_name.c_str());
+          fprintf(stderr, "ROCProfiler: unknown trace parameter '%s'\n", parameter_name.c_str());
          exit(1);
        }
        const uint32_t value = strtol(v.second.c_str(), NULL, 0);
@@ -489,6 +578,9 @@ extern "C" PUBLIC_API void OnLoadTool()
    callbacks_data->feature_count = feature_count;
    callbacks_data->group_index = 0;
    callbacks_data->file_handle = result_file_handle;
+    callbacks_data->gpu_index = (gpu_index_vec->empty()) ? NULL : gpu_index_vec;
+    callbacks_data->kernel_string = (kernel_string_vec->empty()) ? NULL : kernel_string_vec;
+    callbacks_data->range = (range_vec->empty()) ? NULL : range_vec;;

    rocprofiler_set_queue_callbacks(callbacks_ptrs, callbacks_data);
  }
@@ -503,7 +595,7 @@ extern "C" PUBLIC_API void OnUnloadTool() {

  // Dump stored profiling output data
  const bool result_file_opened = (result_prefix != NULL) && (result_file_handle != NULL);
-  printf("\nROCPRofiler: %u contexts collected", context_count);
+  printf("\nROCPRofiler: %u contexts collected", context_collected);
  if (result_file_opened) printf(", output directory %s", result_prefix);
  printf("\n");
  dump_context_array();
@@ -514,4 +606,7 @@ extern "C" PUBLIC_API void OnUnloadTool() {
    delete[] callbacks_data->features;
    delete callbacks_data;
  }
+  delete gpu_index_vec;
+  delete kernel_string_vec;
+  delete range_vec;
 }
@@ -0,0 +1,93 @@
+<gfx8>
+  <metric name=GRBM_COUNT block=GRBM event=0 ></metric>
+  <metric name=GRBM_GUI_ACTIVE block=GRBM event=2 ></metric>
+
+  <metric name=SQ_CYCLES block=SQ event=2 ></metric>
+  <metric name=SQ_WAVES block=SQ event=4 ></metric>
+  <metric name=SQ_ITEMS block=SQ event=14 ></metric>
+  <metric name=SQ_INSTS_VALU block=SQ event=26 ></metric>
+  <metric name=SQ_INSTS_VMEM_WR block=SQ event=27 ></metric>
+  <metric name=SQ_INSTS_VMEM_RD block=SQ event=28 ></metric>
+  <metric name=SQ_INSTS_SALU block=SQ event=30 ></metric>
+  <metric name=SQ_INSTS_SMEM block=SQ event=31 ></metric>
+  <metric name=SQ_INSTS_FLAT block=SQ event=32 ></metric>
+  <metric name=SQ_INSTS_FLAT_LDS_ONLY block=SQ event=33 ></metric>
+  <metric name=SQ_INSTS_LDS block=SQ event=34 ></metric>
+  <metric name=SQ_INSTS_GDS block=SQ event=35 ></metric>
+  <metric name=SQ_WAVE_READY block=SQ event=47 ></metric>
+
+  <metric name=SQ_WAIT_INST_LDS block=SQ event=61 descr="Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. (per-simd, nondeterministic)"></metric>
+  <metric name=SQ_ACTIVE_INST_VALU block=SQ event=69 descr="Number of cycles the SQ instruction arbiter is working on a VALU instruction. (per-simd, nondeterministic)"></metric>
+  <metric name=SQ_INST_CYCLES_SALU block=SQ event=86 descr="Number of cycles needed to execute non-memory read scalar operations. (per-simd, emulated)"></metric>
+  <metric name=SQ_THREAD_CYCLES_VALU block=SQ event=89 descr="Number of thread-cycles used to execute VALU operations (similar to INST_CYCLES_VALU but multiplied by # of active threads). (per-simd)"></metric>
+  <metric name=SQ_THREAD_CYCLES_VALU_MAX block=SQ event=90 descr="Maximum number of thread-cycles VALU operations that could have been executed given the instruction mix (similar to INST_CYCLES_VALU but multiplied by # of active threads). (per-simd, emulated)"></metric>
+  <metric name=SQ_LDS_BANK_CONFLICT block=SQ event=97 descr="Number of cycles LDS is stalled by bank conflicts. (emulated)"></metric>
+
+  <metric name=TA_TA_BUSY block=TA event=15 ></metric>
+  <metric name=TA_FLAT_READ_WAVEFRONTS block=TA event=101 ></metric>
+  <metric name=TA_FLAT_WRITE_WAVEFRONTS block=TA event=102 ></metric>
+
+  <metric name=TCC_CYCLE block=TCC event=1 ></metric>
+  <metric name=TCC_REQ block=TCC event=3 ></metric>
+  <metric name=TCC_HIT block=TCC event=18 ></metric>
+  <metric name=TCC_MISS block=TCC event=19 ></metric>
+  <metric name=TCC_WRITEBACK block=TCC event=22 ></metric>
+  <metric name=TCC_EA_WRREQ block=TCC event=26 ></metric>
+  <metric name=TCC_EA_WRREQ_64B block=TCC event=27 ></metric>
+  <metric name=TCC_EA_WRREQ_STALL block=TCC event=30 ></metric>
+  <metric name=TCC_MC_RDREQ block=TCC event=35 ></metric>
+  <metric name="TCC_MC_WRREQ" block=TCC event=26 descr="Number of 32-byte transactions going over the TC_MC_wrreq interface. Atomics may travel over the same interface and are generally classified as write requests."></metric>
+  <metric name="TCC_MC_WRREQ_STALL" block=TCC event=28 descr="Number of cycles a write request was stalled."></metric>
+
+  <metric name="TCP_TA_DATA_STALL_CYCLES" block=TCP event=3 descr="TCP stalls TA data interface. Now Windowed."></metric>
+  <metric name="TCP_TCP_TA_DATA_STALL_CYCLES" block=TCP event=3 descr="TCP stalls TA data interface. Not Windowed."></metric>
+
+  <metric name=CPC_ALWAYS_COUNT block=CPC event=0 ></metric>
+  <metric name=CPC_ME1_STALL_WAIT_ON_RCIU_READ block=CPC event=8 ></metric>
+</gfx8>
+
+<gfx9>
+  <metric name=GRBM_COUNT block=GRBM event=0 ></metric>
+  <metric name=GRBM_GUI_ACTIVE block=GRBM event=2 ></metric>
+
+  <metric name=SQ_CYCLES block=SQ event=2 ></metric>
+  <metric name=SQ_WAVES block=SQ event=4 ></metric>
+  <metric name=SQ_ITEMS block=SQ event=14 ></metric>
+  <metric name=SQ_INSTS_VALU block=SQ event=26 ></metric>
+  <metric name=SQ_INSTS_VMEM_WR block=SQ event=27 ></metric>
+  <metric name=SQ_INSTS_VMEM_RD block=SQ event=28 ></metric>
+  <metric name=SQ_INSTS_SALU block=SQ event=30 ></metric>
+  <metric name=SQ_INSTS_SMEM block=SQ event=31 ></metric>
+  <metric name=SQ_INSTS_FLAT block=SQ event=32 ></metric>
+  <metric name=SQ_INSTS_FLAT_LDS_ONLY block=SQ event=33 ></metric>
+  <metric name=SQ_INSTS_LDS block=SQ event=34 ></metric>
+  <metric name=SQ_INSTS_GDS block=SQ event=35 ></metric>
+  <metric name=SQ_WAVE_READY block=SQ event=47 ></metric>
+
+  <metric name=SQ_WAIT_INST_LDS block=SQ event=63 descr="Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. (per-simd, nondeterministic)"></metric>
+  <metric name=SQ_ACTIVE_INST_VALU block=SQ event=71 descr="regspec 71? Number of cycles the SQ instruction arbiter is working on a VALU instruction. (per-simd, nondeterministic)"></metric>
+  <metric name=SQ_INST_CYCLES_SALU block=SQ event=84 descr="Number of cycles needed to execute non-memory read scalar operations. (per-simd, emulated)"></metric>
+  <metric name=SQ_THREAD_CYCLES_VALU block=SQ event=85 descr="Number of thread-cycles used to execute VALU operations (similar to INST_CYCLES_VALU but multiplied by # of active threads). (per-simd)"></metric>
+  <metric name=SQ_THREAD_CYCLES_VALU_MAX block=SQ event=86 descr="Maximum number of thread-cycles VALU operations that could have been executed given the instruction mix (similar to INST_CYCLES_VALU but multiplied by # of active threads). (per-simd, emulated)"></metric>
+  <metric name=SQ_LDS_BANK_CONFLICT block=SQ event=93 descr="Number of cycles LDS is stalled by bank conflicts. (emulated)"></metric>
+
+  <metric name=TA_TA_BUSY block=TA event=15 ></metric>
+  <metric name=TA_FLAT_READ_WAVEFRONTS block=TA event=101 ></metric>
+  <metric name=TA_FLAT_WRITE_WAVEFRONTS block=TA event=102 ></metric>
+
+  <metric name=TCC_CYCLE block=TCC event=1 ></metric>
+  <metric name=TCC_REQ block=TCC event=3 ></metric>
+  <metric name=TCC_HIT block=TCC event=20 ></metric>
+  <metric name=TCC_MISS block=TCC event=22 ></metric>
+  <metric name=TCC_WRITEBACK block=TCC event=25 ></metric>
+  <metric name=TCC_EA_WRREQ block=TCC event=29 ></metric>
+  <metric name=TCC_EA_WRREQ_64B block=TCC event=30 ></metric>
+  <metric name=TCC_EA_WRREQ_STALL block=TCC event=33 ></metric>
+  <metric name=TCC_EA_RDREQ block=TCC event=41 ></metric>
+  <metric name=TCC_EA_RDREQ_32B block=TCC event=42 ></metric>
+
+  <metric name=TCP_TA_DATA_STALL_CYCLES block=TCP event=6 descr="TCP stalls TA data interface. Now Windowed."></metric>
+
+  <metric name=CPC_ALWAYS_COUNT block=CPC event=0 ></metric>
+  <metric name=CPC_ME1_STALL_WAIT_ON_RCIU_READ block=CPC event=8 ></metric>
+</gfx9>
@@ -1,9 +1,23 @@
-<metric name=SQ_CYCLES,SQ_WAVES,SQ_INSTS_SMEM,SQ_INSTS_VALU,TA_FLAT_WRITE_WAVEFRONTS[0],TA_FLAT_WRITE_WAVEFRONTS[1],CPC_ALWAYS_COUNT,CPC_ME1_STALL_WAIT_ON_RCIU_READ,GPUBusy,VALUBusy,SALUBusy,MemUnitBusy,SFetchInsts,FetchSize,VWriteInsts,WriteSize
+# Filter by dispatches range, GPU index and kernel names
+<metric
+  # range format "3:9"
+  range=""
+  # list of gpu indexes "0,1,2,3"
+  gpu_index=""
+  # list of matched sub-strings "Simple1,Conv1,SimpleConvolution"
+  kernel=""
 ></metric>

-<trace name=SQTT copy=true >
+# List of metrics
+<metric
+  name=SQ_CYCLES,SQ_WAVES,SQ_INSTS_SMEM,SQ_INSTS_VALU,TA_FLAT_WRITE_WAVEFRONTS[0],TA_FLAT_WRITE_WAVEFRONTS[1],CPC_ALWAYS_COUNT,CPC_ME1_STALL_WAIT_ON_RCIU_READ,GPUBusy,VALUBusy,SALUBusy,MemUnitBusy,SFetchInsts,FetchSize,VWriteInsts,WriteSize
+></metric>
+
+# SQTT trace with parameters
+<trace name=SQTT copy="true">
  <parameters
-    HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK=0xf
-    HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK=0xf
+    MASK=0x0f00
+    TOKEN_MASK=0x144b
+    TOKEN_MASK2=0xffff
  ></parameters>
 </trace>
@@ -1,283 +1,205 @@
+#include "gfx_metrics.xml"
+
 <gfx8>
-  <metric name=GRBM_COUNT block=GRBM event=0 ></metric>
-  <metric name=GRBM_GUI_ACTIVE block=GRBM event=2 ></metric>
-
-  <metric name=SQ_CYCLES block=SQ event=2 ></metric>
-  <metric name=SQ_WAVES block=SQ event=4 ></metric>
-  <metric name=SQ_ITEMS block=SQ event=14 ></metric>
-  <metric name=SQ_INSTS_VALU block=SQ event=26 ></metric>
-  <metric name=SQ_INSTS_VMEM_WR block=SQ event=27 ></metric>
-  <metric name=SQ_INSTS_VMEM_RD block=SQ event=28 ></metric>
-  <metric name=SQ_INSTS_SALU block=SQ event=30 ></metric>
-  <metric name=SQ_INSTS_SMEM block=SQ event=31 ></metric>
-  <metric name=SQ_INSTS_FLAT block=SQ event=32 ></metric>
-  <metric name=SQ_INSTS_FLAT_LDS_ONLY block=SQ event=33 ></metric>
-  <metric name=SQ_INSTS_LDS block=SQ event=34 ></metric>
-  <metric name=SQ_INSTS_GDS block=SQ event=35 ></metric>
-  <metric name=SQ_WAVE_READY block=SQ event=47 ></metric>
-
-  <metric name=SQ_WAIT_INST_LDS block=SQ event=61 descr="Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. (per-simd, nondeterministic)"></metric>
-  <metric name=SQ_ACTIVE_INST_VALU block=SQ event=69 descr="Number of cycles the SQ instruction arbiter is working on a VALU instruction. (per-simd, nondeterministic)"></metric>
-  <metric name=SQ_INST_CYCLES_SALU block=SQ event=86 descr="Number of cycles needed to execute non-memory read scalar operations. (per-simd, emulated)"></metric>
-  <metric name=SQ_THREAD_CYCLES_VALU block=SQ event=89 descr="Number of thread-cycles used to execute VALU operations (similar to INST_CYCLES_VALU but multiplied by # of active threads). (per-simd)"></metric>
-  <metric name=SQ_THREAD_CYCLES_VALU_MAX block=SQ event=90 descr="Maximum number of thread-cycles VALU operations that could have been executed given the instruction mix (similar to INST_CYCLES_VALU but multiplied by # of active threads). (per-simd, emulated)"></metric>
-  <metric name=SQ_LDS_BANK_CONFLICT block=SQ event=97 descr="Number of cycles LDS is stalled by bank conflicts. (emulated)"></metric>
-
-  <metric name=TA_BUSY block=TA event=15 ></metric>
-  <metric name=TA_FLAT_READ_WAVEFRONTS block=TA event=101 ></metric>
-  <metric name=TA_FLAT_WRITE_WAVEFRONTS block=TA event=102 ></metric>
-
-  <metric name=TCC_CYCLE block=TCC event=1 ></metric>
-  <metric name=TCC_REQ block=TCC event=3 ></metric>
-  <metric name=TCC_HIT block=TCC event=18 ></metric>
-  <metric name=TCC_MISS block=TCC event=19 ></metric>
-  <metric name=TCC_WRITEBACK block=TCC event=22 ></metric>
-  <metric name=TCC_EA_WRREQ block=TCC event=26 ></metric>
-  <metric name=TCC_EA_WRREQ_64B block=TCC event=27 ></metric>
-  <metric name=TCC_EA_WRREQ_STALL block=TCC event=30 ></metric>
-  <metric name=TCC_MC_RDREQ block=TCC event=35 ></metric>
-
-  <metric name=TCP_TA_DATA_STALL_CYCLES block=TCP event=3 descr="TCP stalls TA data interface. Now Windowed."></metric>
-
-  <metric name=CPC_ALWAYS_COUNT block=CPC event=0 ></metric>
-  <metric name=CPC_ME1_STALL_WAIT_ON_RCIU_READ block=CPC event=8 ></metric>
-
-  # average for (16 instances x 4 shader engines)
-  <metric name="TA_BUSY_avr" expr=avr(TA_BUSY,16)/4 ></metric>
+  # average, max and min over 16 instances
+  <metric name="TA_BUSY_avr" expr=avr(TA_TA_BUSY,16) ></metric>
+  <metric name="TA_BUSY_max" expr=max(TA_TA_BUSY,16) ></metric>
+  <metric name="TA_BUSY_min" expr=min(TA_TA_BUSY,16) ></metric>
  # sum for 16 instances
  <metric name="TA_FLAT_READ_WAVEFRONTS_sum" expr=sum(TA_FLAT_READ_WAVEFRONTS,16) ></metric>
  <metric name="TA_FLAT_WRITE_WAVEFRONTS_sum" expr=sum(TA_FLAT_WRITE_WAVEFRONTS,16) ></metric>
  <metric name="TCC_HIT_sum" expr=sum(TCC_HIT,16) ></metric>
  <metric name="TCC_MISS_sum" expr=sum(TCC_MISS,16) ></metric>
  <metric name="TCC_MC_RDREQ_sum" expr=sum(TCC_MC_RDREQ,16) ></metric>
+  <metric name="TCC_MC_WRREQ_sum" expr=sum(TCC_MC_WRREQ,16) ></metric>
+  <metric name="TCC_WRREQ_STALL_max" expr=max(TCC_MC_WRREQ_STALL,16) ></metric>

  # FETCH_SIZE, kilobytes
  # The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account.
-  <metric
-    name="FETCH_SIZE"
-    expr=(TCC_MC_RDREQ_sum*32)/1024
-    descr="The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."
-  ></metric>
+  <metric name="FETCH_SIZE" expr=(TCC_MC_RDREQ_sum*32)/1024 ></metric>
+  # WRITE_SIZE, kilobytes
+  # The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account.
+  <metric name="WRITE_SIZE" expr=(TCC_MC_WRREQ_sum*32)/1024 ></metric>
 </gfx8>

 <gfx9>
-  <metric name=GRBM_COUNT block=GRBM event=0 ></metric>
-  <metric name=GRBM_GUI_ACTIVE block=GRBM event=2 ></metric>
-
-  <metric name=SQ_CYCLES block=SQ event=2 ></metric>
-  <metric name=SQ_WAVES block=SQ event=4 ></metric>
-  <metric name=SQ_ITEMS block=SQ event=14 ></metric>
-  <metric name=SQ_INSTS_VALU block=SQ event=26 ></metric>
-  <metric name=SQ_INSTS_VMEM_WR block=SQ event=27 ></metric>
-  <metric name=SQ_INSTS_VMEM_RD block=SQ event=28 ></metric>
-  <metric name=SQ_INSTS_SALU block=SQ event=30 ></metric>
-  <metric name=SQ_INSTS_SMEM block=SQ event=31 ></metric>
-  <metric name=SQ_INSTS_FLAT block=SQ event=32 ></metric>
-  <metric name=SQ_INSTS_FLAT_LDS_ONLY block=SQ event=33 ></metric>
-  <metric name=SQ_INSTS_LDS block=SQ event=34 ></metric>
-  <metric name=SQ_INSTS_GDS block=SQ event=35 ></metric>
-  <metric name=SQ_WAVE_READY block=SQ event=47 ></metric>
-
-  <metric name=SQ_WAIT_INST_LDS block=SQ event=63 descr="Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. (per-simd, nondeterministic)"></metric>
-  <metric name=SQ_ACTIVE_INST_VALU block=SQ event=71 descr="regspec 71? Number of cycles the SQ instruction arbiter is working on a VALU instruction. (per-simd, nondeterministic)"></metric>
-  <metric name=SQ_INST_CYCLES_SALU block=SQ event=84 descr="Number of cycles needed to execute non-memory read scalar operations. (per-simd, emulated)"></metric>
-  <metric name=SQ_THREAD_CYCLES_VALU block=SQ event=85 descr="Number of thread-cycles used to execute VALU operations (similar to INST_CYCLES_VALU but multiplied by # of active threads). (per-simd)"></metric>
-  <metric name=SQ_THREAD_CYCLES_VALU_MAX block=SQ event=86 descr="Maximum number of thread-cycles VALU operations that could have been executed given the instruction mix (similar to INST_CYCLES_VALU but multiplied by # of active threads). (per-simd, emulated)"></metric>
-  <metric name=SQ_LDS_BANK_CONFLICT block=SQ event=93 descr="Number of cycles LDS is stalled by bank conflicts. (emulated)"></metric>
-
-  <metric name=TA_BUSY block=TA event=15 ></metric>
-  <metric name=TA_FLAT_READ_WAVEFRONTS block=TA event=101 ></metric>
-  <metric name=TA_FLAT_WRITE_WAVEFRONTS block=TA event=102 ></metric>
-
-  <metric name=TCC_CYCLE block=TCC event=1 ></metric>
-  <metric name=TCC_REQ block=TCC event=3 ></metric>
-  <metric name=TCC_HIT block=TCC event=20 ></metric>
-  <metric name=TCC_MISS block=TCC event=22 ></metric>
-  <metric name=TCC_WRITEBACK block=TCC event=25 ></metric>
-  <metric name=TCC_EA_WRREQ block=TCC event=29 ></metric>
-  <metric name=TCC_EA_WRREQ_64B block=TCC event=30 ></metric>
-  <metric name=TCC_EA_WRREQ_STALL block=TCC event=33 ></metric>
-  <metric name=TCC_EA_RDREQ block=TCC event=41 ></metric>
-  <metric name=TCC_EA_RDREQ_32B block=TCC event=42 ></metric>
-
-  <metric name=TCP_TA_DATA_STALL_CYCLES block=TCP event=6 descr="TCP stalls TA data interface. Now Windowed."></metric>
-
-  <metric name=CPC_ALWAYS_COUNT block=CPC event=0 ></metric>
-  <metric name=CPC_ME1_STALL_WAIT_ON_RCIU_READ block=CPC event=8 ></metric>
-
-  # average for (16 instances x 4 shader engines)
-  <metric name="TA_BUSY_avr" expr=avr(TA_BUSY,16)/4 ></metric>
+  # average, max and min over 16 instances
+  <metric name="TA_BUSY_avr" expr=avr(TA_TA_BUSY,16) ></metric>
+  <metric name="TA_BUSY_max" expr=max(TA_TA_BUSY,16) ></metric>
+  <metric name="TA_BUSY_min" expr=min(TA_TA_BUSY,16) ></metric>
  # sum for 16 instances
  <metric name="TA_FLAT_READ_WAVEFRONTS_sum" expr=sum(TA_FLAT_READ_WAVEFRONTS,16) ></metric>
  <metric name="TA_FLAT_WRITE_WAVEFRONTS_sum" expr=sum(TA_FLAT_WRITE_WAVEFRONTS,16) ></metric>
  <metric name="TCC_HIT_sum" expr=sum(TCC_HIT,16) ></metric>
  <metric name="TCC_MISS_sum" expr=sum(TCC_MISS,16) ></metric>
-  <metric name="TCC_EA_RDREQ_sum" expr=sum(TCC_EA_RDREQ,16) ></metric>
  <metric name="TCC_EA_RDREQ_32B_sum" expr=sum(TCC_EA_RDREQ_32B,16) ></metric>
+  <metric name="TCC_EA_RDREQ_sum" expr=sum(TCC_EA_RDREQ,16) ></metric>
+  <metric name="TCC_EA_WRREQ_sum" expr=sum(TCC_EA_WRREQ,16) ></metric>
+  <metric name="TCC_EA_WRREQ_64B_sum" expr=sum(TCC_EA_WRREQ_64B,16) ></metric>
+  <metric name="TCC_WRREQ_STALL_max" expr=max(TCC_EA_WRREQ_STALL,16) ></metric>

  # FETCH_SIZE, kilobytes
  # The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account.
-  <metric
-    name="FETCH_SIZE"
-    expr=((TCC_EA_RDREQ_sum-TCC_EA_RDREQ_32B_sum)*64+TCC_EA_RDREQ_32B_sum*32)/1024
-    descr="The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."
-  ></metric>
+  <metric name="FETCH_SIZE" expr=(TCC_EA_RDREQ_32B_sum*32+(TCC_EA_RDREQ_sum-TCC_EA_RDREQ_32B_sum)*64)/1024 ></metric>
+  # WRITE_SIZE, kilobytes
+  # The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account.
+  <metric name="WRITE_SIZE" expr=((TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)*32+TCC_EA_WRREQ_64B_sum*64)/1024 ></metric>
 </gfx9>

 <global>
-  # GPU_BUSY, percentage
+  # GPUBusy, percentage
  # The percentage of time GPU was busy.
  <metric
    name="GPUBusy"
-    expr=100*GRBM_GUI_ACTIVE/GRBM_COUNT
    descr="The percentage of time GPU was busy."
+    expr=100*GRBM_GUI_ACTIVE/GRBM_COUNT
  ></metric>

  # Wavefronts      Total wavefronts.,
  <metric
    name="Wavefronts"
-    expr=SQ_WAVES
    descr="Total wavefronts."
+    expr=SQ_WAVES
  ></metric>

  # VALUInsts       The average number of vector ALU instructions executed per work-item (affected by flow control).
  <metric
    name="VALUInsts"
-    expr=SQ_INSTS_VALU/SQ_WAVES
    descr="The average number of vector ALU instructions executed per work-item (affected by flow control)."
+    expr=SQ_INSTS_VALU/SQ_WAVES
  ></metric>

  # SALUInsts       The average number of scalar ALU instructions executed per work-item (affected by flow control).
  <metric
    name="SALUInsts"
-    expr=SQ_INSTS_SALU/SQ_WAVES
    descr="The average number of scalar ALU instructions executed per work-item (affected by flow control)."
+    expr=SQ_INSTS_SALU/SQ_WAVES
  ></metric>
  
  # VFetchInsts     The average number of vector fetch instructions from the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that fetch from video memory.
  <metric
    name="VFetchInsts"
-    expr=(SQ_INSTS_VMEM_RD-TA_FLAT_READ_WAVEFRONTS_sum)/SQ_WAVES
    descr="The average number of vector fetch instructions from the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that fetch from video memory."
+    expr=(SQ_INSTS_VMEM_RD-TA_FLAT_READ_WAVEFRONTS_sum)/SQ_WAVES
  ></metric>

  # SFetchInsts     The average number of scalar fetch instructions from the video memory executed per work-item (affected by flow control).
  <metric
    name="SFetchInsts"
-    expr=SQ_INSTS_SMEM/SQ_WAVES
    descr="The average number of scalar fetch instructions from the video memory executed per work-item (affected by flow control)."
+    expr=SQ_INSTS_SMEM/SQ_WAVES
  ></metric>

  # VWriteInsts     The average number of vector write instructions to the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that write to video memory.
  <metric
    name="VWriteInsts"
-    expr=(SQ_INSTS_VMEM_WR-TA_FLAT_WRITE_WAVEFRONTS_sum)/SQ_WAVES
    descr="The average number of vector write instructions to the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that write to video memory."
+    expr=(SQ_INSTS_VMEM_WR-TA_FLAT_WRITE_WAVEFRONTS_sum)/SQ_WAVES
  ></metric>

  # FlatVMemInsts   The average number of FLAT instructions that read from or write to the video memory executed per work item (affected by flow control). Includes FLAT instructions that read from or write to scratch.
  <metric
    name="FlatVMemInsts"
-    expr=(SQ_INSTS_FLAT-SQ_INSTS_FLAT_LDS_ONLY)/SQ_WAVES
    descr="The average number of FLAT instructions that read from or write to the video memory executed per work item (affected by flow control). Includes FLAT instructions that read from or write to scratch."
+    expr=(SQ_INSTS_FLAT-SQ_INSTS_FLAT_LDS_ONLY)/SQ_WAVES
  ></metric>

  # LDSInsts        The average number of LDS read or LDS write instructions executed per work item (affected by flow control).  Excludes FLAT instructions that read from or write to LDS.
  <metric
    name="LDSInsts"
-    expr=(SQ_INSTS_LDS-SQ_INSTS_FLAT_LDS_ONLY)/SQ_WAVES
    descr="The average number of LDS read or LDS write instructions executed per work item (affected by flow control).  Excludes FLAT instructions that read from or write to LDS."
+    expr=(SQ_INSTS_LDS-SQ_INSTS_FLAT_LDS_ONLY)/SQ_WAVES
  ></metric>

  # FlatLDSInsts    The average number of FLAT instructions that read or write to LDS executed per work item (affected by flow control).
  <metric
    name="FlatLDSInsts"
-    expr=SQ_INSTS_FLAT_LDS_ONLY/SQ_WAVES
    descr="The average number of FLAT instructions that read or write to LDS executed per work item (affected by flow control)."
+    expr=SQ_INSTS_FLAT_LDS_ONLY/SQ_WAVES
  ></metric>

  # GDSInsts        The average number of GDS read or GDS write instructions executed per work item (affected by flow control).
  <metric
    name="GDSInsts"
-    expr=SQ_INSTS_GDS/SQ_WAVES
    descr="The average number of GDS read or GDS write instructions executed per work item (affected by flow control)."
+    expr=SQ_INSTS_GDS/SQ_WAVES
  ></metric>

  # VALUUtilization The percentage of active vector ALU threads in a wave. A lower number can mean either more thread divergence in a wave or that the work-group size is not a multiple of 64. Value range: 0% (bad), 100% (ideal - no thread divergence).
  <metric
    name="VALUUtilization"
-    expr=100*SQ_THREAD_CYCLES_VALU/(SQ_ACTIVE_INST_VALU*64)
    descr="The percentage of active vector ALU threads in a wave. A lower number can mean either more thread divergence in a wave or that the work-group size is not a multiple of 64. Value range: 0% (bad), 100% (ideal - no thread divergence)."
+    expr=100*SQ_THREAD_CYCLES_VALU/(SQ_ACTIVE_INST_VALU*MAX_WAVE_SIZE)
  ></metric>

  # VALUBusy        The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal).
  <metric
    name="VALUBusy"
-    expr=100*SQ_ACTIVE_INST_VALU*4/SIMD_NUM/GRBM_GUI_ACTIVE
    descr="The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal)."
+    expr=100*SQ_ACTIVE_INST_VALU*4/SIMD_NUM/GRBM_GUI_ACTIVE
  ></metric>

  # SALUBusy        The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad) to 100% (optimal).
  <metric
    name="SALUBusy"
-    expr=100*SQ_INST_CYCLES_SALU*4/SIMD_NUM/GRBM_GUI_ACTIVE
    descr="The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad) to 100% (optimal)."
+    expr=100*SQ_INST_CYCLES_SALU*4/SIMD_NUM/GRBM_GUI_ACTIVE
  ></metric>

  # FetchSize       The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account.
  <metric
    name="FetchSize"
-    expr=FETCH_SIZE
    descr="The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."
+    expr=FETCH_SIZE
  ></metric>

  # WriteSize       The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account.
  <metric
    name="WriteSize"
-    expr=((sum(TCC_EA_WRREQ,16)-sum(TCC_EA_WRREQ_64B,16))*32+sum(TCC_EA_WRREQ_64B,16)*64)/1024
    descr="The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."
+    expr=WRITE_SIZE
  ></metric>

  # L2CacheHit      The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal).
  <metric
    name="L2CacheHit"
-    expr=100*sum(TCC_HIT,16)/(sum(TCC_HIT,16)+sum(TCC_MISS,16))
    descr="The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal)."
+    expr=100*sum(TCC_HIT,16)/(sum(TCC_HIT,16)+sum(TCC_MISS,16))
  ></metric>

  # MemUnitBusy     The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled). This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: 0% to 100% (fetch-bound).
  <metric
    name="MemUnitBusy"
-    expr=100*max(TA_BUSY,16)/GRBM_GUI_ACTIVE/SE_NUM
    descr="The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled). This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: 0% to 100% (fetch-bound)."
+    expr=100*max(TA_TA_BUSY,16)/GRBM_GUI_ACTIVE/SE_NUM
  ></metric>

  # MemUnitStalled  The percentage of GPUTime the memory unit is stalled. Try reducing the number or size of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad).
  <metric
    name="MemUnitStalled"
-    expr=100*max(TCP_TA_DATA_STALL_CYCLES,16)/GRBM_GUI_ACTIVE/SE_NUM
    descr="The percentage of GPUTime the memory unit is stalled. Try reducing the number or size of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad)."
+    expr=100*max(TCP_TA_DATA_STALL_CYCLES,16)/GRBM_GUI_ACTIVE/SE_NUM
  ></metric>

  # WriteUnitStalled The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad).
  <metric
    name="WriteUnitStalled"
-    expr=100*max(TCC_EA_WRREQ_STALL,16)/GRBM_GUI_ACTIVE
    descr="The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad)."
+    expr=100*TCC_WRREQ_STALL_max/GRBM_GUI_ACTIVE
  ></metric>

  # The percentage of GPUTime ALU units are stalled by the LDS input queue being full or the output queue being not ready. If there are LDS bank conflicts, reduce them. Otherwise, try reducing the number of LDS accesses if possible. Value range: 0% (optimal) to 100% (bad).
  <metric
    name="ALUStalledByLDS"
-    expr=100*SQ_WAIT_INST_LDS*4/SQ_WAVES/GRBM_GUI_ACTIVE
    descr="The percentage of GPUTime ALU units are stalled by the LDS input queue being full or the output queue being not ready. If there are LDS bank conflicts, reduce them. Otherwise, try reducing the number of LDS accesses if possible. Value range: 0% (optimal) to 100% (bad)."
+    expr=100*SQ_WAIT_INST_LDS*4/SQ_WAVES/GRBM_GUI_ACTIVE
  ></metric>

  # LDSBankConflict The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad).
  <metric
    name="LDSBankConflict"
-    expr=100*SQ_LDS_BANK_CONFLICT/GRBM_GUI_ACTIVE/CU_NUM
    descr="The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad)."
+    expr=100*SQ_LDS_BANK_CONFLICT/GRBM_GUI_ACTIVE/CU_NUM
  ></metric>

 </global>
@@ -25,6 +25,7 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "util/hsa_rsrc_factory.h"

 #include <dlfcn.h>
+#include <fcntl.h>
 #include <hsa.h>
 #include <hsa_ext_amd.h>
 #include <hsa_ext_finalize.h>
@@ -32,6 +33,8 @@ POSSIBILITY OF SUCH DAMAGE.
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/types.h>

 #include <atomic>
 #include <cassert>
@@ -77,12 +80,13 @@ hsa_status_t HsaRsrcFactory::FindMemRegionsCallback(hsa_region_t region, void* d
 }

 // Constructor of the class
-HsaRsrcFactory::HsaRsrcFactory() {
+HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize_hsa) {
+  hsa_status_t status;
  // Initialize the Hsa Runtime
-  printf("HSA init\n");
-  hsa_status_t status = hsa_init();
-  CHECK_STATUS("Error in hsa_init", status);
-
+  if (initialize_hsa_) {
+    status = hsa_init();
+    CHECK_STATUS("Error in hsa_init", status);
+  }
  // Discover the set of Gpu devices available on the platform
  status = hsa_iterate_agents(GetHsaAgentsCallback, this);
  CHECK_STATUS("Error Calling hsa_iterate_agents", status);
@@ -106,10 +110,10 @@ HsaRsrcFactory::HsaRsrcFactory() {
 HsaRsrcFactory::~HsaRsrcFactory() {
  for (auto p : cpu_list_) delete p;
  for (auto p : gpu_list_) delete p;
-
-  printf("HSA shutdown\n");
-  hsa_status_t status = hsa_shut_down();
-  CHECK_STATUS("Error in hsa_shut_down", status);
+  if (initialize_hsa_) {
+    hsa_status_t status = hsa_shut_down();
+    CHECK_STATUS("Error in hsa_shut_down", status);
+  }
 }

 hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) {
@@ -365,67 +369,53 @@ bool HsaRsrcFactory::TransferData(void* dest_buff, void* src_buff, uint32_t leng
 //
 // @return bool true if successful, false otherwise
 //
-void* HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path,
-                                      const char* kernel_name, hsa_executable_t* hsa_exec, hsa_executable_symbol_t* code_desc) {
-  // Finalize the Hsail object into code object
-  hsa_status_t status;
-  hsa_code_object_t code_object;
+bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path,
+                                      const char* kernel_name, hsa_executable_t* executable, hsa_executable_symbol_t* code_desc) {
+  hsa_status_t status = HSA_STATUS_ERROR;

  // Build the code object filename
  std::string filename(brig_path);
  std::clog << "Code object filename: " << filename << std::endl;

  // Open the file containing code object
-  std::ifstream codeStream(filename.c_str(), std::ios::binary | std::ios::ate);
-  if (!codeStream) {
-    std::cerr << "Error: failed to load " << filename << std::endl;
+  hsa_file_t file_handle = open(filename.c_str(), O_RDONLY);
+  if (file_handle == -1) {
+    std::cerr << "Error: failed to load '" << filename << "'" << std::endl;
    assert(false);
-    return NULL;
+    return false;
  }

-  // Allocate memory to read in code object from file
-  size_t size = std::string::size_type(codeStream.tellg());
-  char* code_buf = (char*)AllocateSysMemory(agent_info, size);
-  if (!code_buf) {
-    std::cerr << "Error: failed to allocate memory for code object." << std::endl;
-    assert(false);
-    return NULL;
-  }
-
-  // Read the code object into allocated memory
-  codeStream.seekg(0, std::ios::beg);
-  std::copy(std::istreambuf_iterator<char>(codeStream), std::istreambuf_iterator<char>(), code_buf);
-
-  // De-Serialize the code object that has been read into memory
-  status = hsa_code_object_deserialize(code_buf, size, NULL, &code_object);
+  // Create code object reader; NOTE(review): file_handle is never closed here - confirm lifetime
+  hsa_code_object_reader_t code_obj_rdr = {0};
+  status = hsa_code_object_reader_create_from_file(file_handle, &code_obj_rdr);
  if (status != HSA_STATUS_SUCCESS) {
-    std::cerr << "Failed to deserialize code object" << std::endl;
-    if (code_buf) hsa_memory_free(code_buf);
-    return NULL;
+    std::cerr << "Failed to create code object reader '" << filename << "'" << std::endl;
+    return false;
  }

  // Create executable.
-  status =
-      hsa_executable_create(HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN, "", hsa_exec);
+  status = hsa_executable_create_alt(HSA_PROFILE_FULL,
+    HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, executable);
  CHECK_STATUS("Error in creating executable object", status);

  // Load code object.
-  status = hsa_executable_load_code_object(*hsa_exec, agent_info->dev_id, code_object, "");
+  status = hsa_executable_load_agent_code_object(*executable, agent_info->dev_id,
+    code_obj_rdr, NULL, NULL);
  CHECK_STATUS("Error in loading executable object", status);

  // Freeze executable.
-  status = hsa_executable_freeze(*hsa_exec, "");
+  status = hsa_executable_freeze(*executable, "");
  CHECK_STATUS("Error in freezing executable object", status);

  // Get symbol handle.
  hsa_executable_symbol_t kernelSymbol;
-  status = hsa_executable_get_symbol(*hsa_exec, NULL, kernel_name, agent_info->dev_id, 0,
+  status = hsa_executable_get_symbol(*executable, NULL, kernel_name, agent_info->dev_id, 0,
                                     &kernelSymbol);
  CHECK_STATUS("Error in looking up kernel symbol", status);

  // Update output parameter
  *code_desc = kernelSymbol;
-  return code_buf;
+  return true;
 }

 // Print the various fields of Hsa Gpu Agents
@@ -112,20 +112,18 @@ class HsaRsrcFactory {
 public:
  typedef std::recursive_mutex mutex_t;

-  static HsaRsrcFactory* Create() { return NULL; }
-
-  static HsaRsrcFactory* CreateInstance() {
+  static HsaRsrcFactory* Create(bool initialize_hsa = true) {
    std::lock_guard<mutex_t> lck(mutex_);
    if (instance_ == NULL) {
-      instance_ = new HsaRsrcFactory();
+      instance_ = new HsaRsrcFactory(initialize_hsa);
    }
    return instance_;
  }

  static HsaRsrcFactory& Instance() {
-    CreateInstance();
+    if (instance_ == NULL) instance_ = Create(false);
    hsa_status_t status = (instance_ != NULL) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
-    CHECK_STATUS("HsaRsrcFactory::Instance() is not found", status);
+    CHECK_STATUS("HsaRsrcFactory::Instance() failed", status);
    return *instance_;
  }

@@ -229,9 +227,9 @@ class HsaRsrcFactory {
  // @param code_desc Handle of finalized Code Descriptor that could
  // be used to submit for execution
  //
-  // @return code buffer, non NULL if successful, NULL otherwise
+  // @return true if successful, false otherwise
  //
-  void* LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, const char* kernel_name,
+  bool LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, const char* kernel_name,
                        hsa_executable_t* hsa_exec, hsa_executable_symbol_t* code_desc);

  // Print the various fields of Hsa Gpu Agents
@@ -259,11 +257,14 @@ class HsaRsrcFactory {

  // Constructor of the class. Will initialize the Hsa Runtime and
  // query the system topology to get the list of Cpu and Gpu devices
-  HsaRsrcFactory();
+  HsaRsrcFactory(bool initialize_hsa);

  // Destructor of the class
  ~HsaRsrcFactory();

+  // True if this object called hsa_init() itself and so must call hsa_shut_down()
+  const bool initialize_hsa_;
+
  // Add an instance of AgentInfo representing a Hsa Gpu agent
  const AgentInfo* AddAgentInfo(const hsa_agent_t agent);