From a12c5628ea8690f85b75e03e93eabac4e532af09 Mon Sep 17 00:00:00 2001
From: Chris Freehill <Chris.Freehill@amd.com>
Date: Wed, 28 Jun 2017 10:54:57 -0500
Subject: [PATCH] Added dispatch time, async copy and test template rocrtst
 tests

Change-Id: I57a844ee65c36bd61616ee6d60d358303f51db56
---
 rocrtst/common/base_rocr.cc                   |   5 +-
 rocrtst/common/base_rocr.h                    |  17 -
 rocrtst/common/base_rocr_utils.cc             | 122 ++-
 rocrtst/common/base_rocr_utils.h              |  38 +-
 rocrtst/common/common.cc                      |  39 -
 rocrtst/common/common.h                       |  30 -
 rocrtst/common/helper_funcs.cc                |  16 +-
 rocrtst/common/helper_funcs.h                 |   4 +-
 rocrtst/common/hsa_perf_cntrs.cc              |   4 +
 rocrtst/common/hsatimer.cc                    |   3 +-
 rocrtst/common/hsatimer.h                     |   1 +
 rocrtst/samples/CMakeLists.txt                |   5 +
 rocrtst/suites/performance/CMakeLists.txt     | 108 ++-
 rocrtst/suites/performance/cp_process_time.cc | 258 -----
 rocrtst/suites/performance/cp_process_time.h  |  91 --
 rocrtst/suites/performance/cu_masking.cc      | 220 -----
 rocrtst/suites/performance/cu_masking.h       | 103 --
 .../performance/device_load_bandwidth.cc      | 293 ------
 .../performance/device_store_bandwidth.cc     | 219 -----
 .../performance/device_store_bandwidth.h      | 119 ---
 rocrtst/suites/performance/dispatch_time.cc   | 247 ++---
 rocrtst/suites/performance/dispatch_time.h    |  87 +-
 rocrtst/suites/performance/flush_latency.cc   | 351 -------
 rocrtst/suites/performance/flush_latency.h    | 122 ---
 rocrtst/suites/performance/hsa_info.cc        | 502 ----------
 rocrtst/suites/performance/image_bandwidth.cc | 328 -------
 rocrtst/suites/performance/image_bandwidth.h  |  99 --
 .../performance/image_load_bandwidth.cc       | 270 ------
 .../performance/image_store_bandwidth.cc      | 271 ------
 .../performance/kernels/cu_masking.brig       | Bin 1200 -> 0 bytes
 .../dispatch_time_kernels.cl}                 |  45 +-
 .../performance/kernels/empty_kernel.hsail    |  12 -
 .../performance/kernels/flush_latency.hsail   |  88 --
 .../kernels/flush_latency_base.hsail          |  88 --
 .../performance/kernels/load_2d_image.hsail   | 109 ---
 .../performance/kernels/simple_kernel.hsail   |  37 -
 .../kernels/simple_kernel_base.hsail          |  28 -
 .../performance/kernels/store_2d_image.hsail  | 105 --
 .../performance/kernels/sysMemRead.hsail      | 237 -----
 .../performance/kernels/sysMemRead_base.hsail | 237 -----
 .../performance/kernels/sysMemWrite.hsail     | 105 --
 .../kernels/sysMemWrite_base.hsail            | 105 --
 .../test_case_template_kernels.cl}            |  46 +-
 .../performance/kernels/test_kernel.hsail     |  53 -
 .../kernels/transpose_kernel.hsail            | 108 ---
 .../performance/kernels/vector_copy.hsail     |  34 -
 .../kernels/vector_copy_base.hsail            |  64 --
 .../kernels/vector_copy_full.hsail            |  64 --
 rocrtst/suites/performance/main.cc            | 267 ++---
 .../suites/performance/matrix_transpose.cc    | 289 ------
 rocrtst/suites/performance/matrix_transpose.h | 101 --
 .../suites/performance/memory_allocation.cc   | 198 ----
 .../suites/performance/memory_allocation.h    |  98 --
 .../suites/performance/memory_async_copy.cc   | 912 ++++++++----------
 .../suites/performance/memory_async_copy.h    | 257 +++--
 rocrtst/suites/performance/memory_copy.cc     | 411 --------
 rocrtst/suites/performance/memory_copy.h      | 109 ---
 .../suites/performance/queue_concurrency.cc   | 284 ------
 .../queue_create_destroy_latency.cc           | 271 ------
 .../queue_create_destroy_latency.h            |  95 --
 .../performance/system_load_bandwidth.cc      | 281 ------
 .../performance/system_load_bandwidth.h       | 119 ---
 .../performance/system_store_bandwidth.cc     | 243 -----
 .../performance/system_store_bandwidth.h      | 121 ---
 .../suites/performance/test_case_template.cc  | 395 ++++++++
 ..._load_bandwidth.h => test_case_template.h} |  39 +-
 rocrtst/suites/performance/vector_copy.cc     | 279 ------
 rocrtst/suites/performance/vector_copy.h      | 109 ---
 .../performance/vector_copy_peer_to_peer.h    | 106 --
 rocrtst/suites/test_common/test_base.cc       | 141 +++
 .../test_base.h}                              |  63 +-
 .../test_common.cc}                           | 132 +--
 .../perf_base.h => test_common/test_common.h} |  25 +-
 73 files changed, 1592 insertions(+), 9290 deletions(-)
 mode change 100644 => 100755 rocrtst/common/base_rocr.h
 delete mode 100755 rocrtst/suites/performance/cp_process_time.cc
 delete mode 100755 rocrtst/suites/performance/cp_process_time.h
 delete mode 100644 rocrtst/suites/performance/cu_masking.cc
 delete mode 100755 rocrtst/suites/performance/cu_masking.h
 delete mode 100755 rocrtst/suites/performance/device_load_bandwidth.cc
 delete mode 100755 rocrtst/suites/performance/device_store_bandwidth.cc
 delete mode 100755 rocrtst/suites/performance/device_store_bandwidth.h
 delete mode 100755 rocrtst/suites/performance/flush_latency.cc
 delete mode 100755 rocrtst/suites/performance/flush_latency.h
 delete mode 100755 rocrtst/suites/performance/hsa_info.cc
 delete mode 100755 rocrtst/suites/performance/image_bandwidth.cc
 delete mode 100755 rocrtst/suites/performance/image_bandwidth.h
 delete mode 100755 rocrtst/suites/performance/image_load_bandwidth.cc
 delete mode 100755 rocrtst/suites/performance/image_store_bandwidth.cc
 delete mode 100644 rocrtst/suites/performance/kernels/cu_masking.brig
 rename rocrtst/suites/performance/{hsa_info.h => kernels/dispatch_time_kernels.cl} (72%)
 delete mode 100755 rocrtst/suites/performance/kernels/empty_kernel.hsail
 delete mode 100755 rocrtst/suites/performance/kernels/flush_latency.hsail
 delete mode 100755 rocrtst/suites/performance/kernels/flush_latency_base.hsail
 delete mode 100755 rocrtst/suites/performance/kernels/load_2d_image.hsail
 delete mode 100755 rocrtst/suites/performance/kernels/simple_kernel.hsail
 delete mode 100755 rocrtst/suites/performance/kernels/simple_kernel_base.hsail
 delete mode 100755 rocrtst/suites/performance/kernels/store_2d_image.hsail
 delete mode 100755 rocrtst/suites/performance/kernels/sysMemRead.hsail
 delete mode 100755 rocrtst/suites/performance/kernels/sysMemRead_base.hsail
 delete mode 100755 rocrtst/suites/performance/kernels/sysMemWrite.hsail
 delete mode 100755 rocrtst/suites/performance/kernels/sysMemWrite_base.hsail
 rename rocrtst/suites/performance/{image_store_bandwidth.h => kernels/test_case_template_kernels.cl} (72%)
 delete mode 100755 rocrtst/suites/performance/kernels/test_kernel.hsail
 delete mode 100755 rocrtst/suites/performance/kernels/transpose_kernel.hsail
 delete mode 100755 rocrtst/suites/performance/kernels/vector_copy.hsail
 delete mode 100755 rocrtst/suites/performance/kernels/vector_copy_base.hsail
 delete mode 100755 rocrtst/suites/performance/kernels/vector_copy_full.hsail
 mode change 100644 => 100755 rocrtst/suites/performance/main.cc
 delete mode 100755 rocrtst/suites/performance/matrix_transpose.cc
 delete mode 100755 rocrtst/suites/performance/matrix_transpose.h
 delete mode 100755 rocrtst/suites/performance/memory_allocation.cc
 delete mode 100755 rocrtst/suites/performance/memory_allocation.h
 mode change 100644 => 100755 rocrtst/suites/performance/memory_async_copy.cc
 delete mode 100755 rocrtst/suites/performance/memory_copy.cc
 delete mode 100644 rocrtst/suites/performance/memory_copy.h
 delete mode 100755 rocrtst/suites/performance/queue_concurrency.cc
 delete mode 100755 rocrtst/suites/performance/queue_create_destroy_latency.cc
 delete mode 100755 rocrtst/suites/performance/queue_create_destroy_latency.h
 delete mode 100755 rocrtst/suites/performance/system_load_bandwidth.cc
 delete mode 100755 rocrtst/suites/performance/system_load_bandwidth.h
 delete mode 100755 rocrtst/suites/performance/system_store_bandwidth.cc
 delete mode 100755 rocrtst/suites/performance/system_store_bandwidth.h
 create mode 100755 rocrtst/suites/performance/test_case_template.cc
 rename rocrtst/suites/performance/{image_load_bandwidth.h => test_case_template.h} (76%)
 delete mode 100644 rocrtst/suites/performance/vector_copy.cc
 delete mode 100755 rocrtst/suites/performance/vector_copy.h
 delete mode 100755 rocrtst/suites/performance/vector_copy_peer_to_peer.h
 create mode 100755 rocrtst/suites/test_common/test_base.cc
 rename rocrtst/suites/{performance/queue_concurrency.h => test_common/test_base.h} (70%)
 rename rocrtst/suites/{performance/device_load_bandwidth.h => test_common/test_common.cc} (55%)
 rename rocrtst/suites/{performance/perf_common/perf_base.h => test_common/test_common.h} (78%)

diff --git a/rocrtst/common/base_rocr.cc b/rocrtst/common/base_rocr.cc
index a7aa71649a..4c95e4cf85 100755
--- a/rocrtst/common/base_rocr.cc
+++ b/rocrtst/common/base_rocr.cc
@@ -50,11 +50,9 @@
 namespace rocrtst {
 
 BaseRocR::BaseRocR(void) {
-  num_iteration_ = 100;
-  signal_.handle = 0;
+  num_iteration_ = 1;
   cpu_device_.handle = -1;
   gpu_device1_.handle = -1;
-  region_.handle = 0;
   device_pool_.handle = 0;
   kern_arg_pool_.handle = 0;
   main_queue_ = nullptr;
@@ -66,6 +64,7 @@ BaseRocR::BaseRocR(void) {
   orig_hsa_enable_interrupt_ = GetEnv("HSA_ENABLE_INTERRUPT");
   set_kernel_file_name("");
   set_verbosity(0);
+  set_title("unset_title");
 }
 
 BaseRocR::~BaseRocR() {
diff --git a/rocrtst/common/base_rocr.h b/rocrtst/common/base_rocr.h
old mode 100644
new mode 100755
index f96b2a9e00..121c5318df
--- a/rocrtst/common/base_rocr.h
+++ b/rocrtst/common/base_rocr.h
@@ -105,13 +105,6 @@ class BaseRocR {
     return kernel_object_;
   }
 
-  void set_signal(hsa_signal_t sig) {
-    signal_.handle = sig.handle;
-  }
-  const hsa_signal_t& signal(void) const {
-    return signal_;
-  }
-
   void set_profile(hsa_profile_t in_prof) {
     profile_ = in_prof;
   }
@@ -151,10 +144,6 @@ class BaseRocR {
     return aql_;
   }
 
-  hsa_region_t& region(void) {
-    return region_;
-  }
-
   void set_num_iteration(int num) {
     num_iteration_ = num;
   }
@@ -237,16 +226,12 @@ class BaseRocR {
  private:
   uint64_t num_iteration_;   ///< Number of times to execute test
 
-  hsa_signal_t signal_;   ///< Completion signal used for kernel execution
-
   hsa_queue_t* main_queue_;   ///< AQL queue used for packets
 
   hsa_agent_t gpu_device1_;   ///< Handle to first GPU found
 
   hsa_agent_t cpu_device_;   ///< Handle to CPU
 
-  hsa_region_t region_;   ///< TODO(cfreehil): delete this
-
   hsa_amd_memory_pool_t device_pool_;   ///< Memory pool on gpu pool list
 
   hsa_amd_memory_pool_t cpu_pool_;   ///< Memory pool on cpu pool list
@@ -255,8 +240,6 @@ class BaseRocR {
 
   uint64_t kernel_object_;   ///< Handle to kernel code
 
-  std::string brig_file_;   // TODO(cfreehil): delete this
-
   std::string kernel_file_name_;   ///< Code object file name
 
   std::string kernel_name_;   ///< Kernel name
diff --git a/rocrtst/common/base_rocr_utils.cc b/rocrtst/common/base_rocr_utils.cc
index 05bc1a9c28..bba6391419 100755
--- a/rocrtst/common/base_rocr_utils.cc
+++ b/rocrtst/common/base_rocr_utils.cc
@@ -70,6 +70,8 @@ namespace rocrtst {
   } \
 }
 
+// Clean up some of the common handles and memory used by BaseRocR code, then
+// shut down hsa. Restore HSA_ENABLE_INTERRUPT to original value, if necessary
 hsa_status_t CommonCleanUp(BaseRocR* test) {
   hsa_status_t err;
 
@@ -87,13 +89,9 @@ hsa_status_t CommonCleanUp(BaseRocR* test) {
     test->set_main_queue(nullptr);
   }
 
-  if (0 != test->signal().handle) {
-    hsa_signal_t sig;
-    sig.handle = 0;
-
-    err = hsa_signal_destroy(test->signal());
+  if (test->aql().completion_signal.handle != 0) {
+    err = hsa_signal_destroy(test->aql().completion_signal);
     RET_IF_HSA_UTILS_ERR(err);
-    test->set_signal(sig);
   }
 
   err = hsa_shut_down();
@@ -122,7 +120,7 @@ static const char* PROFILE_STR[] = {"HSA_PROFILE_BASE", "HSA_PROFILE_FULL", };
 /// \returns bool
 ///          - true Machine meets test requirements
 ///          - false Machine does not meet test requirements
-static bool CheckProfileAndInform(BaseRocR* test) {
+bool CheckProfileAndInform(BaseRocR* test) {
   if (test->verbosity() > 0) {
     std::cout << "Target HW Profile is "
               << PROFILE_STR[test->profile()] << std::endl;
@@ -162,6 +160,10 @@ static hsa_status_t ProcessIterateError(hsa_status_t err) {
   return err;
 }
 
+// Find pools for cpu, gpu and for kernel arguments. These pools have
+// common basic requirements, but are not suitable for all cases. If they
+// are unsuitable, set cpu_pool(), device_pool() and/or kern_arg_pool()
+// yourself instead of using this function.
 hsa_status_t SetPoolsTypical(BaseRocR* test) {
   hsa_status_t err;
 
@@ -180,11 +182,9 @@ hsa_status_t SetPoolsTypical(BaseRocR* test) {
   return HSA_STATUS_SUCCESS;
 }
 
+// Enable interrupts if necessary, and call hsa_init()
 hsa_status_t InitAndSetupHSA(BaseRocR* test) {
-  hsa_agent_t gpu_device1;
-  hsa_agent_t cpu_device;
   hsa_status_t err;
-  hsa_signal_t sig;
 
   if (test->enable_interrupt()) {
     SetEnv("HSA_ENABLE_INTERRUPT", "1");
@@ -193,6 +193,15 @@ hsa_status_t InitAndSetupHSA(BaseRocR* test) {
   err = hsa_init();
   RET_IF_HSA_UTILS_ERR(err);
 
+  return HSA_STATUS_SUCCESS;
+}
+
+// Attempt to find and set test->cpu_device and test->gpu_device1
+hsa_status_t SetDefaultAgents(BaseRocR* test) {
+  hsa_agent_t gpu_device1;
+  hsa_agent_t cpu_device;
+  hsa_status_t err;
+
   gpu_device1.handle = 0;
   err = hsa_iterate_agents(FindGPUDevice, &gpu_device1);
   RET_IF_HSA_UTILS_ERR(rocrtst::ProcessIterateError(err));
@@ -217,7 +226,7 @@ hsa_status_t InitAndSetupHSA(BaseRocR* test) {
     char name[64] = {0};
     err = hsa_agent_get_info(gpu_device1, HSA_AGENT_INFO_NAME, name);
     RET_IF_HSA_UTILS_ERR(err);
-    std::cout << "The device name is " << name << std::endl;
+    std::cout << "The gpu device name is " << name << std::endl;
   }
 
   hsa_profile_t profile;
@@ -228,14 +237,11 @@ hsa_status_t InitAndSetupHSA(BaseRocR* test) {
   if (!CheckProfileAndInform(test)) {
     return HSA_STATUS_ERROR;
   }
-
-  err = hsa_signal_create(1, 0, NULL, &sig);
-  RET_IF_HSA_UTILS_ERR(err);
-  test->set_signal(sig);
-
   return HSA_STATUS_SUCCESS;
 }
 
+// See if the profile of the target matches the profile required by the
+// test program, if any.
 bool CheckProfile(BaseRocR const* test) {
   if (test->requires_profile() == -1) {
     return true;
@@ -243,6 +249,19 @@ bool CheckProfile(BaseRocR const* test) {
     return (test->requires_profile() == test->profile());
   }
 }
+// Load the specified kernel code from the specified file, inspect and fill
+// in BaseRocR member variables related to the kernel and executable.
+// Required Input BaseRocR member variables:
+// - gpu_device1()
+// - kernel_file_name()
+// - kernel_name()
+//
+// Written BaseRocR member variables:
+// - kernel_object()
+// - private_segment_size()
+// - group_segment_size()
+// - kernarg_size()
+// - kernarg_align()
 hsa_status_t LoadKernelFromObjFile(BaseRocR* test) {
   hsa_status_t err;
   hsa_code_object_reader_t code_obj_rdr = {0};
@@ -334,13 +353,16 @@ hsa_status_t CreateQueue(hsa_agent_t device, hsa_queue_t** queue,
 
   return HSA_STATUS_SUCCESS;
 }
-
-void InitializeAQLPacket(const BaseRocR* test,
+// Initialize the provided aql packet with standard default values, and
+// values from provided BaseRocR object.
+hsa_status_t InitializeAQLPacket(const BaseRocR* test,
                          hsa_kernel_dispatch_packet_t* aql) {
+  hsa_status_t err;
+
   assert(aql != nullptr);
 
   if (aql == nullptr) {
-    return;
+    return HSA_STATUS_ERROR;
   }
 
   aql->header = 0;   // Set this right before doorbell ring
@@ -361,19 +383,25 @@ void InitializeAQLPacket(const BaseRocR* test,
   // Pin kernel code and the kernel argument buffer to the aql packet->
   aql->kernel_object = test->kernel_object();
 
-  aql->kernarg_address = NULL;
-  aql->completion_signal.handle = test->signal().handle;
+  // aql->kernarg_address may be filled in by AllocAndSetKernArgs() if it is
+  // called before this function, so we don't want to overwrite it; therefore
+  // we ignore it in this function.
 
-  return;
+  err = hsa_signal_create(1, 0, NULL, &aql->completion_signal);
+
+  return err;
 }
 
-void WriteAQLToQueue(BaseRocR* test) {
+// Copy BaseRocR aql object values into the BaseRocR object's queue at a newly
+// reserved write index; that index is returned to the caller through ind
+hsa_kernel_dispatch_packet_t * WriteAQLToQueue(BaseRocR* test, uint64_t *ind) {
   assert(test);
   assert(test->main_queue());
 
   void *queue_base = test->main_queue()->base_address;
   const uint32_t queue_mask = test->main_queue()->size - 1;
   uint64_t que_idx = hsa_queue_add_write_index_relaxed(test->main_queue(), 1);
+  *ind = que_idx;
 
   hsa_kernel_dispatch_packet_t* staging_aql_packet = &test->aql();
   hsa_kernel_dispatch_packet_t* queue_aql_packet;
@@ -395,8 +423,12 @@ void WriteAQLToQueue(BaseRocR* test) {
   queue_aql_packet->kernel_object = staging_aql_packet->kernel_object;
   queue_aql_packet->kernarg_address = staging_aql_packet->kernarg_address;
   queue_aql_packet->completion_signal = staging_aql_packet->completion_signal;
+
+  return queue_aql_packet;
 }
 
+// Allocate a buffer in the kern_arg_pool for the kernel arguments and write
+// the arguments to the buffer
 hsa_status_t AllocAndSetKernArgs(BaseRocR* test, void* args, size_t arg_size) {
   void* kern_arg_buf = nullptr;
   hsa_status_t err;
@@ -421,56 +453,18 @@ hsa_status_t AllocAndSetKernArgs(BaseRocR* test, void* args, size_t arg_size) {
   assert(((uintptr_t)adj_kern_arg_buf + arg_size) <
                                         ((uintptr_t)kern_arg_buf + buf_size));
 
-  err = hsa_memory_copy_workaround_cpu(adj_kern_arg_buf, args, arg_size);
-  RET_IF_HSA_UTILS_ERR(err);
-
   hsa_agent_t ag_list[2] = {*test->gpu_device1(), *test->cpu_device()};
   err = hsa_amd_agents_allow_access(2, ag_list, NULL, kern_arg_buf);
   RET_IF_HSA_UTILS_ERR(err);
 
+  err = hsa_memory_copy(adj_kern_arg_buf, args, arg_size);
+  RET_IF_HSA_UTILS_ERR(err);
+
   test->aql().kernarg_address = adj_kern_arg_buf;
 
   return HSA_STATUS_SUCCESS;
 }
 
-hsa_status_t AllocAndAllowAccess(BaseRocR* test, size_t len,
-                                  hsa_amd_memory_pool_t pool, void**buffer) {
-  hsa_status_t err;
-
-  err = hsa_amd_memory_pool_allocate(pool, len, 0, buffer);
-  RET_IF_HSA_UTILS_ERR(err);
-
-  hsa_agent_t ag_list[2] = {*test->gpu_device1(), *test->cpu_device()};
-  err = hsa_amd_agents_allow_access(2, ag_list, NULL, *buffer);
-  RET_IF_HSA_UTILS_ERR(err);
-
-  return err;
-}
-
-hsa_status_t hsa_memory_fill_workaround_gen(void* ptr, uint32_t value,
-    size_t count, hsa_agent_t dst_ag, hsa_agent_t src_ag, BaseRocR* test) {
-
-  hsa_status_t err;
-
-  void *tmp_mem;
-
-  err = hsa_amd_memory_pool_allocate(test->cpu_pool(), count, 0, &tmp_mem);
-  RET_IF_HSA_UTILS_ERR(err);
-
-  hsa_agent_t ag_list[2] = {*test->gpu_device1(), *test->cpu_device()};
-  err = hsa_amd_agents_allow_access(2, ag_list, NULL, tmp_mem);
-  RET_IF_HSA_UTILS_ERR(err);
-
-  (void)memset(tmp_mem, value, count);
-
-  err = hsa_memory_copy_workaround_gen(ptr, tmp_mem, count, dst_ag, src_ag);
-  RET_IF_HSA_UTILS_ERR(err);
-
-  hsa_amd_memory_pool_free(tmp_mem);
-
-  return HSA_STATUS_SUCCESS;
-}
-
 #undef RET_IF_HSA_UTILS_ERR
 
 }  // namespace rocrtst
diff --git a/rocrtst/common/base_rocr_utils.h b/rocrtst/common/base_rocr_utils.h
index a1f0c73612..d083608314 100755
--- a/rocrtst/common/base_rocr_utils.h
+++ b/rocrtst/common/base_rocr_utils.h
@@ -60,14 +60,16 @@ namespace rocrtst {
 /// \param[in] test Test for which the kernel will be loaded.
 /// \returns HSA_STATUS_SUCCESS if no errors
 hsa_status_t LoadKernelFromObjFile(BaseRocR* test);
-/// Do initialization tasks for HSA test program. This includes calling
-/// hsa_init(), finding and setting the cpu and gpu agent member variables,
-/// creating the signal needed for queueing AQL packets and checking
-/// HW requirements.
+
+/// Do initialization tasks for HSA test program.
 /// \param[in] test Test to initialize
 /// \returns HSA_STATUS_SUCCESS if no errors
 hsa_status_t InitAndSetupHSA(BaseRocR* test);
 
+/// Find and set the cpu and gpu agent member variables. Also checks that
+/// gpu agent meets test requirements (e.g., FULL profile vs. BASE profile).
+hsa_status_t SetDefaultAgents(BaseRocR* test);
+
 /// For the provided device agent, create an AQL queue
 /// \param[in] device Device for which a queue is to be created
 /// \param[out] queue Address to which created queue pointer will be written
@@ -84,16 +86,16 @@ hsa_status_t CreateQueue(hsa_agent_t device, hsa_queue_t** queue,
 /// be drawn.
 /// \param[inout] aql Caller provided pointer to aql packet that will be
 /// populated
-/// \returns void
-void InitializeAQLPacket(const BaseRocR* test,
+/// \returns Appropriate hsa_status_t
+hsa_status_t InitializeAQLPacket(const BaseRocR* test,
                          hsa_kernel_dispatch_packet_t* aql);
 
 /// This function writes all of the aql packet fields to the queue besides
 /// "setup" and "header". This assumes all the aql fields have be set
 /// appropriately.
 /// \param[in] test Test containing the queue and aql packet to be written.
-/// \returns void
-void WriteAQLToQueue(BaseRocR* test);
+/// \returns Pointer to dispatch packet in queue that was written to
+hsa_kernel_dispatch_packet_t* WriteAQLToQueue(BaseRocR* test, uint64_t *ind);
 
 /// This function writes the first 32 bits of an aql packet to the provided
 /// aql packet. This function is meant to be called immediately before
@@ -139,6 +141,15 @@ bool CheckProfile(BaseRocR const* test);
 hsa_status_t AllocAndSetKernArgs(BaseRocR* test, void* args,
                                  size_t arg_size);
 
+/// Verify that the machine running the test has the required profile.
+/// This function will verify that the execution machine meets any specific
+/// test requirement for a profile (HSA_PROFILE_BASE or HSA_PROFILE_FULL).
+/// \param[in] test Test that provides profile requirements.
+/// \returns bool
+///          - true Machine meets test requirements
+///          - false Machine does not meet test requirements
+bool CheckProfileAndInform(BaseRocR* test);
+
 /// This function will set the cpu and gpu memory pools to the type used in
 /// many applications.
 /// \param[in] test Test that provides profile requirements.
@@ -146,17 +157,6 @@ hsa_status_t AllocAndSetKernArgs(BaseRocR* test, void* args,
 ///   error code otherwise.
 hsa_status_t SetPoolsTypical(BaseRocR* test);
 
-/// Allocate memory from a specified pool and grant both standard BaseRocR
-/// agents access
-/// \param[in] test Test having the agents to which access is granted
-/// \param[in] len Size of the memory buffer to allocate
-/// \pool[in] Pool from which to allocate memory
-/// \buffer[out] Address of pointer which will point to newly allocated memory
-///  upon return
-///  \returns HSA_STATUS_OK if no errors
-hsa_status_t AllocAndAllowAccess(BaseRocR* test, size_t len,
-                                  hsa_amd_memory_pool_t pool, void**buffer);
-
 /// Work-around for hsa_amd_memory_fill, which is currently broken.
 /// \param[in] ptr Pointer to start of memory location to be filled
 /// \param[in] value Value to write to each byte of input buffer
diff --git a/rocrtst/common/common.cc b/rocrtst/common/common.cc
index 0625e0c2b1..1ee4355e49 100755
--- a/rocrtst/common/common.cc
+++ b/rocrtst/common/common.cc
@@ -341,45 +341,6 @@ hsa_status_t DumpPointerInfo(void* ptr) {
   return HSA_STATUS_SUCCESS;
 }
 
-hsa_status_t hsa_memory_fill_workaround_cpu(void* ptr, uint32_t value,
-                                                            size_t count) {
-  (void)memset(ptr, value, count);
-
-  return HSA_STATUS_SUCCESS;
-}
-
-hsa_status_t hsa_memory_copy_workaround_cpu(void* dst, const void *src,
-                                                            size_t size) {
-  (void)memcpy(dst, src, size);
-
-  return HSA_STATUS_SUCCESS;
-}
-
-hsa_status_t hsa_memory_copy_workaround_gen(void* dst, const void *src,
-                       size_t size, hsa_agent_t dst_ag, hsa_agent_t src_ag) {
-  hsa_signal_t s;
-  hsa_status_t err;
-
-  err = hsa_signal_create(1, 0, NULL, &s);
-  RET_IF_HSA_COMMON_ERR(err);
-
-  err = hsa_amd_memory_async_copy(dst, dst_ag, src, src_ag, size, 0, NULL, s);
-  RET_IF_HSA_COMMON_ERR(err);
-
-  if (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_LT, 1,
-                                   UINT64_MAX, HSA_WAIT_STATE_BLOCKED) != 0) {
-    err = HSA_STATUS_ERROR;
-    std::cout << "Async copy signal error" << std::endl;
-
-    RET_IF_HSA_COMMON_ERR(err);
-  }
-
-  err = hsa_signal_destroy(s);
-
-  RET_IF_HSA_COMMON_ERR(err);
-
-  return err;
-}
 
 /*! \brief Writes to the buffer and increments the write pointer to the
  *         buffer. Also, ensures that the argument is written to an
diff --git a/rocrtst/common/common.h b/rocrtst/common/common.h
index f82aea202f..08a59fa736 100755
--- a/rocrtst/common/common.h
+++ b/rocrtst/common/common.h
@@ -140,35 +140,5 @@ hsa_status_t DumpMemoryPoolInfo(const hsa_amd_memory_pool_t pool,
 /// \returns HSA_STATUS_SUCCESS if there are no errors
 hsa_status_t DumpPointerInfo(void* ptr);
 
-/// This is a work-around for filling cpu-memory to be used until
-/// hsa_amd_memory_fill is fixed. Should only be used for cpu memory.
-/// \param[in] ptr Start address of memory to be filled.
-/// \param[in] value Value to fill buffer with
-/// \param[in] count Size of buffer to fill
-/// \returns HSA_STATUS_SUCCESS if there are no errors
-hsa_status_t hsa_memory_fill_workaround_cpu(void* ptr, uint32_t value,
-                                                            size_t count);
-
-/// This is a work-around for copying cpu-memory to be used until
-/// hsa_amd_memory_copy is fixed. Should only be used for cpu memory.
-/// \param[in] dst Destination address of memory to be copied
-/// \param[in] src Source address of memory to be copied
-/// \param[in] size Size of buffer to fill
-/// \returns HSA_STATUS_SUCCESS if there are no errors
-hsa_status_t hsa_memory_copy_workaround_cpu(void* dst, const void *src,
-                                                            size_t size);
-
-/// This is a work-around for copying memory to be used until
-/// hsa_amd_memory_copy is fixed. Should be used when gpu local memory is
-/// involved.
-/// \param[in] dst Destination address of memory to be copied
-/// \param[in] src Source address of memory to be copied
-/// \param[in] size Size of buffer to fill
-/// \param[in] dst_ag Destination agent handle
-/// \param[in] src_ag Source agent handle
-/// \returns HSA_STATUS_SUCCESS if there are no errors
-hsa_status_t hsa_memory_copy_workaround_gen(void* dst, const void *src,
-                       size_t size, hsa_agent_t dst_ag, hsa_agent_t src_ag);
-
 }  // namespace rocrtst
 #endif  // ROCRTST_COMMON_COMMON_H_
diff --git a/rocrtst/common/helper_funcs.cc b/rocrtst/common/helper_funcs.cc
index e0af455863..6e86f6bbc5 100755
--- a/rocrtst/common/helper_funcs.cc
+++ b/rocrtst/common/helper_funcs.cc
@@ -52,10 +52,10 @@
 #include <iostream>
 #include <string>
 #include <vector>
+#include <numeric>
 
 namespace rocrtst {
 
-
 template<typename T>
 void PrintArray(const std::string header, const T* data, const int width,
                 const int height) {
@@ -191,7 +191,7 @@ AlignUp(void* value, size_t alignment) {
                                                                    alignment));
 }
 
-double CalcMedian(std::vector<double> scores) {
+double CalcMedian(const std::vector<double> &scores) {
   double median;
   size_t size = scores.size();
 
@@ -204,15 +204,11 @@ double CalcMedian(std::vector<double> scores) {
   return median;
 }
 
-double CalcMean(std::vector<double> scores) {
-  double mean = 0;
-  size_t size = scores.size();
+double CalcMean(const std::vector<double> &scores) {
+  double mean;
 
-  for (size_t i = 0; i < size; ++i) {
-    mean += scores[i];
-  }
-
-  return mean / size;
+  mean = std::accumulate(scores.begin(), scores.end(), 0.0);
+  return mean/scores.size();
 }
 
 double CalcMean(const std::vector<double>& v1, const std::vector<double>& v2) {
diff --git a/rocrtst/common/helper_funcs.h b/rocrtst/common/helper_funcs.h
index 06008d77fb..5ed8114e95 100755
--- a/rocrtst/common/helper_funcs.h
+++ b/rocrtst/common/helper_funcs.h
@@ -60,7 +60,7 @@ bool Compare(const double* refData, const double* data,
              const int length, const double epsilon = 1e-6);
 
 /// Calculate the mean number of the vector
-double CalcMean(std::vector<double> scores);
+double CalcMean(const std::vector<double> &scores);
 
 /// Calculate the mean time of difference of the two vectors
 double CalcMean(const std::vector<double>& v1, const std::vector<double>& v2);
@@ -68,7 +68,7 @@ double CalcMean(const std::vector<double>& v1, const std::vector<double>& v2);
 /// Return the median value of a vector of doubles
 /// \param[in] scores Vector of doubles
 /// \returns double Median value of provided vector
-double CalcMedian(std::vector<double> scores);
+double CalcMedian(const std::vector<double> &scores);
 
 /// Calculate the standard deviation of the vector
 double CalcStdDeviation(std::vector<double> scores, int score_mean);
diff --git a/rocrtst/common/hsa_perf_cntrs.cc b/rocrtst/common/hsa_perf_cntrs.cc
index d35433de93..fc2a79c66b 100755
--- a/rocrtst/common/hsa_perf_cntrs.cc
+++ b/rocrtst/common/hsa_perf_cntrs.cc
@@ -70,6 +70,7 @@ PreDispatchCallback(const hsa_dispatch_callback_t* dispParam, void* usrArg) {
                         dispParam->aql_translation_handle, true);
   assert((status == HSA_STATUS_SUCCESS) &&
          "Error in beginning Perf Cntr Session");
+  (void)status;  // Avoid warning
 }
 
 static void
@@ -82,6 +83,7 @@ PostDispatchCallback(const hsa_dispatch_callback_t* dispParam, void* usrArg) {
                         dispParam->aql_translation_handle);
   assert((status == HSA_STATUS_SUCCESS) &&
          "Error in endning Perf Cntr Session");
+  (void)status;  // Avoid warning
 }
 
 /// Constructor of the class
@@ -192,6 +194,8 @@ void RocrPerfCntrApp::RegisterCallbacks(hsa_queue_t* queue) {
   status = hsa_ext_tools_set_callback_arguments(queue, &perfMgr_, &perfMgr_);
   assert((status == HSA_STATUS_SUCCESS) &&
          "Error in registering Pre & Post Dispatch Callback Params");
+
+  (void)status;  // Avoid warning
   return;
 }
 
diff --git a/rocrtst/common/hsatimer.cc b/rocrtst/common/hsatimer.cc
index 1e7eef2092..2610ecdd06 100755
--- a/rocrtst/common/hsatimer.cc
+++ b/rocrtst/common/hsatimer.cc
@@ -176,8 +176,7 @@ uint64_t PerfTimer::MeasureTSCFreqHz() {
 
   do {
     tscTicksEnd = __rdtscp(&unused);
-  }
-  while (tscTicksEnd - tscTicksBegin < 1000000000);
+  } while (tscTicksEnd - tscTicksBegin < 1000000000);
 
   uint64_t coarseEndUs = CoarseTimestampUs();
 
diff --git a/rocrtst/common/hsatimer.h b/rocrtst/common/hsatimer.h
index 72b7ba190f..8d12b768eb 100755
--- a/rocrtst/common/hsatimer.h
+++ b/rocrtst/common/hsatimer.h
@@ -91,6 +91,7 @@ class PerfTimer {
   void ResetTimer(int index);
 
   /// Read the time value of the timer associated with the provided index.
+  /// Units are seconds
   /// \param[in] index Index of the timer to read
   /// \returns double Value of the timer
   double ReadTimer(int index);
diff --git a/rocrtst/samples/CMakeLists.txt b/rocrtst/samples/CMakeLists.txt
index bd47600822..a118c68ad5 100755
--- a/rocrtst/samples/CMakeLists.txt
+++ b/rocrtst/samples/CMakeLists.txt
@@ -254,6 +254,11 @@ set(BITCODE_LIBS "${BITCODE_LIBS} ${BITCODE_PREF}/ocml.amdgcn.bc")
 set(CL_FILE_LIST "${PROJECT_SOURCE_DIR}/binary_search/binary_search_kernels.cl")
 process_sample("binary_search")
 
+# P2P Memory Access
+set(BITCODE_LIBS "${COMMON_BITCODE_LIBS}")
+set(CL_FILE_LIST "${PROJECT_SOURCE_DIR}/p2p_mem_access/p2p_mem_access_kernels.cl")
+process_sample("p2p_mem_access")
+
 # RocR Info
 aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/rocrinfo ROCR_INFO_SOURCES)
 add_executable(rocrinfo ${ROCR_INFO_SOURCES})
diff --git a/rocrtst/suites/performance/CMakeLists.txt b/rocrtst/suites/performance/CMakeLists.txt
index 21629ff95f..129ca7c519 100755
--- a/rocrtst/suites/performance/CMakeLists.txt
+++ b/rocrtst/suites/performance/CMakeLists.txt
@@ -25,10 +25,6 @@ cmake_minimum_required(VERSION 2.8.0)
 #   4) Set env. variable TARGET_DEVICE to indicate gpu type (e.g., gfx803,
 #      gfx900, ...)
 #
-#   5) Set env. variables AMDHSAFIN_DIR and and AMDHSAFIN_TARGET to the 
-#      directory containing the amd finalizer executable and version
-#      (e.g, 8:0:3) respectively.      
-#
 #   Building rocrtst Suite
 # 
 #   1) Create build folder e.g. "rocrtst/build" - any name will do
@@ -91,6 +87,32 @@ else()
   endif()
 endif()
 
+if (DEFINED ENV{OPENCL_DIR})
+  set(CLANG $ENV{OPENCL_DIR}/bin/x86_64/clang)
+  set(OPENCL_DIR $ENV{OPENCL_DIR})
+  if (NOT EXISTS ${CLANG})
+    message("ERROR: path to clang (${CLANG}) is not valid. Is env. variable OPENCL_DIR correct?")
+    return()
+  endif()
+
+  if (DEFINED ENV{OPENCL_VER})
+    set(OPENCL_VER $ENV{OPENCL_VER})
+  else()
+    message("OPENCL_VER environment variable is not set. Using default")
+    set(OPENCL_VER "2.0")
+  endif()
+else()
+    message("WARNING: OPENCL_DIR environment variable is not set. Kernels will not be built.")
+endif()
+
+if (DEFINED ENV{TARGET_DEVICE})
+  set(TARGET_DEVICE $ENV{TARGET_DEVICE})
+else()
+  message("ERROR: TARGET_DEVICE environment variable is not defined.")
+  message("Please define a valid clang target (e.g., gfx803, gfx900,...).")
+  return()
+endif() 
+
 #
 # Set Name for rocrtst Suite Project
 #
@@ -105,17 +127,22 @@ project (${ROCRTST_SUITE_NAME})
 #   Build Type: Debug Vs Release, 32 Vs 64
 #   Compiler Version, etc
 #
-MESSAGE("")
-MESSAGE("-------------IS64BIT: " ${IS64BIT})
-MESSAGE("-----------BuildType: " ${BUILD_TYPE})
-MESSAGE("------------Compiler: " ${CMAKE_CXX_COMPILER})
-MESSAGE("-------------Version: " ${CMAKE_CXX_COMPILER_VERSION})
-MESSAGE("--------Proj Src Dir: " ${PROJECT_SOURCE_DIR})
-MESSAGE("--------Proj Bld Dir: " ${PROJECT_BINARY_DIR})
-MESSAGE("--------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib)
-MESSAGE("--------Proj Exe Dir: " ${PROJECT_BINARY_DIR}/bin)
-MESSAGE("")
+message("")
+message("Build Configuration:")
+message("-------------IS64BIT: " ${IS64BIT})
+message("-----------BuildType: " ${BUILD_TYPE})
+message("------------Compiler: " ${CMAKE_CXX_COMPILER})
+message("-------------Version: " ${CMAKE_CXX_COMPILER_VERSION})
+message("--------Proj Src Dir: " ${PROJECT_SOURCE_DIR})
+message("--------Proj Bld Dir: " ${PROJECT_BINARY_DIR})
+message("--------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib)
+message("--------Proj Exe Dir: " ${PROJECT_BINARY_DIR}/bin)
+message("-------Target Device: " ${TARGET_DEVICE})
+message("----------Clang path: " ${CLANG})
+message("-------OpenCL version " ${OPENCL_VER})
+message("")
 
+set(KERNELS_DIR ${PROJECT_SOURCE_DIR}/kernels)
 #
 # Set the build type based on user input
 #
@@ -148,7 +175,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-threadsafe-statics")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fmerge-all-constants")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fms-extensions")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pedantic")
+# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pedantic")
 
 
 #
@@ -164,7 +191,7 @@ endif()
 # Add compiler flags to include symbol information for debug builds
 #
 if(ISDEBUG)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb -O0")
 endif()
 MESSAGE("ISDEBUG STEP:Done")
 
@@ -201,10 +228,11 @@ MESSAGE(${ROCRTST_LIBS})
 set(ROCRTST "rocrtst${ONLY64STR}")
 
 #
-# Sorce files for building rocrtst
+# Source files for building rocrtst
 #
 aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} performanceSources)
-
+aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/test_common testCommonSources)
+aux_source_directory(${ROCRTST_ROOT}/suites/test_common testCommonSources)
 
 # Header file include path
 
@@ -212,11 +240,51 @@ include_directories(${ROCR_INC_DIR})
 include_directories(${ROCRTST_ROOT})
 include_directories(${ROCRTST_ROOT}/gtest/include)
 
-# Build rules
+# Defines custom target "<S_NAME>_hsaco", which compiles the OpenCL sources in
+# CL_FILE_LIST (with the BITCODE_LIBS flags) to <S_NAME>_kernels.hsaco in the
+# build dir; the target name is appended to the caller's HSACO_TARG_LIST.
+function(build_kernel S_NAME)
+  set(SNAME_KERNEL "${S_NAME}_kernels.hsaco")
+  set(TARG_NAME "${S_NAME}_hsaco")
+  set(HSACO_TARG_LIST ${HSACO_TARG_LIST} ${TARG_NAME} PARENT_SCOPE)
+  separate_arguments(CLANG_ARG_LIST UNIX_COMMAND "-target amdgcn-amd-amdhsa -mcpu=${TARGET_DEVICE} -include ${OPENCL_DIR}/include/opencl-c.h ${BITCODE_LIBS} -cl-std=CL${OPENCL_VER} ${CL_FILE_LIST} -o ${PROJECT_BINARY_DIR}/${SNAME_KERNEL}")
+  add_custom_target(${TARG_NAME} ${CLANG} ${CLANG_ARG_LIST} COMMENT "BUILDING KERNEL..." VERBATIM)
+endfunction(build_kernel)
 
-add_executable(${ROCRTST} ${performanceSources} ${common_srcs})
+######################
+# Kernel Build Section
+######################
+set(KERN_SUFFIX "kernels.hsaco")
+set(BITCODE_PREF "-Xclang -mlink-bitcode-file -Xclang")
+set(BITCODE_PREF "${BITCODE_PREF} ${OPENCL_DIR}/lib/x86_64/bitcode")
+
+set(COMMON_BITCODE_LIBS "${BITCODE_PREF}/opencl.amdgcn.bc")
+set(COMMON_BITCODE_LIBS "${COMMON_BITCODE_LIBS} ${BITCODE_PREF}/ockl.amdgcn.bc")
+
+# To add a kernel, repeat the pattern used below for the Test Case Template
+# kernel: set BITCODE_LIBS to the bitcode libraries the kernel requires and
+# CL_FILE_LIST to its source file(s), then call build_kernel() for it.
+
+# Test Case Template example
+set(BITCODE_LIBS "${COMMON_BITCODE_LIBS}")
+set(CL_FILE_LIST "${KERNELS_DIR}/test_case_template_kernels.cl")
+build_kernel("test_case_template")
+
+# P2P Memory Access
+#set(BITCODE_LIBS "${COMMON_BITCODE_LIBS}")
+#set(CL_FILE_LIST "${KERNELS_DIR}/p2p_mem_access_kernels.cl")
+#build_kernel("p2p_mem_access")
+
+# Dispatch Time 
+set(BITCODE_LIBS "${COMMON_BITCODE_LIBS}")
+set(CL_FILE_LIST "${KERNELS_DIR}/dispatch_time_kernels.cl")
+build_kernel("dispatch_time")
+
+# Build rules
+add_executable(${ROCRTST} ${performanceSources} ${common_srcs} ${testCommonSources})
 target_link_libraries(${ROCRTST} ${ROCRTST_LIBS} c stdc++ dl pthread rt)
 
+add_custom_target(rocrtst_kernels DEPENDS ${HSACO_TARG_LIST})
 INSTALL(TARGETS ${ROCRTST}
         ARCHIVE DESTINATION ${PROJECT_BINARY_DIR}/lib
         LIBRARY DESTINATION ${PROJECT_BINARY_DIR}/lib
diff --git a/rocrtst/suites/performance/cp_process_time.cc b/rocrtst/suites/performance/cp_process_time.cc
deleted file mode 100755
index a393c617d1..0000000000
--- a/rocrtst/suites/performance/cp_process_time.cc
+++ /dev/null
@@ -1,258 +0,0 @@
-/*
- * =============================================================================
- *   ROC Runtime Conformance Release License
- * =============================================================================
- * The University of Illinois/NCSA
- * Open Source License (NCSA)
- *
- * Copyright (c) 2017, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Developed by:
- *
- *                 AMD Research and AMD ROC Software Development
- *
- *                 Advanced Micro Devices, Inc.
- *
- *                 www.amd.com
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal with the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- *  - Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimers.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimers in
- *    the documentation and/or other materials provided with the distribution.
- *  - Neither the names of <Name of Development Group, Name of Institution>,
- *    nor the names of its contributors may be used to endorse or promote
- *    products derived from this Software without specific prior written
- *    permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS WITH THE SOFTWARE.
- *
- */
-
-#include "cp_process_time.h"
-#include "common/base_rocr_utils.h"
-#include "common/common.h"
-#include "common/helper_funcs.h"
-#include "common/hsatimer.h"
-#include "common/os.h"
-#include "gtest/gtest.h"
-#include "hsa/hsa.h"
-#include "hsa/hsa_ext_amd.h"
-#include "hsa/hsa_ext_finalize.h"
-#include <algorithm>
-
-static const uint64_t kKernelIterations = 10000;
-static const uint64_t kTestBadValue = 1234567891234567891;
-//Set up some expectations for reasonable processing times
-//For gfx803, Overhead time had a max of 18.208uS and a min of 7.82uS
-static const double kGfx803MinOverhead = 7.78;
-static const double kGfx803MaxOverhead = 21.064;
-static const double kOverheadToleranceFactor = 0.25;
-
-CpProcessTime::CpProcessTime() :
-  BaseRocR() {
-  // kernel_name_ = "&__simple_kernel";
-  mean_ = 0.0;
-}
-
-CpProcessTime::~CpProcessTime() {
-}
-
-void CpProcessTime::SetUp() {
-  hsa_status_t err;
-  set_kernel_file_name("simple_kernel.o");
-  set_kernel_name("&__simple_kernel");
-
-  if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
-    return;
-  }
-  hsa_agent_t* gpu_dev = gpu_device1();
-
-  // Create a queue
-  hsa_queue_t* q = nullptr;
-  rocrtst::CreateQueue(*gpu_dev, &q);
-  ASSERT_NE(q, nullptr);
-  set_main_queue(q);
-
-  // Set profiling
-  err = hsa_amd_profiling_set_profiler_enabled(q, 1);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  // Load and finalize the kernel
-  err = rocrtst::LoadKernelFromObjFile(this);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  rocrtst::InitializeAQLPacket(this, &aql());
-  aql().workgroup_size_x = 1;
-  aql().grid_size_x = 1;
-}
-
-size_t CpProcessTime::RealIterationNum() {
-  return num_iteration() * 1.2 + 1;
-}
-
-void CpProcessTime::Run() {
-  hsa_status_t err;
-  std::vector<double> timer;
-
-  if (!rocrtst::CheckProfile(this)) {
-    return;
-  }
-
-  hsa_agent_t* gpu_dev = gpu_device1();
-  hsa_agent_t* cpu_dev = cpu_device();
-
-  ASSERT_NE(gpu_dev, nullptr);
-  ASSERT_NE(cpu_dev, nullptr);
-  uint32_t it = RealIterationNum();
-
-  typedef struct args_t {
-    uint64_t* iteration;
-    uint64_t* result;
-  } args;
-
-  err = rocrtst::SetPoolsTypical(this);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  uint64_t* iter = NULL;
-  uint64_t* result = NULL;
-  err = rocrtst::AllocAndAllowAccess(this, sizeof(uint64_t), cpu_pool(),
-                                                               (void**)&iter);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  err = rocrtst::AllocAndAllowAccess(this, sizeof(uint64_t), cpu_pool(),
-                                                             (void**)&result);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-  
-  *iter = kKernelIterations;
-  *result = kTestBadValue;
-
-  args  k_args;
-
-  k_args.iteration = (uint64_t*)iter;
-  k_args.result = (uint64_t*)result;
-
-  err = rocrtst::AllocAndSetKernArgs(this, &k_args, sizeof(args));
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  rocrtst::WriteAQLToQueue(this);
-
-  void * q_base_addr = main_queue()->base_address;
-  const uint32_t queue_mask = main_queue()->size - 1;
-  uint32_t aql_header = HSA_PACKET_TYPE_KERNEL_DISPATCH;
-//  aql_header |= HSA_FENCE_SCOPE_SYSTEM <<
-//                                    HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
-//  aql_header |= HSA_FENCE_SCOPE_SYSTEM <<
-//                                    HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
-
-  for (uint32_t i = 0; i < it; i++) {
-    // uint64_t que_idx = hsa_queue_load_write_index_relaxed(main_queue());
-    uint64_t que_idx = hsa_queue_add_write_index_relaxed(main_queue(), 1);
-
-    //Get timing stamp an ring the doorbell to dispatch the kernel.
-    rocrtst::PerfTimer p_timer;
-    int id = p_timer.CreateTimer();
-    p_timer.StartTimer(id);
-
-    rocrtst::AtomicSetPacketHeader(aql_header, aql().setup,
-             &((hsa_kernel_dispatch_packet_t*)(q_base_addr))[que_idx & queue_mask]);
-
-    hsa_queue_store_write_index_relaxed(main_queue(), (que_idx + 1));
-    hsa_signal_store_relaxed(main_queue()->doorbell_signal, que_idx);
-
-    while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
-                                     (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
-      ;
-//    hsa_signal_value_t value = hsa_signal_wait_scacquire(signal(),
-//                HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
-    // value should be 0, or we timed-out
-    //ASSERT_EQ(value, 0);
-
-    p_timer.StopTimer(id);
-
-    hsa_amd_profiling_dispatch_time_t dispatch_time;
-    err = hsa_amd_profiling_get_dispatch_time(*gpu_dev, signal(),
-          &dispatch_time);
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-    uint64_t ticks = dispatch_time.end - dispatch_time.start;
-    uint64_t freq;
-
-    err = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freq);
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-    hsa_signal_store_screlease(signal(), 1);
-
-    double execution_time = (double) ticks / freq * 1e6; //convert to us
-    double temp = p_timer.ReadTimer(id) * 1e6;
-    double cp_time = temp - execution_time;
-
-#ifdef DEBUG
-    std::cout << "Total:" << temp << "uS ";
-    std::cout << "Execution:" << execution_time << "uS ";
-    std::cout << "Overhead:" << cp_time << "uS ";
-    std::cout << "Overhead %:" << cp_time / execution_time * 100 << std::endl;
-#endif
-
-    EXPECT_EQ(kKernelIterations, *result);
-    timer.push_back(cp_time);
-
-    //Assume overhead will not deviate too much from previously recorded
-    // values. If this does happen and there is not a performance bug,
-    // modify these constants
-
-    //This may need to be made specific to the gpu being used
-    EXPECT_GT(cp_time, kGfx803MinOverhead * (1 - kOverheadToleranceFactor));
-    EXPECT_LT(cp_time, kGfx803MaxOverhead * (1 + kOverheadToleranceFactor));
-
-    *result = 0;
-  }
-
-  //Abandon the first result and after sort, delete the last 2% value
-  timer.erase(timer.begin());
-  std::sort(timer.begin(), timer.end());
-
-  timer.erase(timer.begin() + num_iteration(), timer.end());
-  mean_ = rocrtst::CalcMean(timer);
-
-  return;
-}
-
-void CpProcessTime::DisplayResults() const {
-
-  if (!rocrtst::CheckProfile(this)) {
-    return;
-  }
-
-  if (mean_ == 0.0) {
-    return;
-  }
-
-  std::cout << "===================================================="
-            << std::endl;
-  std::cout << "The average Command Processor processing time is:  " << mean_
-            << "us" << std::endl;
-  std::cout << "===================================================="
-            << std::endl;
-  return;
-}
-
-void CpProcessTime::Close() {
-  hsa_status_t err;
-  err = rocrtst::CommonCleanUp(this);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-}
diff --git a/rocrtst/suites/performance/cp_process_time.h b/rocrtst/suites/performance/cp_process_time.h
deleted file mode 100755
index 6abec7d9bb..0000000000
--- a/rocrtst/suites/performance/cp_process_time.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * =============================================================================
- *   ROC Runtime Conformance Release License
- * =============================================================================
- * The University of Illinois/NCSA
- * Open Source License (NCSA)
- *
- * Copyright (c) 2017, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Developed by:
- *
- *                 AMD Research and AMD ROC Software Development
- *
- *                 Advanced Micro Devices, Inc.
- *
- *                 www.amd.com
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal with the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- *  - Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimers.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimers in
- *    the documentation and/or other materials provided with the distribution.
- *  - Neither the names of <Name of Development Group, Name of Institution>,
- *    nor the names of its contributors may be used to endorse or promote
- *    products derived from this Software without specific prior written
- *    permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS WITH THE SOFTWARE.
- *
- */
-
-#ifndef __ROCRTST_SRC_CP_PROCESS_TIME_H__
-#define __ROCRTST_SRC_CP_PROCESS_TIME_H__
-#include "perf_common/perf_base.h"
-#include "common/base_rocr.h"
-#include "common/common.h"
-#include "hsa/hsa.h"
-#include "hsa/hsa_ext_amd.h"
-#include <vector>
-
-//@Brief: This class is defined to measure the mean latency of launching
-//an empty kernel
-
-class CpProcessTime: public rocrtst::BaseRocR, public PerfBase {
- public:
-  //@Brief: Constructor
-  CpProcessTime();
-
-  //@Brief: Destructor
-  virtual ~CpProcessTime();
-
-  //@Brief: Set up the environment for the test
-  virtual void SetUp();
-
-  //@Brief: Run the test case
-  virtual void Run();
-
-  //@Brief: Display  results we got
-  virtual void DisplayResults() const;
-
-  //@Brief: Clean up and close the runtime
-  virtual void Close();
-
- private:
-  //@Brief: Get actual iteration number
-  virtual size_t RealIterationNum();
-
-  //@Brief: Store the size of queue
-  uint32_t queue_size_;
-
-  //@Brief: The mean time of CP Processing
-  double mean_;
-
-};
-
-#endif
-
diff --git a/rocrtst/suites/performance/cu_masking.cc b/rocrtst/suites/performance/cu_masking.cc
deleted file mode 100644
index 29f11f377a..0000000000
--- a/rocrtst/suites/performance/cu_masking.cc
+++ /dev/null
@@ -1,220 +0,0 @@
-/*
- * =============================================================================
- *   ROC Runtime Conformance Release License
- * =============================================================================
- * The University of Illinois/NCSA
- * Open Source License (NCSA)
- *
- * Copyright (c) 2017, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Developed by:
- *
- *                 AMD Research and AMD ROC Software Development
- *
- *                 Advanced Micro Devices, Inc.
- *
- *                 www.amd.com
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal with the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- *  - Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimers.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimers in
- *    the documentation and/or other materials provided with the distribution.
- *  - Neither the names of <Name of Development Group, Name of Institution>,
- *    nor the names of its contributors may be used to endorse or promote
- *    products derived from this Software without specific prior written
- *    permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS WITH THE SOFTWARE.
- *
- */
-
-#include "cu_masking.h"
-#include "common/base_rocr_utils.h"
-#include "gtest/gtest.h"
-
-CuMasking::CuMasking() :
-  BaseRocR() {
-  memset(&aql(), 0, sizeof(hsa_kernel_dispatch_packet_t));
-  mean_ = 0.0;
-  group_region_.handle = 0;
-  cu_ = NULL;
-}
-
-CuMasking::~CuMasking() {
-}
-
-void CuMasking::SetUp() {
-  hsa_status_t err;
-
-  hsa_agent_t* gpu_dev = gpu_device1();
-  hsa_agent_t* cpu_dev = cpu_device();
-
-  set_kernel_file_name("cu_masking.o");
-  set_kernel_name("&main");
-
-  if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
-    return;
-  }
-
-  // Create a queue
-  hsa_queue_t* q = nullptr;
-  rocrtst::CreateQueue(*gpu_dev, &q);
-  set_main_queue(q);
-
-  rocrtst::LoadKernelFromObjFile(this);
-
-  // Fill up the kernel packet except header
-  // aql().completion_signal=signal();
-  // TODO: Will delete manual_input later
-  uint32_t cu_count = 0;
-  err = hsa_agent_get_info(*gpu_dev,
-          (hsa_agent_info_t) HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-  std::cout << "CU# is: " << cu_count << std::endl;
-
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  rocrtst::InitializeAQLPacket(this, &aql());
-  aql().workgroup_size_x = 1024;
-
-  //manual_input * group_input;  // workgroup_max_size;
-  aql().grid_size_x = (long long) 1024 * 640 * 640;
-
-  // TODO:Manully set the max cu number to 8, the api return 10
-  std::cout << "Grid size is: " << aql().grid_size_x << std::endl;
-
-  err = hsa_amd_agent_iterate_memory_pools(*cpu_dev,
-                                        rocrtst::FindGlobalPool, &cpu_pool());
-  ASSERT_EQ(err, HSA_STATUS_INFO_BREAK);
-}
-
-size_t CuMasking::RealIterationNum() {
-  return num_iteration() * 1.2 + 1;
-}
-
-void CuMasking::Run() {
-  hsa_status_t err;
-
-  if (!rocrtst::CheckProfile(this)) {
-    return;
-  }
-
-  std::vector<double> timer;
-
-  typedef struct args_t {
-    uint32_t* iteration;
-    uint32_t* result;
-  } local_args;
-
-  uint32_t* iter = NULL;
-  uint32_t* result = NULL;
-  err = hsa_amd_memory_pool_allocate(cpu_pool(), sizeof(uint32_t), 0,
-                                     (void**) &iter);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  err = hsa_amd_memory_pool_allocate(cpu_pool(), sizeof(uint32_t), 0,
-                                     (void**) &result);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  *iter = 0xff;
-  *result = 0;
-
-  err = hsa_amd_agents_allow_access(1, gpu_device1(), NULL, iter);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-  err = hsa_amd_agents_allow_access(1, gpu_device1(), NULL, result);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  local_args* kernarg = NULL;
-  err = hsa_amd_memory_pool_allocate(cpu_pool(), kernarg_size(), 0,
-                                     (void**) &kernarg);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  err = hsa_amd_agents_allow_access(1, gpu_device1(), NULL, kernarg);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  kernarg->iteration = iter;
-  kernarg->result = result;
-
-  aql().kernarg_address = kernarg;
-
-  // Obtain the current queue write inex.
-  uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1);
-
-  // Write the aql packet at the calculate queue index address.
-  const uint32_t queue_mask = main_queue()->size - 1;
-
-  // Set CU mask
-  uint32_t cu_mask = 0;
-#if 0
-  std::cout << "Enter cu mask value:" << std::endl;
-  ASSERT_NE(scanf("%d", &cu_mask), EOF);
-#else
-  cu_mask = 0xAAAAAAAA;
-#endif
-
-  std::cout << "Value of bit array is: 0x" << std::hex << cu_mask << std::endl;
-  err = hsa_amd_queue_cu_set_mask(main_queue(), 32, &cu_mask);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  void *q_base_addr = main_queue()->base_address;
-  // Write the aql packet at the calculate queue index address.
-  aql().completion_signal = signal();
-  ((hsa_kernel_dispatch_packet_t*)(q_base_addr))[index & queue_mask] = aql();
-
-  // Get timing stamp an ring the doorbell to dispatch the kernel.
-  rocrtst::PerfTimer p_timer;
-  int id = p_timer.CreateTimer();
-  p_timer.StartTimer(id);
-  ((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask].header |=
-                     HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
-  hsa_signal_store_screlease(main_queue()->doorbell_signal, index);
-
-  // Wait on the dispatch signal until the kernel is finished.
-  while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
-                                   (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
-    ;
-
-  p_timer.StopTimer(id);
-
-  hsa_signal_store_screlease(signal(), 1);
-
-  double t1 = p_timer.ReadTimer(id) * 1e6;
-  std::cout << "Execution time after setting cu masking: " << t1 << std::endl;
-
-  return;
-}
-
-void CuMasking::DisplayResults() const {
-
-  if (!rocrtst::CheckProfile(this)) {
-    return;
-  }
-
-  std::cout << "===================================================="
-            << std::endl;
-
-  std::cout << "====================================================="
-            << std::endl;
-  return;
-}
-
-void CuMasking::Close() {
-  hsa_status_t err;
-  err = rocrtst::CommonCleanUp(this);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-}
diff --git a/rocrtst/suites/performance/cu_masking.h b/rocrtst/suites/performance/cu_masking.h
deleted file mode 100755
index e6826d9572..0000000000
--- a/rocrtst/suites/performance/cu_masking.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * =============================================================================
- *   ROC Runtime Conformance Release License
- * =============================================================================
- * The University of Illinois/NCSA
- * Open Source License (NCSA)
- *
- * Copyright (c) 2017, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Developed by:
- *
- *                 AMD Research and AMD ROC Software Development
- *
- *                 Advanced Micro Devices, Inc.
- *
- *                 www.amd.com
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal with the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- *  - Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimers.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimers in
- *    the documentation and/or other materials provided with the distribution.
- *  - Neither the names of <Name of Development Group, Name of Institution>,
- *    nor the names of its contributors may be used to endorse or promote
- *    products derived from this Software without specific prior written
- *    permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS WITH THE SOFTWARE.
- *
- */
-
-#ifndef __ROCRTST_SRC_CU_MASKING_TIME_H__
-#define __ROCRTST_SRC_CU_MASKING_TIME_H__
-
-#include "perf_common/perf_base.h"
-#include "common/base_rocr.h"
-#include "common/common.h"
-#include "common/hsatimer.h"
-#include "hsa/hsa.h"
-#include "hsa/hsa_ext_amd.h"
-#include "hsa/hsa_ext_finalize.h"
-#include <algorithm>
-#include <vector>
-
-//@Brief: This class is defined to measure the mean latency of launching
-//an empty kernel
-
-class CuMasking: public rocrtst::BaseRocR, public PerfBase {
- public:
-  //@Brief: Constructor
-  CuMasking();
-
-  //@Brief: Destructor
-  virtual ~CuMasking();
-
-  //@Brief: Set up the environment for the test
-  virtual void SetUp();
-
-  //@Brief: Run the test case
-  virtual void Run();
-
-  //@Brief: Display  results we got
-  virtual void DisplayResults() const;
-
-  //@Brief: Clean up and close the runtime
-  virtual void Close();
-
- private:
-  //@Brief: Get actual iteration number
-  virtual size_t RealIterationNum();
-
-  //@Brief: Store the size of queue
-  uint32_t queue_size_;
-
-  //@Brief: The mean time of CP Processing
-  double mean_;
-
-  //@Brief: The group memory region
-  hsa_region_t group_region_;
-
-  //@Brief: Pointer to cu_id array
-  uint32_t* cu_;
-
-  uint32_t manual_input;
-  uint32_t group_input;
-};
-
-#endif
-
diff --git a/rocrtst/suites/performance/device_load_bandwidth.cc b/rocrtst/suites/performance/device_load_bandwidth.cc
deleted file mode 100755
index 5cfcf829b8..0000000000
--- a/rocrtst/suites/performance/device_load_bandwidth.cc
+++ /dev/null
@@ -1,293 +0,0 @@
-/*
- * =============================================================================
- *   ROC Runtime Conformance Release License
- * =============================================================================
- * The University of Illinois/NCSA
- * Open Source License (NCSA)
- *
- * Copyright (c) 2017, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Developed by:
- *
- *                 AMD Research and AMD ROC Software Development
- *
- *                 Advanced Micro Devices, Inc.
- *
- *                 www.amd.com
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal with the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- *  - Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimers.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimers in
- *    the documentation and/or other materials provided with the distribution.
- *  - Neither the names of <Name of Development Group, Name of Institution>,
- *    nor the names of its contributors may be used to endorse or promote
- *    products derived from this Software without specific prior written
- *    permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS WITH THE SOFTWARE.
- *
- */
-
-#include "device_load_bandwidth.h"
-#include "common/base_rocr_utils.h"
-#include "common/common.h"
-#include "common/helper_funcs.h"
-#include "common/hsatimer.h"
-#include "common/os.h"
-#include "gtest/gtest.h"
-#include <algorithm>
-
-// TODO: The validation code has problems to debug
-#if 0
-static void initGlobalReadBuffer(uint32_t* in_data, uint32_t num_thrds,
-                                 uint32_t num_ops, uint32_t num_loops) {
-
-  // Populate input buffer with thread Id left shifted by 2.
-  uint32_t value = 0;
-  uint32_t val_idx;
-
-  for (uint32_t idx1 = 0; idx1 < num_loops; idx1++) {
-    val_idx = 0;
-    for (uint32_t idx2 = 0; idx2 < num_ops; idx2++) {
-      // Write the value to be read by each thread
-      for (uint32_t idx3 = 0; idx3 < num_thrds; idx3++) {
-        value = idx3 << 2;
-        in_data[val_idx++] = value;
-      }
-    }
-  }
-
-  return;
-}
-
-static bool verifyGlobalLoadKernel(uint32_t* data, uint32_t num_thrds,
-                                   uint32_t scale, const char* kernel_name) {
-
-  // Verify kernel operation i.e. validate the data in the output buffer.
-  uint32_t valid_value = 0;
-
-  for (uint32_t idx = 0; idx < num_thrds; idx++) {
-
-    valid_value = (idx << 2) * scale;
-
-
-    if (data[idx] != valid_value) {
-      std::cout << "Value expected = " << valid_value << std::endl;
-      std::cout << "Value of data = " << data[idx] << std::endl;
-
-      std::cout << kernel_name << ": VALIDATION FAILED ! Bad index: " << idx
-                << std::endl;
-      std::cout << kernel_name << ": VALUE @ Bad index: " << data[idx]
-                << std::endl;
-      std::cout << std::endl;
-      return false;
-    }
-  }
-
-#ifdef DEBUG
-  std::cout << kernel_name << ": Passed validation" << std::endl;
-  std::cout << std::endl;
-#endif
-
-  return true;
-}
-#endif
-
-// Constructor
-DeviceLoadBandwidth::DeviceLoadBandwidth() :
-  BaseRocR() {
-
-  set_group_size(0);
-  set_enable_interrupt(false);
-
-  num_group_ = 0;
-  num_cus_ = 0;
-
-  kernel_loop_count_ = 0;
-  mean_ = 0.0;
-  data_size_ = 0;
-
-  set_requires_profile (HSA_PROFILE_BASE);
-}
-
-// Destructor
-DeviceLoadBandwidth::~DeviceLoadBandwidth() {
-}
-
-// Set up the test environment
-void DeviceLoadBandwidth::SetUp() {
-  SetWorkItemNum();
-
-  set_kernel_file_name("sysMemRead.o");
-  set_kernel_name("&__SysMemLoad");
-
-  if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
-    return;
-  }
-
-  hsa_agent_t* gpu_dev = gpu_device1();
-
-  //Create a queue with max number size
-  hsa_queue_t* q = nullptr;
-  rocrtst::CreateQueue(*gpu_dev, &q);
-  ASSERT_NE(q, nullptr);
-  set_main_queue(q);
-
-  rocrtst::LoadKernelFromObjFile(this);
-
-  uint32_t total_work_items = num_cus_ * num_group_ * group_size();
-
-  //Fill up part of aql
-  rocrtst::InitializeAQLPacket(this, &aql());
-  aql().workgroup_size_x = group_size();
-  aql().grid_size_x = total_work_items;
-
-  return;
-}
-
-// Run the test
-void DeviceLoadBandwidth::Run() {
-  hsa_status_t err;
-
-  if (!rocrtst::CheckProfile(this)) {
-    return;
-  }
-
-  uint32_t total_workitems = num_cus_ * num_group_ * group_size();
-
-  uint32_t ops_thrd = 32;
-  uint64_t addr_step = (uint64_t) total_workitems * sizeof(uint64_t);
-  uint64_t total_ops = (uint64_t) total_workitems * ops_thrd;
-  uint64_t in_data_size = (uint64_t) total_ops * sizeof(uint64_t);
-
-  data_size_ = in_data_size;
-
-  err = rocrtst::SetPoolsTypical(this);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  err = rocrtst::AllocAndAllowAccess(this, in_data_size, device_pool(),
-                                                  (void**)&in_data_);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  //uint32_t out_data_size = total_workitems * sizeof(uint64_t);
-  uint32_t out_data_size = in_data_size;
-
-  err = rocrtst::AllocAndAllowAccess(this, out_data_size, device_pool(),
-                                                          (void**)&out_data_);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-#if 0
-  initGlobalReadBuffer(in_data_, total_workitems, ops_thrd, kernel_loop_count_);
-#endif
-
-  struct local_args_t {
-    void* arg0;
-    void* arg1;
-    uint64_t arg2;
-    void* arg3;
-  } local_args;
-
-  local_args.arg0 = in_data_;
-  local_args.arg1 = in_data_ + total_ops;
-  local_args.arg2 = addr_step;
-  local_args.arg3 = out_data_;
-
-  // Copy the kernel args structure into a registered memory block
-  err = rocrtst::AllocAndSetKernArgs(this, &local_args, sizeof(local_args));
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  std::vector<double> time;
-
-  rocrtst::WriteAQLToQueue(this);
-  // Write the aql packet at the calculated queue index address.
-  const uint32_t queue_mask = main_queue()->size - 1;
-  void * q_base = main_queue()->base_address;
-
-  for (uint32_t i = 0; i < num_iteration(); i++) {
-    uint64_t que_idx = hsa_queue_load_write_index_relaxed(main_queue());
-
-    rocrtst::PerfTimer p_timer;
-    int id = p_timer.CreateTimer();
-    p_timer.StartTimer(id);
-
-    uint32_t aql_header = HSA_PACKET_TYPE_KERNEL_DISPATCH;
-    rocrtst::AtomicSetPacketHeader(aql_header, aql().setup,
-             &((hsa_kernel_dispatch_packet_t*)(q_base))[que_idx & queue_mask]);
-    hsa_signal_store_screlease(main_queue()->doorbell_signal, que_idx);
-
-    // Wait on the dispatch signal until the kernel is finished.
-    while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
-                                     (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
-      ;
-
-    p_timer.StopTimer(id);
-
-#ifdef DEBUG
-    std::cout << "." << std::flush;
-#endif
-
-#if 0
-    // Verify the results
-   uint32_t scale = kernel_loop_count_ * ops_thrd;
-   verifyGlobalLoadKernel(out_data_, total_workitems, scale,
-                                                     kernel_name().c_str());
-#endif
-   time.push_back(p_timer.ReadTimer(id));
-
-    hsa_signal_store_screlease(signal(), 1);
-  }
-
-#ifdef DEBUG
-  std::cout << std::endl;
-#endif
-
-  time.erase(time.begin());
-  std::sort(time.begin(), time.end());
-  time.erase(time.begin() + num_iteration(), time.end());
-  mean_ = rocrtst::CalcMean(time);
-
-  return;
-}
-
-void DeviceLoadBandwidth::Close() {
-  hsa_status_t err;
-
-  err = hsa_amd_memory_pool_free(in_data_);
-  EXPECT_EQ(err, HSA_STATUS_SUCCESS);
-
-  err = hsa_amd_memory_pool_free(out_data_);
-  EXPECT_EQ(err, HSA_STATUS_SUCCESS);
-
-  err = rocrtst::CommonCleanUp(this);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  return;
-}
-
-void DeviceLoadBandwidth::DisplayResults() const {
-  if (!rocrtst::CheckProfile(this)) {
-    return;
-  }
-
-  std::cout << "=======================================" << std::endl;
-  std::cout << "Device Load Bandwidth:     ";
-  std::cout << data_size_ / mean_ / 1024 / 1024 / 1024 << "(GB/S)" << std::endl;
-  std::cout << "=======================================" << std::endl;
-
-  return;
-}
diff --git a/rocrtst/suites/performance/device_store_bandwidth.cc b/rocrtst/suites/performance/device_store_bandwidth.cc
deleted file mode 100755
index d2d51075d2..0000000000
--- a/rocrtst/suites/performance/device_store_bandwidth.cc
+++ /dev/null
@@ -1,219 +0,0 @@
-/*
- * =============================================================================
- *   ROC Runtime Conformance Release License
- * =============================================================================
- * The University of Illinois/NCSA
- * Open Source License (NCSA)
- *
- * Copyright (c) 2017, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Developed by:
- *
- *                 AMD Research and AMD ROC Software Development
- *
- *                 Advanced Micro Devices, Inc.
- *
- *                 www.amd.com
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal with the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- *  - Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimers.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimers in
- *    the documentation and/or other materials provided with the distribution.
- *  - Neither the names of <Name of Development Group, Name of Institution>,
- *    nor the names of its contributors may be used to endorse or promote
- *    products derived from this Software without specific prior written
- *    permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS WITH THE SOFTWARE.
- *
- */
-
-#include "device_store_bandwidth.h"
-#include "common/base_rocr_utils.h"
-#include "common/common.h"
-#include "common/helper_funcs.h"
-#include "common/hsatimer.h"
-#include "gtest/gtest.h"
-
-// Constructor
-DeviceStoreBandwidth::DeviceStoreBandwidth() :
-  BaseRocR() {
-
-  set_group_size(0);
-  num_group_ = 0;
-  num_cus_ = 0;
-
-  kernel_loop_count_ = 0;
-  mean_ = 0.0;
-  data_size_ = 0;
-  set_requires_profile (HSA_PROFILE_BASE);
-  in_data_ = nullptr;
-  out_data_ = nullptr;
-}
-
-// Destructor
-DeviceStoreBandwidth::~DeviceStoreBandwidth() {
-}
-
-// Set up the test environment
-void DeviceStoreBandwidth::SetUp() {
-  SetWorkItemNum();
-
-  set_kernel_file_name("sysMemWrite.o");
-  set_kernel_name("&__SysMemStore");
-
-  if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
-    return;
-  }
-
-  hsa_agent_t* gpu_dev = gpu_device1();
-
-  //Create a queue with max number size
-  hsa_queue_t* q = nullptr;
-  rocrtst::CreateQueue(*gpu_dev, &q);
-  ASSERT_NE(q, nullptr);
-  set_main_queue(q);
-
-  rocrtst::LoadKernelFromObjFile(this);
-
-  uint32_t total_work_items = num_cus_ * num_group_ * group_size();
-
-  //Fill up part of aql
-  rocrtst::InitializeAQLPacket(this, &aql());
-  aql().workgroup_size_x = group_size();
-  aql().grid_size_x = total_work_items;
-
-  return;
-}
-
-// Run the test
-void DeviceStoreBandwidth::Run() {
-  hsa_status_t err;
-
-  if (!rocrtst::CheckProfile(this)) {
-    return;
-  }
-
-  uint32_t total_workitems = num_cus_ * num_group_ * group_size();
-
-  uint32_t ops_thrd = 16;
-  uint64_t addr_step = (uint64_t) total_workitems * sizeof(uint32_t);
-  uint64_t total_ops = (uint64_t) total_workitems * kernel_loop_count_
-                       * ops_thrd;
-  uint64_t in_data_size = (uint64_t) total_ops * sizeof(uint32_t);
-
-  data_size_ = in_data_size;
-
-  err = rocrtst::SetPoolsTypical(this);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  err = rocrtst::AllocAndAllowAccess(this, in_data_size, device_pool(),
-                                                  (void**)&in_data_);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  uint32_t out_data_size = total_workitems * sizeof(uint32_t);
-
-  err = rocrtst::AllocAndAllowAccess(this, out_data_size, device_pool(),
-                                                          (void**)&out_data_);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  struct local_args_t {
-    void* arg0;
-    void* arg1;
-    uint64_t arg2;
-    void* arg3;
-  } local_args;
-
-  local_args.arg0 = in_data_;
-  local_args.arg1 = in_data_ + total_ops;
-  local_args.arg2 = addr_step;
-  local_args.arg3 = out_data_;
-
-  // Copy the kernel args structure into a registered memory block
-  err = rocrtst::AllocAndSetKernArgs(this, &local_args, sizeof(local_args));
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  std::vector<double> time;
-
-  rocrtst::WriteAQLToQueue(this);
-
-  for (uint32_t i = 0; i < num_iteration(); i++) {
-    uint64_t que_idx = hsa_queue_load_write_index_relaxed(main_queue());
-
-    // Write the aql packet at the calculated queue index address.
-    const uint32_t queue_mask = main_queue()->size - 1;
-
-    rocrtst::PerfTimer p_timer;
-    int id = p_timer.CreateTimer();
-    p_timer.StartTimer(id);
-
-    void * q_base = main_queue()->base_address;
-    uint32_t aql_header = HSA_PACKET_TYPE_KERNEL_DISPATCH;
-    rocrtst::AtomicSetPacketHeader(aql_header, aql().setup,
-             &((hsa_kernel_dispatch_packet_t*)(q_base))[que_idx & queue_mask]);
-    hsa_signal_store_screlease(main_queue()->doorbell_signal, que_idx);
-
-    // Wait on the dispatch signal until the kernel is finished.
-    while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
-                                     (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
-      ;
-
-    p_timer.StopTimer(id);
-
-#ifdef DEBUG
-    std::cout << "." << std::flush;
-#endif
-
-    time.push_back(p_timer.ReadTimer(id));
-
-    hsa_signal_store_screlease(signal(), 1);
-  }
-
-#ifdef DEBUG
-  std::cout << std::endl;
-#endif
-
-  time.erase(time.begin());
-  mean_ = rocrtst::CalcMean(time);
-
-  return;
-}
-
-void DeviceStoreBandwidth::Close() {
-  hsa_status_t err;
-
-  err = hsa_amd_memory_pool_free(in_data_);
-  EXPECT_EQ(err, HSA_STATUS_SUCCESS);
-  err = hsa_amd_memory_pool_free(out_data_);
-  EXPECT_EQ(err, HSA_STATUS_SUCCESS);
-
-  err = rocrtst::CommonCleanUp(this);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-  return;
-}
-
-void DeviceStoreBandwidth::DisplayResults() const {
-  if (!rocrtst::CheckProfile(this)) {
-    return;
-  }
-  std::cout << "=======================================" << std::endl;
-  std::cout << "Device Store Bandwidth:     ";
-  std::cout << data_size_ / mean_ / 1024 / 1024 / 1024 << "(GB/S)" << std::endl;
-  std::cout << "=======================================" << std::endl;
-  return;
-}
diff --git a/rocrtst/suites/performance/device_store_bandwidth.h b/rocrtst/suites/performance/device_store_bandwidth.h
deleted file mode 100755
index 4aa032751e..0000000000
--- a/rocrtst/suites/performance/device_store_bandwidth.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * =============================================================================
- *   ROC Runtime Conformance Release License
- * =============================================================================
- * The University of Illinois/NCSA
- * Open Source License (NCSA)
- *
- * Copyright (c) 2017, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Developed by:
- *
- *                 AMD Research and AMD ROC Software Development
- *
- *                 Advanced Micro Devices, Inc.
- *
- *                 www.amd.com
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal with the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- *  - Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimers.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimers in
- *    the documentation and/or other materials provided with the distribution.
- *  - Neither the names of <Name of Development Group, Name of Institution>,
- *    nor the names of its contributors may be used to endorse or promote
- *    products derived from this Software without specific prior written
- *    permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS WITH THE SOFTWARE.
- *
- */
-
-#ifndef __ROCRTST_SRC_INC_DEVICE_STORE_BANDWIDTH_H__
-#define __ROCRTST_SRC_INC_DEVICE_STORE_BANDWIDTH_H__
-
-#include "perf_common/perf_base.h"
-#include "common/base_rocr.h"
-#include "hsa/hsa.h"
-#include <stdio.h>
-
-class DeviceStoreBandwidth: public rocrtst::BaseRocR, public PerfBase {
- public:
-  //@Brief: Constructor
-  DeviceStoreBandwidth();
-
-  //@Brief: Destructor
-  ~DeviceStoreBandwidth();
-
-  //@Brief: Set up the testing environment
-  virtual void SetUp();
-
-  //@Brief: Run the test case
-  virtual void Run();
-
-  //@Brief: Close and clean up  the test enrionment
-  virtual void Close();
-
-  //@Brief: Display  load bandwidth
-  virtual void DisplayResults() const;
-
-  //@Brief: Set work-item configuration
-  void SetWorkItemNum() {
-#ifdef INTERACTIVE
-    uint32_t tmp;
-    printf("Please input the number of CUs you want to try:\n");
-    scanf("%d", &num_cus_);
-
-    printf("Please input the number of groups you want to try:\n");
-    scanf("%d", &num_group_);
-
-    printf("Please input the size of each group:\n");
-    scanf("%d", &tmp);
-    set_group_size(tmp);
-
-    printf("Please input the number of kernel loop you want to try:\n");
-    scanf("%d", &kernel_loop_count_);
-#else
-    num_cus_ = 32;
-    num_group_ = 128;
-    set_group_size(64);
-    kernel_loop_count_ = 16;
-#endif
-    return;
-  }
-
- private:
-  //@Brief: number of group
-  uint32_t num_group_;
-
-  //@Brief: number of CUs
-  uint32_t num_cus_;
-
-  //@Brief: number of kernel loop
-  uint32_t kernel_loop_count_;
-
-  //@Brief: Mean execution time
-  double mean_;
-
-  //@Brief: data size for test
-  uint64_t data_size_;
-  uint32_t* in_data_;
-  uint32_t* out_data_;
-};
-
-#endif
-
diff --git a/rocrtst/suites/performance/dispatch_time.cc b/rocrtst/suites/performance/dispatch_time.cc
index 3b4a9262b4..400c314906 100755
--- a/rocrtst/suites/performance/dispatch_time.cc
+++ b/rocrtst/suites/performance/dispatch_time.cc
@@ -43,7 +43,10 @@
  *
  */
 
-#include "dispatch_time.h"
+#include <algorithm>
+#include <string>
+
+#include "suites/performance/dispatch_time.h"
 #include "common/base_rocr_utils.h"
 #include "common/common.h"
 #include "common/os.h"
@@ -52,40 +55,68 @@
 #include "gtest/gtest.h"
 #include "hsa/hsa.h"
 #include "hsa/hsa_ext_finalize.h"
-#include <algorithm>
 
-DispatchTime::DispatchTime() :
-  BaseRocR() {
-  use_default_ = false;
-  launch_single_ = false;
+DispatchTime::
+DispatchTime(bool defaultInterrupt, bool launchSingleKernel) : TestBase(),
+              use_default_interupt_(defaultInterrupt),
+                                          launch_single_(launchSingleKernel) {
   queue_size_ = 0;
   num_batch_ = 100000;
   memset(&aql(), 0, sizeof(hsa_kernel_dispatch_packet_t));
-  single_default_mean_ = 0.0;
-  single_interrupt_mean_ = 0.0;
-  multi_default_mean_ = 0.0;
-  multi_interrupt_mean_ = 0.0;
+  dispatch_time_mean_ = 0.0;
+  set_num_iteration(100);
+
+  set_kernel_file_name("dispatch_time_kernels.hsaco");
+  set_kernel_name("empty_kernel");
+
+  std::string name;
+  std::string desc;
+
+  name = "Average Dispatch Time";
+  desc = "This test measures the time to handle AQL packets that "
+      "do no work. Time is measured from when the packet is made available to"
+      " the Command Processor to when the target agent notifies the host that "
+      "the packet has been executed.  ";
+
+  if (defaultInterrupt) {
+    name += ", Default Interrupts";
+    desc += "Interrupts are controlled by HSA_ENABLE_INTERRUPT environment "
+                                                                "variable. ";
+  } else {
+    name += ", Interrupts Enabled";
+    desc += "Interrupts are enabled. ";
+  }
+
+  if (launchSingleKernel) {
+    name += ", Single Kernel";
+    desc += " One kernel at a time is and executed.";
+  } else {
+    name += ", Multiple Kernels";
+    desc += " Enough kernels to fill the queue are dispatched at one time";
+  }
+
+  set_title(name);
+  set_description(desc);
 }
 
 DispatchTime::~DispatchTime() {
-
 }
 
 void DispatchTime::SetUp() {
-  // If it indicates to use default signal, set env var properly
-  if (use_default_) {
+  hsa_status_t err;
+
+  // This needs to happen before TestBase::SetUp()
+  if (use_default_interupt_) {
     set_enable_interrupt(false);
-  }
-  else {
+  } else {
     set_enable_interrupt(true);
   }
 
-  set_kernel_file_name("empty_kernel.o");
-  set_kernel_name("&__Empty_kernel");
+  TestBase::SetUp();
+  // If it indicates to use default signal, set env var properly
 
-  if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
-    return;
-  }
+  err = SetDefaultAgents(this);
+  ASSERT_EQ(HSA_STATUS_SUCCESS, err);
 
   hsa_agent_t* gpu_dev = gpu_device1();
 
@@ -105,24 +136,26 @@ void DispatchTime::SetUp() {
     num_batch_ = num_batch_ > size ? size : num_batch_;
   }
 
-  rocrtst::LoadKernelFromObjFile(this);
+  err = rocrtst::LoadKernelFromObjFile(this);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
 
   // Fill up the kernel packet except header
-  rocrtst::InitializeAQLPacket(this, &aql());
+  err = rocrtst::InitializeAQLPacket(this, &aql());
+  ASSERT_EQ(HSA_STATUS_SUCCESS, err);
+
   aql().workgroup_size_x = 1;
   aql().grid_size_x = 1;
 }
 
 void DispatchTime::Run() {
-
   if (!rocrtst::CheckProfile(this)) {
     return;
   }
 
+  TestBase::Run();
   if (launch_single_) {
     RunSingle();
-  }
-  else {
+  } else {
     RunMulti();
   }
 }
@@ -137,59 +170,59 @@ void DispatchTime::RunSingle() {
   int it = RealIterationNum();
   const uint32_t queue_mask = main_queue()->size - 1;
 
-  //queue should be empty
+  // queue should be empty
   ASSERT_EQ(hsa_queue_load_read_index_scacquire(main_queue()),
             hsa_queue_load_write_index_scacquire(main_queue()));
 
   void *q_base_addr = main_queue()->base_address;
   for (int i = 0; i < it; i++) {
-    //Obtain the current queue write index.
+    // Obtain the current queue write index.
     uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1);
 
     ASSERT_LT(index, main_queue()->size + index);
 
-    //Write the aql packet at the calculated queue index address.
+    // Write the aql packet at the calculated queue index address.
 
-    ((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask] = aql();
-
-    //Get timing stamp and ring the doorbell to dispatch the kernel.
+    reinterpret_cast<hsa_kernel_dispatch_packet_t *>(
+                                     q_base_addr)[index & queue_mask] = aql();
+    // Get timing stamp and ring the doorbell to dispatch the kernel.
     rocrtst::PerfTimer p_timer;
     int id = p_timer.CreateTimer();
     p_timer.StartTimer(id);
-    ((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask].header |=
-                      HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
+    reinterpret_cast<hsa_kernel_dispatch_packet_t *>(
+                        q_base_addr)[index & queue_mask].header |=
+                    HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
+
     hsa_signal_store_screlease(main_queue()->doorbell_signal, index);
 
-    //Wait on the dispatch signal until the kernel is finished.
-    while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
-                                     (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
-      ;
+    // Wait on the dispatch signal until the kernel is finished.
+    while (hsa_signal_wait_scacquire(aql().completion_signal,
+         HSA_SIGNAL_CONDITION_LT, 1, (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE)) {
+    }
+
 
     p_timer.StopTimer(id);
 
     timer.push_back(p_timer.ReadTimer(id));
-    hsa_signal_store_screlease(signal(), 1);
+    hsa_signal_store_screlease(aql().completion_signal, 1);
 
-#ifdef DEBUG
-    std::cout << ".";
-    fflush(stdout);
-#endif
+    if (verbosity() >= VERBOSE_PROGRESS) {
+      std::cout << ".";
+      fflush(stdout);
+    }
   }
 
-  std::cout << std::endl;
+  if (verbosity() >= VERBOSE_PROGRESS) {
+    std::cout << std::endl;
+  }
 
-  //Abandon the first result and after sort, delete the last 2% value
+  // Abandon the first result and, after sorting, delete the last 2% of values
   timer.erase(timer.begin());
   std::sort(timer.begin(), timer.end());
 
   timer.erase(timer.begin() + num_iteration(), timer.end());
 
-  if (use_default_) {
-    single_default_mean_ = rocrtst::CalcMean(timer);
-  }
-  else {
-    single_interrupt_mean_ = rocrtst::CalcMean(timer);
-  }
+  dispatch_time_mean_ = rocrtst::CalcMean(timer);
 
   return;
 }
@@ -199,72 +232,69 @@ void DispatchTime::RunMulti() {
   int it = RealIterationNum();
   const uint32_t queue_mask = main_queue()->size - 1;
 
-  //queue should be empty
+  // queue should be empty
   ASSERT_EQ(hsa_queue_load_read_index_scacquire(main_queue()),
             hsa_queue_load_write_index_scacquire(main_queue()));
 
-  for (int i = 0; i < it; i++) {
-    uint64_t* index = (uint64_t*) malloc(sizeof(uint64_t) * num_batch_);
+  rocrtst::PerfTimer p_timer;
 
-    hsa_signal_store_screlease(signal(), num_batch_);
+  for (int i = 0; i < it; i++) {
+    uint64_t* index =
+           reinterpret_cast<uint64_t*>(malloc(sizeof(uint64_t) * num_batch_));
+
+    hsa_signal_store_screlease(aql().completion_signal, num_batch_);
 
     for (uint32_t j = 0; j < num_batch_; j++) {
-      //index[j] = hsa_queue_add_write_index_scacq_screl(main_queue(), 1);
+      // index[j] = hsa_queue_add_write_index_scacq_screl(main_queue(), 1);
       index[j] = hsa_queue_add_write_index_relaxed(main_queue(), 1);
 
-      //Write the aql packet at the calculated queue index address.
-      ((hsa_kernel_dispatch_packet_t*) (main_queue()->base_address))[index[j]
-          & queue_mask] = aql();
+      // Write the aql packet at the calculated queue index address.
+      (reinterpret_cast<hsa_kernel_dispatch_packet_t*>((
+                 main_queue()->base_address)))[index[j] & queue_mask] = aql();
 
       if (j == num_batch_ - 1) {
-        ((hsa_kernel_dispatch_packet_t*) (main_queue()->base_address))[index[j]
-            & queue_mask].header |= 1 << HSA_PACKET_HEADER_BARRIER;
-
-        //TODO: verify if the below is needed. I don't think it is. It should
-        // already be initialized to signal().
-        ((hsa_kernel_dispatch_packet_t*) (main_queue()->base_address))[index[j]
-            & queue_mask].completion_signal = signal();
+        (reinterpret_cast<hsa_kernel_dispatch_packet_t*>(
+            main_queue()->base_address))[index[j] & queue_mask].header |=
+                                               1 << HSA_PACKET_HEADER_BARRIER;
       }
     }
 
     // Set packet header reversly; set all headers except the very first
     // one, for now.
     for (uint32_t j = num_batch_ - 1; j > 0; j--) {
-
-      ((hsa_kernel_dispatch_packet_t*) (main_queue()->base_address))[index[j]
-          & queue_mask].header |= HSA_PACKET_TYPE_KERNEL_DISPATCH
-                                  << HSA_PACKET_HEADER_TYPE;
+      reinterpret_cast<hsa_kernel_dispatch_packet_t*>(
+         (main_queue()->base_address))[index[j] & queue_mask].header |=
+                    HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
     }
 
-    //Get timing stamp and ring the doorbell to dispatch the kernel.
-    rocrtst::PerfTimer p_timer;
+    // Get timing stamp and ring the doorbell to dispatch the kernel.
     int id = p_timer.CreateTimer();
     p_timer.StartTimer(id);
-    //Set the very first header...
-    ((hsa_kernel_dispatch_packet_t*) (main_queue()->base_address))[index[0]
-        & queue_mask].header |= HSA_PACKET_TYPE_KERNEL_DISPATCH
-                                << HSA_PACKET_HEADER_TYPE;
+    // Set the very first header...
+    (reinterpret_cast<hsa_kernel_dispatch_packet_t*>(
+        main_queue()->base_address))[index[0] & queue_mask].header |=
+                    HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
 
     for (uint32_t j = 0; j < num_batch_; j++) {
       hsa_signal_store_screlease(main_queue()->doorbell_signal, index[j]);
     }
 
-    //Wait on the dispatch signal until the kernel is finished.
-    while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_EQ, 0,
-                                     UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0)
-      ;
+    // Wait on the dispatch signal until the kernel is finished.
+    while (hsa_signal_wait_scacquire(aql().completion_signal,
+        HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0) {
+    }
 
     p_timer.StopTimer(id);
 
     timer.push_back(p_timer.ReadTimer(id));
-    hsa_signal_store_screlease(signal(), 1);
+    hsa_signal_store_screlease(aql().completion_signal, 1);
 
     free(index);
 
-#ifdef DEBUG
-    std::cout << ".";
-    fflush(stdout);
-#endif
+    if (verbosity() >= VERBOSE_PROGRESS) {
+      std::cout << ".";
+      fflush(stdout);
+    }
   }
 
   std::cout << std::endl;
@@ -275,57 +305,34 @@ void DispatchTime::RunMulti() {
 
   timer.erase(timer.begin() + num_iteration(), timer.end());
 
-  if (use_default_) {
-    multi_default_mean_ = rocrtst::CalcMean(timer);
-  }
-  else {
-    multi_interrupt_mean_ = rocrtst::CalcMean(timer);
-  }
+  dispatch_time_mean_ = rocrtst::CalcMean(timer);
 
   return;
 }
 
-void DispatchTime::DisplayResults() const {
+void DispatchTime::DisplayTestInfo(void) {
+  TestBase::DisplayTestInfo();
+}
 
+void DispatchTime::DisplayResults(void) const {
   if (!rocrtst::CheckProfile(this)) {
     return;
   }
 
-  std::cout << "===================================================="
-            << std::endl;
+  TestBase::DisplayResults();
 
-  if (use_default_) {
-    if (launch_single_) {
-      std::cout << "Single_Default:       " << single_default_mean_ * 1e6
-                << std::endl;
-    }
-    else {
-      std::cout << "Multi_Default:         "
-                << multi_default_mean_ * 1e6 / num_batch_ << std::endl;
-    }
-  }
-  else {
-    if (launch_single_) {
-      std::cout << "Single_Interrupt:       " << single_interrupt_mean_ * 1e6
-                << std::endl;
-    }
-    else {
-      std::cout << "Multi_Interrupt:         "
-                << multi_interrupt_mean_ * 1e6 / num_batch_ << std::endl;
-    }
+  std::cout << "Average Time to Completion: ";
+  if (launch_single_) {
+    std::cout << dispatch_time_mean_ * 1e6;
+  } else {
+    std::cout << dispatch_time_mean_ * 1e6 / num_batch_;
   }
 
-  std::cout << "====================================================="
-            << std::endl;
-
+  std::cout << " uS" << std::endl;
   return;
 }
 
 void DispatchTime::Close() {
-  hsa_status_t err;
-
-  err = rocrtst::CommonCleanUp(this);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
+  TestBase::Close();
   return;
 }
diff --git a/rocrtst/suites/performance/dispatch_time.h b/rocrtst/suites/performance/dispatch_time.h
index 559cd5733f..7df879ed9a 100755
--- a/rocrtst/suites/performance/dispatch_time.h
+++ b/rocrtst/suites/performance/dispatch_time.h
@@ -43,83 +43,68 @@
  *
  */
 
-#ifndef __ROCRTST_SRC_DISPATCH_TIME_H__
-#define __ROCRTST_SRC_DISPATCH_TIME_H__
-#include "perf_common/perf_base.h"
+#ifndef ROCRTST_SUITES_PERFORMANCE_DISPATCH_TIME_H_
+#define ROCRTST_SUITES_PERFORMANCE_DISPATCH_TIME_H_
+#include <vector>
+
+#include "suites/test_common/test_base.h"
 #include "common/base_rocr.h"
 #include "common/common.h"
 #include "hsa/hsa.h"
-#include <vector>
 
-//@Brief: This class is defined to measure the mean latency of launching
-//an empty kernel
+// @Brief: This class is defined to measure the mean latency of launching
+// an empty kernel
 
-class DispatchTime: public rocrtst::BaseRocR, public PerfBase {
+class DispatchTime : public TestBase {
  public:
-  //@Brief: Constructor
-  DispatchTime();
+  // @Brief: Constructor
+  DispatchTime(bool defaultInterrupt, bool launchSingleKernel);
 
-  //@Brief: Destructor
-  virtual ~DispatchTime();
+  // @Brief: Destructor
+  virtual ~DispatchTime(void);
 
-  //@Brief: Set up the environment for the test
-  virtual void SetUp();
+  // @Brief: Set up the environment for the test
+  virtual void SetUp(void);
 
-  //@Brief: Run the test case
-  virtual void Run();
+  // @Brief: Run the test case
+  virtual void Run(void);
 
-  //@Brief: Display  results we got
-  virtual void DisplayResults() const;
+  // @Brief: Display the results we got
+  virtual void DisplayResults(void) const;
 
-  //@Brief: Clean up and close the runtime
-  virtual void Close();
+  // @Brief: Display information about what this test does
+  virtual void DisplayTestInfo(void);
 
-  //@Brief: Choose if use default signal or not
-  void UseDefaultSignal(bool use_default = true) {
-    use_default_ = use_default;
-  }
-
-  //@Brief; Choose to launch a single kernels or not
-  void LaunchSingleKernel(bool launch_single = true) {
-    launch_single_ = launch_single;
-  }
+  // @Brief: Clean up and close the runtime
+  virtual void Close(void);
 
  private:
-  //@Brief: Get actual iteration number
-  virtual size_t RealIterationNum();
+  // @Brief: Get actual iteration number
+  virtual size_t RealIterationNum(void);
 
-  //@Brief: Launch single packet each time
-  virtual void RunSingle();
+  // @Brief: Launch single packet each time
+  virtual void RunSingle(void);
 
-  //@Brief: Launch multiple packets each time
-  virtual void RunMulti();
+  // @Brief: Launch multiple packets each time
+  virtual void RunMulti(void);
 
-  //@Brief: Indicate if use default signal or not
-  bool use_default_;
+  // @Brief: Indicate whether to use the default signal or not
+  bool use_default_interupt_;
 
-  //@Brief: Indicate if launch single kernel or not
+  // @Brief: Indicate whether to launch a single kernel or not
   bool launch_single_;
 
-  //@Brief: Store the size of queue
+  // @Brief: Store the size of queue
   uint32_t queue_size_;
 
-  //@Brief: Number of packets in a batch
+  // @Brief: Number of packets in a batch
   uint32_t num_batch_;
 
-  //@Brief: Time of single default signal dispatch time
-  double single_default_mean_;
-
-  //@Brief: Time of single interrupt signal dispatch time
-  double single_interrupt_mean_;
-
-  //@Brief: Time of multi default signal dispatch time
-  double multi_default_mean_;
-
-  //@Brief: Time of multi interrupt signal dispatch time
-  double multi_interrupt_mean_;
+  // @Brief: Mean dispatch time (computed with rocrtst::CalcMean)
+  double dispatch_time_mean_;
 
   char* orig_iterrupt_env_;
 };
 
-#endif
+#endif  // ROCRTST_SUITES_PERFORMANCE_DISPATCH_TIME_H_
 
diff --git a/rocrtst/suites/performance/flush_latency.cc b/rocrtst/suites/performance/flush_latency.cc
deleted file mode 100755
index 298aefb780..0000000000
--- a/rocrtst/suites/performance/flush_latency.cc
+++ /dev/null
@@ -1,351 +0,0 @@
-/*
- * =============================================================================
- *   ROC Runtime Conformance Release License
- * =============================================================================
- * The University of Illinois/NCSA
- * Open Source License (NCSA)
- *
- * Copyright (c) 2017, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Developed by:
- *
- *                 AMD Research and AMD ROC Software Development
- *
- *                 Advanced Micro Devices, Inc.
- *
- *                 www.amd.com
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal with the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- *  - Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimers.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimers in
- *    the documentation and/or other materials provided with the distribution.
- *  - Neither the names of <Name of Development Group, Name of Institution>,
- *    nor the names of its contributors may be used to endorse or promote
- *    products derived from this Software without specific prior written
- *    permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS WITH THE SOFTWARE.
- *
- */
-
-#include "flush_latency.h"
-#include "common/base_rocr_utils.h"
-#include "common/common.h"
-#include "common/helper_funcs.h"
-#include "common/hsatimer.h"
-#include "common/os.h"
-#include "gtest/gtest.h"
-#include <algorithm>
-
-static const int kWorkItem = 1024 * 1204;
-// Constructor
-FlushLatency::FlushLatency() :
-  BaseRocR() {
-  set_group_size(0);
-  num_group_ = 0;
-  num_cus_ = 0;
-
-  kernel_loop_count_ = 0;
-  mean_ = 0.0;
-  data_size_ = 0;
-
-  set_requires_profile (HSA_PROFILE_BASE);
-}
-
-// Destructor
-FlushLatency::~FlushLatency() {
-}
-
-// Set up the test environment
-void FlushLatency::SetUp() {
-  hsa_status_t err;
-
-  SetWorkItemNum();
-
-  set_kernel_file_name("flush_latency.o");
-  set_kernel_name("&main");
-
-  if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
-    return;
-  }
-
-  hsa_agent_t* gpu_dev = gpu_device1();
-
-  //Create a queue with max number size
-  hsa_queue_t* q;
-  rocrtst::CreateQueue(*gpu_dev, &q);
-  set_main_queue(q);
-
-  //Enable profiling
-  err = hsa_amd_profiling_set_profiler_enabled(main_queue(), 1);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  rocrtst::LoadKernelFromObjFile(this);
-
-  uint32_t total_work_items = kWorkItem * 0.3;
-
-  //Fill up part of aql
-  rocrtst::InitializeAQLPacket(this, &aql());
-  aql().workgroup_size_x = group_size();
-  aql().grid_size_x = total_work_items;
-
-  return;
-}
-
-// Run the test
-void FlushLatency::Run() {
-  hsa_status_t err;
-  hsa_amd_memory_pool_t cpu_pool;
-
-  if (!rocrtst::CheckProfile(this)) {
-    return;
-  }
-
-  hsa_agent_t* gpu_dev = gpu_device1();
-  hsa_agent_t* cpu_dev = cpu_device();
-
-  err = hsa_amd_agent_iterate_memory_pools(*gpu_dev, rocrtst::FindStandardPool,
-                                                                &device_pool());
-  ASSERT_EQ(err, HSA_STATUS_INFO_BREAK);
-
-  ASSERT_NE(device_pool().handle, 0);
-
-  cpu_pool.handle = 0;
-  err = hsa_amd_agent_iterate_memory_pools(*cpu_dev, rocrtst::FindGlobalPool,
-        &cpu_pool);
-  ASSERT_EQ(err, HSA_STATUS_INFO_BREAK);
-
-  ASSERT_NE(cpu_pool.handle, 0);
-
-#if DEBUG
-  std::cout << "Device Pool Properties:" << std::endl;
-  err = rocrtst::DumpMemoryPoolInfo(device_pool());
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-  std::cout << "Global Pool Properties:" << std::endl;
-  err = rocrtst::DumpMemoryPoolInfo(cpu_pool);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-#endif
-  uint32_t out_data_size = 1024 * 1024 * sizeof(uint32_t);
-
-  std::vector<double> time_none;
-  std::vector<double> time_release;
-
-  std::vector < uint64_t > time_none_stamp;
-  std::vector < uint64_t > time_release_stamp;
-
-  //Query system timestamp frequency
-  uint64_t freq;
-  err = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freq);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  void* out = NULL;
-  uint32_t* out_data;
-  const uint32_t queue_mask = main_queue()->size - 1;
-  typedef struct local_args_t {
-    void* arg0;
-  } args;
-
-  // Warm up
-  uint16_t header = 0;
-  header |= HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
-  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
-  header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
-  aql().header = header;
-
-  err = hsa_amd_memory_pool_allocate(device_pool(), out_data_size, 0,
-                                     (void**) &out_data);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  args* kern_ptr = NULL;
-  err = hsa_amd_memory_pool_allocate(cpu_pool, sizeof(args), 0,
-                                     (void**) &kern_ptr);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  kern_ptr->arg0 = out_data;
-
-  aql().kernarg_address = kern_ptr;
-
-  // Obtain the current queue write index
-  int64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1);
-
-  void *q_base_addr = main_queue()->base_address;
-  // Write the aql packet at the calculated queue index address.
-  ((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask] = aql();
-
-  hsa_signal_store_screlease(main_queue()->doorbell_signal, index);
-
-  // Wait on the dispatch signal until the kernel is finished.
-  while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
-                                   (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
-    ;
-
-  hsa_signal_store_screlease(signal(), 1);
-
-  for (int i = 0; i < 1000; i++) {
-    err = hsa_amd_memory_pool_allocate(device_pool(), out_data_size, 0,
-                                       (void**) &out_data);
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-    args* kern_ptr = NULL;
-    err = hsa_amd_memory_pool_allocate(cpu_pool, sizeof(args), 0,
-                                       (void**) &kern_ptr);
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-    kern_ptr->arg0 = out_data;
-
-    aql().kernarg_address = kern_ptr;
-
-    // Obtain the current queue write index
-    int64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1);
-
-    // Write the aql packet at the calculated queue index address.
-    ((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask] = aql();
-
-    hsa_signal_store_screlease(main_queue()->doorbell_signal, index);
-
-    // Wait on the dispatch signal until the kernel is finished.
-    while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
-                                     (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
-      ;
-
-    hsa_amd_profiling_dispatch_time_t dispatch_time;
-    err = hsa_amd_profiling_get_dispatch_time(*gpu_dev, signal(),
-          &dispatch_time);
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-    uint64_t sys_start = 0;
-    uint64_t sys_end = 0;
-    err = hsa_amd_profiling_convert_tick_to_system_domain(*gpu_dev,
-          dispatch_time.start, &sys_start);
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-    err = hsa_amd_profiling_convert_tick_to_system_domain(*gpu_dev,
-          dispatch_time.end, &sys_end);
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-    uint64_t stamp = dispatch_time.end - dispatch_time.start;
-    double execution_time = (double) stamp / freq * 1e6; // convert to us.
-
-    time_none.push_back(execution_time);
-    time_none_stamp.push_back(stamp);
-
-    hsa_signal_store_screlease(signal(), 1);
-
-    if (out != NULL) {
-      err = hsa_memory_free(out);
-      ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-    }
-
-    out = out_data;
-    out_data = NULL;
-  }
-
-  header = 0;
-  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
-  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
-  header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
-  aql().header = header;
-
-  for (int i = 0; i < 1000; i++) {
-    err = hsa_amd_memory_pool_allocate(device_pool(), out_data_size, 0,
-                                       (void**) &out_data);
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-    args* kern_ptr = NULL;
-    err = hsa_amd_memory_pool_allocate(cpu_pool, sizeof(args), 0,
-                                       (void**) &kern_ptr);
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-    kern_ptr->arg0 = out_data;
-
-    aql().kernarg_address = kern_ptr;
-
-    // Obtain the current queue write index
-    uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1);
-
-    // Write the aql packet at the calculated queue index address.
-    ((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask] = aql();
-
-    hsa_signal_store_screlease(main_queue()->doorbell_signal, index);
-
-    // Wait on the dispatch signal until the kernel is finished.
-    while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
-                                     (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
-      ;
-
-    hsa_signal_store_screlease(signal(), 1);
-
-    hsa_amd_profiling_dispatch_time_t dispatch_time;
-    err = hsa_amd_profiling_get_dispatch_time(*gpu_dev, signal(),
-          &dispatch_time);
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-    uint64_t sys_start = 0;
-    uint64_t sys_end = 0;
-    err = hsa_amd_profiling_convert_tick_to_system_domain(*gpu_dev,
-          dispatch_time.start, &sys_start);
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-    err = hsa_amd_profiling_convert_tick_to_system_domain(*gpu_dev,
-          dispatch_time.end, &sys_end);
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-    uint64_t stamp = dispatch_time.end - dispatch_time.start;
-    double execution_time = (double) stamp / freq * 1e6; // convert to us.
-    time_release.push_back(execution_time);
-    time_release_stamp.push_back(stamp);
-
-    if (out != NULL) {
-      err = hsa_memory_free(out);
-      ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-    }
-
-    out = out_data;
-    out_data = NULL;
-  }
-
-  std::sort(time_none.begin(), time_none.end());
-  std::sort(time_release.begin(), time_release.end());
-
-  time_none.erase(time_none.begin(), time_none.begin() + 50);
-  time_none.erase(time_none.end() - 50, time_none.end());
-  time_release.erase(time_release.begin(), time_release.begin() + 50);
-  time_release.erase(time_release.end() - 50, time_release.end());
-
-  mean_ = rocrtst::CalcMean(time_none, time_release);
-
-  return;
-}
-
-void FlushLatency::Close() {
-  hsa_status_t err;
-  err = rocrtst::CommonCleanUp(this);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-}
-
-void FlushLatency::DisplayResults() const {
-
-  if (!rocrtst::CheckProfile(this)) {
-    return;
-  }
-
-  std::cout << std::endl << "======================================="
-            << std::endl;
-  std::cout << "Average cache flush overhead:     " << mean_ << "uS"
-            << std::endl;
-  std::cout << "=======================================" << std::endl;
-  return;
-}
diff --git a/rocrtst/suites/performance/flush_latency.h b/rocrtst/suites/performance/flush_latency.h
deleted file mode 100755
index 4d4a25fa2d..0000000000
--- a/rocrtst/suites/performance/flush_latency.h
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * =============================================================================
- *   ROC Runtime Conformance Release License
- * =============================================================================
- * The University of Illinois/NCSA
- * Open Source License (NCSA)
- *
- * Copyright (c) 2017, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Developed by:
- *
- *                 AMD Research and AMD ROC Software Development
- *
- *                 Advanced Micro Devices, Inc.
- *
- *                 www.amd.com
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal with the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- *  - Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimers.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimers in
- *    the documentation and/or other materials provided with the distribution.
- *  - Neither the names of <Name of Development Group, Name of Institution>,
- *    nor the names of its contributors may be used to endorse or promote
- *    products derived from this Software without specific prior written
- *    permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS WITH THE SOFTWARE.
- *
- */
-
-#ifndef __ROCRTST_SRC_INC_FLUSH_LATENCY_H__
-#define __ROCRTST_SRC_INC_FLUSH_LATENCY_H__
-
-#include "perf_common/perf_base.h"
-#include "common/base_rocr.h"
-#include "hsa/hsa.h"
-#include <stdio.h>
-
-class FlushLatency: public rocrtst::BaseRocR, public PerfBase {
- public:
-  //@Brief: Constructor
-  FlushLatency();
-
-  //@Brief: Destructor
-  ~FlushLatency();
-
-  //@Brief: Set up the testing environment
-  virtual void SetUp();
-
-  //@Brief: Run the test case
-  virtual void Run();
-
-  //@Brief: Close and clean up  the test enrionment
-  virtual void Close();
-
-  //@Brief: Display  load bandwidth
-  virtual void DisplayResults() const;
-
-  //@Brief: Set work-item configuration
-  void SetWorkItemNum() {
-#ifdef INTERACTIVE
-    uint32_t tmp;
-    printf("Please input the number of CUs you want to try:\n");
-    int i;
-    i = scanf("%d", &num_cus_);
-
-    printf("Please input the number of groups you want to try:\n");
-    i = scanf("%d", &num_group_);
-
-    printf("Please input the size of each group:\n");
-    i = scanf("%d", &tmp);
-    set_group_size(tmp);
-
-    printf("Please input the number of kernel loop you want to try:\n");
-    i = scanf("%d", &kernel_loop_count_);
-#else
-    num_cus_ = 32;
-    num_group_ = 128;
-    group_size_ = 256;
-    kernel_loop_count_ = 16;
-#endif
-    return;
-  }
-
- private:
-  //@Brief: number of work item in one group
-  uint32_t group_size_;
-
-  //@Brief: number of group
-  uint32_t num_group_;
-
-  //@Brief: number of CUs
-  uint32_t num_cus_;
-
-  //@Brief: number of kernel loop
-  uint32_t kernel_loop_count_;
-
-  //@Brief: Mean execution time
-  double mean_;
-
-  //@Brief: data size for test
-  uint64_t data_size_;
-
-};
-
-#endif
-
diff --git a/rocrtst/suites/performance/hsa_info.cc b/rocrtst/suites/performance/hsa_info.cc
deleted file mode 100755
index fc7848e358..0000000000
--- a/rocrtst/suites/performance/hsa_info.cc
+++ /dev/null
@@ -1,502 +0,0 @@
-/*
- * =============================================================================
- *   ROC Runtime Conformance Release License
- * =============================================================================
- * The University of Illinois/NCSA
- * Open Source License (NCSA)
- *
- * Copyright (c) 2017, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Developed by:
- *
- *                 AMD Research and AMD ROC Software Development
- *
- *                 Advanced Micro Devices, Inc.
- *
- *                 www.amd.com
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal with the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- *  - Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimers.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimers in
- *    the documentation and/or other materials provided with the distribution.
- *  - Neither the names of <Name of Development Group, Name of Institution>,
- *    nor the names of its contributors may be used to endorse or promote
- *    products derived from this Software without specific prior written
- *    permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS WITH THE SOFTWARE.
- *
- */
-
-#include "common/base_rocr_utils.h"
-#include "common/common.h"
-#include "gtest/gtest.h"
-#include "hsa_info.h"
-
-static hsa_status_t get_agent_info(hsa_agent_t, void*);
-
-static hsa_status_t get_pool_info(hsa_amd_memory_pool_t, void*);
-
-static int agent_number = 0;
-static bool output_amd = false;
-
-//@Brief: Map to store the peak FLOPS for different agent
-std::map<std::string, double> flops_table = { {"Kaveri CPU", 118.4}, {
-    "S    pectre", 737.0
-  }, {"Carrizo CPU", 67.2}, {"Carrizo GPU", 819.2}
-};
-
-//@Brief: Vector to store the agent_names
-std::vector<std::string> agent_names = {"Kaveri CPU", "Spectre",
-                                        "Carri    zo CPU", "Carrizo GPU"
-                                       };
-
-HsaInfo::HsaInfo() :
-  BaseRocR() {
-}
-
-HsaInfo::~HsaInfo() {
-}
-
-void HsaInfo::SetUp() {
-  // Get Env Var to determine if output AMD specific info
-  char* EnvVar = rocrtst::GetEnv("HSA_VENDOR_AMD");
-
-  if (NULL != EnvVar) {
-    output_amd = ('1' == *EnvVar);
-  }
-
-  if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
-    return;
-  }
-}
-
-void HsaInfo::Run() {
-  hsa_status_t err;
-  // Get the system info first
-  // Get version info
-  uint16_t major, minor;
-
-  if (!rocrtst::CheckProfile(this)) {
-    return;
-  }
-
-  err = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &major);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-  err = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &minor);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  // Get timestamp frequency
-  uint64_t timestamp_frequency = 0;
-  err = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY,
-                            &timestamp_frequency);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  // Get maximum duration of a signal wait operation
-  uint64_t max_wait = 0;
-  err = hsa_system_get_info(HSA_SYSTEM_INFO_SIGNAL_MAX_WAIT, &max_wait);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  // Get Endianness of the system
-  hsa_endianness_t endianness;
-  err = hsa_system_get_info(HSA_SYSTEM_INFO_ENDIANNESS, &endianness);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  // Get machine model info
-  hsa_machine_model_t machine_model;
-  err = hsa_system_get_info(HSA_SYSTEM_INFO_MACHINE_MODEL, &machine_model);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  // Print out the results
-  std::cout << "HSA System Info:" << std::endl;
-  std::cout << "Runtime Version:				" << major <<
-                                                     "." << minor << std::endl;
-  std::cout << "System Timestamp Frequency: 			" <<
-                               timestamp_frequency / 1e6 << "MHz" << std::endl;
-
-  std::cout << "Signal Max Wait Duration:                        " << max_wait
-            << "(number of timestamp)" << std::endl;
-  std::cout << "Machine Model:					";
-
-  if (HSA_MACHINE_MODEL_SMALL == machine_model) {
-    std::cout << "SMALL" << std::endl;
-  }
-  else if (HSA_MACHINE_MODEL_LARGE == machine_model) {
-    std::cout << "LARGE" << std::endl;
-  }
-
-  std::cout << "System Endianness:				";
-
-  if (HSA_ENDIANNESS_LITTLE == endianness) {
-    std::cout << "LITTLE" << std::endl;
-  }
-  else if (HSA_ENDIANNESS_BIG == endianness) {
-    std::cout << "BIG" << std::endl;
-  }
-
-  std::cout << std::endl;
-
-  // Iterate every agent and get their info
-  err = hsa_iterate_agents(get_agent_info, NULL);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-  return;
-
-}
-
-#define RET_IF_HSA_INFO_ERR(err) { \
-  if ((err) != HSA_STATUS_SUCCESS) { \
-    std::cout << "hsa api call failure at line " << __LINE__ << ", file: " << \
-              __FILE__ << std::endl; \
-    return (err); \
-  } \
-}
-
-static hsa_status_t get_agent_info(hsa_agent_t agent, void* data) {
-  int pool_number = 0;
-  hsa_status_t err;
-  {
-    // Increase the number of agent
-    agent_number++;
-
-    // Get agent name and vendor
-    char name[64];
-    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, name);
-    RET_IF_HSA_INFO_ERR(err)
-    char vendor_name[64];
-    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_VENDOR_NAME, &vendor_name);
-    RET_IF_HSA_INFO_ERR(err)
-
-    // Get agent feature
-    hsa_agent_feature_t agent_feature;
-    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_FEATURE, &agent_feature);
-    RET_IF_HSA_INFO_ERR(err)
-
-    // Get profile supported by the agent
-    hsa_profile_t agent_profile;
-    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_profile);
-    RET_IF_HSA_INFO_ERR(err)
-
-    // Get floating-point rounding mode
-    hsa_default_float_rounding_mode_t float_rounding_mode;
-    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEFAULT_FLOAT_ROUNDING_MODE,
-                             &float_rounding_mode);
-    RET_IF_HSA_INFO_ERR(err)
-
-    // Get max number of queue
-    uint32_t max_queue = 0;
-    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUES_MAX, &max_queue);
-    RET_IF_HSA_INFO_ERR(err)
-
-    // Get queue min size
-    uint32_t queue_min_size = 0;
-    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE,
-                             &queue_min_size);
-    RET_IF_HSA_INFO_ERR(err)
-
-    // Get queue max size
-    uint32_t queue_max_size = 0;
-    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE,
-                             &queue_max_size);
-    RET_IF_HSA_INFO_ERR(err)
-
-    // Get queue type
-    hsa_queue_type_t queue_type;
-    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_TYPE, &queue_type);
-    RET_IF_HSA_INFO_ERR(err)
-
-    // Get agent node
-    uint32_t node;
-    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_NODE, &node);
-    RET_IF_HSA_INFO_ERR(err)
-
-    // Get device type
-    hsa_device_type_t device_type;
-    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
-    RET_IF_HSA_INFO_ERR(err)
-
-    // Get cache size
-    uint32_t cache_size[4];
-    err = hsa_agent_get_info(agent, HSA_AGENT_INFO_CACHE_SIZE, cache_size);
-    RET_IF_HSA_INFO_ERR(err)
-
-    // Get chip id
-    uint32_t chip_id = 0;
-    err = hsa_agent_get_info(agent,
-                             (hsa_agent_info_t) HSA_AMD_AGENT_INFO_CHIP_ID,
-                                                                     &chip_id);
-    RET_IF_HSA_INFO_ERR(err)
-
-    // Get cacheline size
-    uint32_t cacheline_size = 0;
-    err = hsa_agent_get_info(agent,
-                         (hsa_agent_info_t) HSA_AMD_AGENT_INFO_CACHELINE_SIZE,
-                                                              &cacheline_size);
-    RET_IF_HSA_INFO_ERR(err)
-
-    // Get Max clock frequency
-    uint32_t max_clock_freq = 0;
-    err = hsa_agent_get_info(agent,
-                    (hsa_agent_info_t) HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY,
-                                                              &max_clock_freq);
-    RET_IF_HSA_INFO_ERR(err)
-
-    // Get Agent BDFID
-    uint16_t bdf_id = 1;
-    err = hsa_agent_get_info(agent, (hsa_agent_info_t) HSA_AMD_AGENT_INFO_BDFID,
-                             &bdf_id);
-    RET_IF_HSA_INFO_ERR(err)
-
-    // Get number of Compute Unit
-    uint32_t compute_unit = 0;
-    err = hsa_agent_get_info(agent,
-                     (hsa_agent_info_t) HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
-                                                                &compute_unit);
-    RET_IF_HSA_INFO_ERR(err)
-
-    // Print out the common results
-    std::cout << std::endl;
-    std::cout << "Agent #" << agent_number << ":" << std::endl;
-    std::cout << "Agent Name:					" << name <<
-                                                                     std::endl;
-    std::cout << "Agent Vendor Name:				" <<
-                                                      vendor_name << std::endl;
-
-    if (agent_feature & HSA_AGENT_FEATURE_KERNEL_DISPATCH
-        && agent_feature & HSA_AGENT_FEATURE_AGENT_DISPATCH)
-      std::cout << "Agent Feature:					KERNEL_DISPATCH & AGENT_DISPATCH"
-                << std::endl;
-    else if (agent_feature & HSA_AGENT_FEATURE_KERNEL_DISPATCH) {
-      std::cout << "Agent Feature:					KERNEL_DISPATCH" << std::endl;
-    }
-    else if (agent_feature & HSA_AGENT_FEATURE_AGENT_DISPATCH) {
-      std::cout << "Agent Feature:					AGENT_DISPATCH" << std::endl;
-    }
-    else {
-      std::cout << "Agent Feature:					Not Supported" << std::endl;
-    }
-
-    if (HSA_PROFILE_BASE == agent_profile) {
-      std::cout << "Agent Profile:					BASE_PROFILE" << std::endl;
-    }
-    else if (HSA_PROFILE_FULL == agent_profile) {
-      std::cout << "Agent Profile:					FULL_PROFILE" << std::endl;
-    }
-    else {
-      std::cout << "Agent Profile:					Not Supported" << std::endl;
-    }
-
-    if (HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO == float_rounding_mode) {
-      std::cout << "Agent Floating Rounding Mode:			ZERO" << std::endl;
-    }
-    else if (HSA_DEFAULT_FLOAT_ROUNDING_MODE_NEAR == float_rounding_mode) {
-      std::cout << "Agent Floating Rounding Mode:			NEAR" << std::endl;
-    }
-    else {
-      std::cout << "Agent Floating Rounding Mode:			Not Supported" << std::endl;
-    }
-
-    std::cout << "Agent Max Queue Number:				" << max_queue << std::endl;
-    std::cout << "Agent Queue Min Size:				" << queue_min_size << std::endl;
-    std::cout << "Agent Queue Max Size:				" << queue_max_size << std::endl;
-
-    if (HSA_QUEUE_TYPE_MULTI == queue_type) {
-      std::cout << "Agent Queue Type:				MULTI" << std::endl;
-    }
-    else if (HSA_QUEUE_TYPE_SINGLE == queue_type) {
-      std::cout << "Agent Queue Type:				SINGLE" << std::endl;
-    }
-    else {
-      std::cout << "Agent Queue Type:				Not Supported" << std::endl;
-    }
-
-    std::cout << "Agent Node:					" << node << std::endl;
-
-    if (HSA_DEVICE_TYPE_CPU == device_type) {
-      std::cout << "Agent Device Type:				CPU" << std::endl;
-    }
-    else if (HSA_DEVICE_TYPE_GPU == device_type) {
-      std::cout << "Agent Device Type:				GPU" << std::endl;
-      // Get ISA info
-      hsa_isa_t agent_isa;
-      err = hsa_agent_get_info(agent, HSA_AGENT_INFO_ISA, &agent_isa);
-      RET_IF_HSA_INFO_ERR(err)
-    }
-    else {
-      std::cout << "Agent Device Type:				DSP" << std::endl;
-    }
-
-    std::cout << "Agent Cache Info:" << std::endl;
-
-    for (int i = 0; i < 4; i++) {
-      if (cache_size[i]) {
-        std::cout << "  $L" << i + 1 << ":						" << cache_size[i] / 1024
-                  << "KB" << std::endl;
-      }
-    }
-
-    std::cout << "Agent Chip ID:					" << chip_id << std::endl;
-    std::cout << "Agent Cacheline Size:				" << cacheline_size << std::endl;
-    std::cout << "Agent Max Clock Frequency:			" << max_clock_freq << "MHz"
-              << std::endl;
-    std::cout << "Agent BDFID:					" << bdf_id << std::endl;
-    std::cout << "Agent Compute Unit:				" << compute_unit << std::endl;
-
-    // Output Peak FLOPS and Peak Bandwidth if Env var is set
-    // TODO: Fan, need to add BW
-    if (output_amd) {
-      std::string agent_name = name;
-
-      for (size_t i = 0; i < agent_names.size(); i++) {
-        if (agent_name.compare(agent_names[i]) == 0)
-          std::cout << "Agent Peak GFLOPS:				" << flops_table[agent_name]
-                    << std::endl;
-      }
-    }
-
-    // Check if the agent is kernel agent
-    if (agent_feature & HSA_AGENT_FEATURE_KERNEL_DISPATCH) {
-
-      // Get flaf of fast_f16 operation
-      bool fast_f16;
-      err = hsa_agent_get_info(agent, HSA_AGENT_INFO_FAST_F16_OPERATION,
-                               &fast_f16);
-      RET_IF_HSA_INFO_ERR(err)
-
-      // Get wavefront size
-      uint32_t wavefront_size = 0;
-      err = hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
-                               &wavefront_size);
-      RET_IF_HSA_INFO_ERR(err)
-
-      // Get max total number of work-items in a workgroup
-      uint32_t workgroup_max_size = 0;
-      err = hsa_agent_get_info(agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
-                               &workgroup_max_size);
-      RET_IF_HSA_INFO_ERR(err)
-
-      // Get max number of work-items of each dimension of a work-group
-      uint16_t workgroup_max_dim[3];
-      err = hsa_agent_get_info(agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
-                               &workgroup_max_dim);
-      RET_IF_HSA_INFO_ERR(err)
-
-      // Get max number of a grid per dimension
-      hsa_dim3_t grid_max_dim;
-      err = hsa_agent_get_info(agent, HSA_AGENT_INFO_GRID_MAX_DIM,
-                               &grid_max_dim);
-      RET_IF_HSA_INFO_ERR(err)
-
-      // Get max total number of work-items in a grid
-      uint32_t grid_max_size = 0;
-      err = hsa_agent_get_info(agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
-                               &grid_max_size);
-      RET_IF_HSA_INFO_ERR(err)
-
-      // Get max number of fbarriers per work group
-      uint32_t fbarrier_max_size = 0;
-      err = hsa_agent_get_info(agent, HSA_AGENT_INFO_FBARRIER_MAX_SIZE,
-                               &fbarrier_max_size);
-      RET_IF_HSA_INFO_ERR(err)
-
-      // Print info for kernel agent
-      if (true == fast_f16) {
-        std::cout << "Agent Fast F16 Operation:			TRUE" <<
-                                                                    std::endl;
-      }
-
-      std::cout << "Agent Wavefront Size:				" <<
-                                                  wavefront_size << std::endl;
-      std::cout << "Agent Workgroup Max Size:			" <<
-                                              workgroup_max_size << std::endl;
-      std::cout <<
-               "Agent Workgroup Max Size Per Dimension:			" <<
-                                                                    std::endl;
-
-      for (int i = 0; i < 3; i++) {
-        std::cout << "  Dim[" << i <<
-            "]:					" << workgroup_max_dim[i] <<
-                                                                    std::endl;
-      }
-
-      std::cout << "Agent Grid Max Size:				" <<
-                                                   grid_max_size << std::endl;
-
-      // Stop using the above kmt functions as per SWDEV-97044
-      //
-      uint32_t waves_per_cu = 0;
-      err = hsa_agent_get_info(agent,
-                        (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU,
-                                                                &waves_per_cu);
-      RET_IF_HSA_INFO_ERR(err)
-      std::cout << "Agent Waves Per CU:				" <<
-                                                     waves_per_cu << std::endl;
-      std::cout << "Agent Max Work-item Per CU:			"
-                << wavefront_size* waves_per_cu << std::endl;
-
-      std::cout << "Agent Grid Max Size per Dimension:" << std::endl;
-
-      for (int i = 0; i < 3; i++) {
-        std::cout << "  Dim[" << i <<
-                                     "]					"
-                 << reinterpret_cast<uint32_t*>(&grid_max_dim)[i] << std::endl;
-      }
-
-      std::cout << "Agent Max number Of fbarriers Per Workgroup:	"
-                << fbarrier_max_size << std::endl;
-    }
-  }
-
-  // Get pool info
-  std::cout << "Agent Pool Info:" << std::endl;
-  err = hsa_amd_agent_iterate_memory_pools(agent, get_pool_info, &pool_number);
-  RET_IF_HSA_INFO_ERR(err)
-
-  return HSA_STATUS_SUCCESS;
-}
-
-// Implement region iteration function
-hsa_status_t get_pool_info(hsa_amd_memory_pool_t pool, void* data) {
-  hsa_status_t err;
-  int* p_int = reinterpret_cast<int*>(data);
-  (*p_int)++;
-
-  std::cout << "  Pool #" << *p_int << ":" << std::endl;
-
-  err = rocrtst::DumpMemoryPoolInfo(pool, 4);
-  RET_IF_HSA_INFO_ERR(err)
-
-  return err;
-}
-
-#undef RET_IF_HSA_INFO_ERR
-
-void HsaInfo::DisplayResults() const {
-  if (!rocrtst::CheckProfile(this)) {
-    return;
-  }
-
-  return;
-}
-
-void HsaInfo::Close() {
-  hsa_status_t err;
-  err = rocrtst::CommonCleanUp(this);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-  return;
-}
-
diff --git a/rocrtst/suites/performance/image_bandwidth.cc b/rocrtst/suites/performance/image_bandwidth.cc
deleted file mode 100755
index 482870ee8e..0000000000
--- a/rocrtst/suites/performance/image_bandwidth.cc
+++ /dev/null
@@ -1,328 +0,0 @@
-/*
- * =============================================================================
- *   ROC Runtime Conformance Release License
- * =============================================================================
- * The University of Illinois/NCSA
- * Open Source License (NCSA)
- *
- * Copyright (c) 2017, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Developed by:
- *
- *                 AMD Research and AMD ROC Software Development
- *
- *                 Advanced Micro Devices, Inc.
- *
- *                 www.amd.com
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal with the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- *  - Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimers.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimers in
- *    the documentation and/or other materials provided with the distribution.
- *  - Neither the names of <Name of Development Group, Name of Institution>,
- *    nor the names of its contributors may be used to endorse or promote
- *    products derived from this Software without specific prior written
- *    permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS WITH THE SOFTWARE.
- *
- */
-
-#include "image_bandwidth.h"
-#include "common/base_rocr_utils.h"
-#include "common/common.h"
-#include "common/hsatimer.h"
-#include "gtest/gtest.h"
-#include "hsa/hsa.h"
-#include "hsa/hsa_ext_image.h"
-#include <stdio.h>
-#include <stdlib.h>
-#include <algorithm>
-
-ImageBandwidth::ImageBandwidth(size_t num) :
-  BaseRocR(), import_bandwidth_ {0.0}, export_bandwidth_ {0.0},
-                                                        copy_bandwidth_ {0.0} {
-  format_.channel_order = HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA;
-  format_.channel_type = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8;
-  geometry_ = HSA_EXT_IMAGE_GEOMETRY_2D;
-
-  set_requires_profile (HSA_PROFILE_FULL);
-}
-
-ImageBandwidth::~ImageBandwidth() {
-}
-
-const size_t ImageBandwidth::Size[10] = {32, 64, 128, 256, 512, 1024, 2048,
-                                         4096, 8192, 16384
-                                        };
-const char* const ImageBandwidth::Str[10] = {"4K", "16K", "64K", "256K", "1M",
-                                             "4M", "16M", "64M", "256M", "1G"
-                                            };
-
-void ImageBandwidth::SetUp() {
-  hsa_status_t err;
-
-  if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
-    return;
-  }
-
-  hsa_agent_t* gpu_dev = gpu_device1();
-
-  // Find the global region
-  err = hsa_amd_agent_iterate_memory_pools(*gpu_dev, rocrtst::FindGlobalPool,
-                                                                  &cpu_pool());
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-}
-
-void ImageBandwidth::Run() {
-  hsa_status_t err;
-
-  if (!rocrtst::CheckProfile(this)) {
-    return;
-  }
-
-  hsa_agent_t* gpu_dev = gpu_device1();
-
-  for (int i = 0; i < 10; i++) {
-    // Create timer for import, export and copy tests
-    rocrtst::PerfTimer import_timer;
-    rocrtst::PerfTimer export_timer;
-    rocrtst::PerfTimer copy_timer;
-    std::vector<double> import_image;
-    std::vector<double> export_image;
-    std::vector<double> copy_image;
-    // Allocate image buffer in host memory
-    uint32_t* image_buffer = NULL;
-    err = hsa_amd_memory_pool_allocate(cpu_pool(),
-                                       Size[i] * Size[i] * sizeof(uint32_t),
-                                                    0, (void**) &image_buffer);
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-    // rocrtst::CommonCleanUp the image buffer
-    for (uint32_t j = 0; j < Size[i] * Size[i]; j++) {
-      image_buffer[j] = 0x10101010;
-    }
-
-    // Prepare for 2D image creation
-    hsa_ext_image_t image_handle;
-
-    hsa_ext_image_descriptor_t image_descriptor;
-    image_descriptor.geometry = geometry_;
-    image_descriptor.width = Size[i];
-    image_descriptor.height = Size[i];
-    image_descriptor.depth = 1;
-    image_descriptor.array_size = 0;
-    image_descriptor.format = format_;
-
-    // Check if device_ supports at least read and write operation on
-    // image format
-    uint32_t capability_mask;
-    err = hsa_ext_image_get_capability(*gpu_dev, HSA_EXT_IMAGE_GEOMETRY_2D,
-                                       &format_, &capability_mask);
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-    if (!(capability_mask & HSA_EXT_IMAGE_CAPABILITY_READ_WRITE)) {
-      std::cout <<
-       "Device does not support read and write operation on this kind of image!"
-                << std::endl;
-      ASSERT_NE(capability_mask & HSA_EXT_IMAGE_CAPABILITY_READ_WRITE, 0);
-    }
-
-    // Get image info
-    hsa_ext_image_data_info_t image_info;
-    err = hsa_ext_image_data_get_info(*gpu_dev, &image_descriptor,
-                                      HSA_ACCESS_PERMISSION_RW, &image_info);
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-    // Allocate memory for image
-    uintptr_t ptr_temp = 0;
-    err = hsa_amd_memory_pool_allocate(cpu_pool(),
-              image_info.size + image_info.alignment, 0, (void**) &ptr_temp);
-
-    // Align the image address
-    uintptr_t mul = ptr_temp / image_info.alignment;
-    void* ptr_image = (void*) ((mul + 1) * image_info.alignment);
-
-    // rocrtst::CommonCleanUp the image to 0
-    hsa_amd_memory_fill(ptr_image, 0, image_info.size);
-
-    // Create image handle
-    err = hsa_ext_image_create(*gpu_dev, &image_descriptor, ptr_image,
-                               HSA_ACCESS_PERMISSION_RW, &image_handle);
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-    // Set import image region
-    hsa_dim3_t range = {(uint32_t) Size[i], (uint32_t) Size[i], 1};
-
-    hsa_ext_image_region_t image_region;
-    hsa_dim3_t image_offset = {0, 0, 0};
-    image_region.offset = image_offset;
-    image_region.range = range;
-
-    size_t iterations = RealIterationNum();
-
-    for (uint32_t it = 0; it < iterations; it++) {
-      // Create a timer
-      int index = import_timer.CreateTimer();
-
-      // Stamp at the beginning
-      import_timer.StartTimer(index);
-
-      // Import image from host
-      err = hsa_ext_image_import(*gpu_dev, image_buffer, 0, 0, image_handle,
-                                 &image_region);
-      ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-      // Stamp in the end
-      import_timer.StopTimer(index);
-      import_image.push_back(import_timer.ReadTimer(index));
-    }
-
-    // Reset image_buffer
-    hsa_amd_memory_fill(image_buffer, 0, Size[i] * Size[i] * sizeof(uint32_t));
-
-    for (uint32_t it = 0; it < iterations; it++) {
-      // Export image
-      // Stamp at the beginning
-      int index = export_timer.CreateTimer();
-      export_timer.StartTimer(index);
-
-      err = hsa_ext_image_export(*gpu_dev, image_handle, image_buffer, 0, 0,
-                                 &image_region);
-      ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-      export_timer.StopTimer(index);
-      export_image.push_back(export_timer.ReadTimer(index));
-
-      // Check if the value is correct
-      for (uint32_t j = 0; j < Size[i] * Size[i]; j++) {
-        ASSERT_EQ(image_buffer[j], 0x10101010);
-      }
-    }
-
-    // Create another image for copy
-    // Allocate memory for image
-    uintptr_t ptr_temp2 = 0;
-    err = hsa_amd_memory_pool_allocate(cpu_pool(),
-              image_info.size + image_info.alignment, 0, (void**) &ptr_temp2);
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-    // Align the image address
-    mul = ptr_temp2 / image_info.alignment;
-    void* ptr_image2 = (void*) ((mul + 1) * image_info.alignment);
-
-    // rocrtst::CommonCleanUp the image to 0
-    hsa_amd_memory_fill(ptr_image2, 0, image_info.size);
-
-    // Create image handle
-    hsa_ext_image_t image_handle_copy;
-    err = hsa_ext_image_create(*gpu_dev, &image_descriptor, ptr_image2,
-                               HSA_ACCESS_PERMISSION_RW, &image_handle_copy);
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-    for (uint32_t it = 0; it < iterations; it++) {
-      // Stamp at the beginning
-      int index = copy_timer.CreateTimer();
-      copy_timer.StartTimer(index);
-
-      err = hsa_ext_image_copy(*gpu_dev, image_handle, &image_offset,
-                               image_handle_copy, &image_offset, &range);
-      ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-      // Stamp in the end
-      copy_timer.StopTimer(index);
-      copy_image.push_back(copy_timer.ReadTimer(index));
-
-      // Check if image data is correct
-      hsa_amd_memory_fill(image_buffer, 0,
-                                      Size[i] * Size[i] * sizeof(uint32_t));
-
-      // Export image
-      err = hsa_ext_image_export(*gpu_dev, image_handle_copy, image_buffer,
-                                 0, 0, &image_region);
-      ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-      // Check if the value is correct
-      for (uint32_t j = 0; j < Size[i] * Size[i]; j++) {
-        ASSERT_EQ(image_buffer[j], 0x10101010);
-      }
-
-    }
-
-    // Calculate Bandwidth
-    import_bandwidth_[i] = CalculateBandwidth(import_image, Size[i]);
-    export_bandwidth_[i] = CalculateBandwidth(export_image, Size[i]);
-    copy_bandwidth_[i] = CalculateBandwidth(copy_image, Size[i]);
-  }
-}
-
-double ImageBandwidth::CalculateBandwidth(std::vector<double>& vec,
-    size_t size) {
-  double mean = 0.0;
-
-  // Delete the first timer result, which is warm up test
-  vec.erase(vec.begin());
-
-  // Sort the results
-  std::sort(vec.begin(), vec.end());
-
-  // Delete the last 20% of the results
-
-  vec.erase(vec.begin() + num_iteration(), vec.end());
-
-  int num = vec.size();
-
-  for (int index = 0; index < num; index++) {
-    mean += vec[index];
-  }
-
-  mean /= num;
-
-  return (double) size * size * 4 / mean / 1024 / 1024 / 1024;
-}
-
-void ImageBandwidth::DisplayResults() const {
-  if (!rocrtst::CheckProfile(this)) {
-    return;
-  }
-
-  fprintf(stdout, "==================================================="
-                                                "=========================\n");
-
-  fprintf(stdout,
-          "  Size        Import                Export                 Copy\n");
-
-  for (int i = 0; i < 10; i++) {
-    fprintf(stdout,
-            "  %s         %f(GB/s)          %f(GB/s)             %f(GB/s)\n",
-            Str[i], import_bandwidth_[i], export_bandwidth_[i],
-                                                           copy_bandwidth_[i]);
-    fprintf(stdout, "================================================="
-                                              "===========================\n");
-  }
-}
-
-void ImageBandwidth::Close() {
-  hsa_status_t err;
-  err = rocrtst::CommonCleanUp(this);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-}
-
-size_t ImageBandwidth::RealIterationNum() {
-  return num_iteration() * 1.2 + 1;
-}
diff --git a/rocrtst/suites/performance/image_bandwidth.h b/rocrtst/suites/performance/image_bandwidth.h
deleted file mode 100755
index 2e28e31a39..0000000000
--- a/rocrtst/suites/performance/image_bandwidth.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * =============================================================================
- *   ROC Runtime Conformance Release License
- * =============================================================================
- * The University of Illinois/NCSA
- * Open Source License (NCSA)
- *
- * Copyright (c) 2017, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Developed by:
- *
- *                 AMD Research and AMD ROC Software Development
- *
- *                 Advanced Micro Devices, Inc.
- *
- *                 www.amd.com
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal with the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- *  - Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimers.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimers in
- *    the documentation and/or other materials provided with the distribution.
- *  - Neither the names of <Name of Development Group, Name of Institution>,
- *    nor the names of its contributors may be used to endorse or promote
- *    products derived from this Software without specific prior written
- *    permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS WITH THE SOFTWARE.
- *
- */
-
-#ifndef __ROCRTST_SRC_IMAGE_BANDWIDTH_H__
-#define __ROCRTST_SRC_IMAGE_BANDWIDTH_H__
-
-#include "perf_common/perf_base.h"
-#include "common/base_rocr.h"
-#include "hsa/hsa.h"
-#include "hsa/hsa_ext_image.h"
-#include <vector>
-
-class ImageBandwidth: public rocrtst::BaseRocR, public PerfBase {
- public:
-  //@Brief: Constructor for test case of ImageBandwidth
-  ImageBandwidth(size_t num = 100);
-
-  //@Brief: Destructor
-  virtual ~ImageBandwidth();
-
-  //@Brief: Setup the environment for measurement
-  virtual void SetUp();
-
-  //@Brief: Core measurement execution
-  virtual void Run();
-
-  //@Brief: Clean up and retrive the resource
-  virtual void Close();
-
-  //@Brief: Display  results
-  virtual void DisplayResults() const;
-
- private:
-  //@Brief: Define image size and corresponding string
-  static const size_t Size[10];
-  static const char* const Str[10];
-
-  //@Brief: Get actual iteration number
-  size_t RealIterationNum();
-
-  //@Brief: Calculate Bandwidth
-  double CalculateBandwidth(std::vector<double>& vec, size_t size);
-
- protected:
-  //@Brief: bandwidth data
-  double import_bandwidth_[10];
-  double export_bandwidth_[10];
-  double copy_bandwidth_[10];
-
-  //@Brief: Image format
-  hsa_ext_image_format_t format_;
-
-  //@Brief: Image geometry
-  hsa_ext_image_geometry_t geometry_;
-};
-
-#endif
diff --git a/rocrtst/suites/performance/image_load_bandwidth.cc b/rocrtst/suites/performance/image_load_bandwidth.cc
deleted file mode 100755
index 33ec707d9d..0000000000
--- a/rocrtst/suites/performance/image_load_bandwidth.cc
+++ /dev/null
@@ -1,270 +0,0 @@
-/*
- * =============================================================================
- *   ROC Runtime Conformance Release License
- * =============================================================================
- * The University of Illinois/NCSA
- * Open Source License (NCSA)
- *
- * Copyright (c) 2017, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Developed by:
- *
- *                 AMD Research and AMD ROC Software Development
- *
- *                 Advanced Micro Devices, Inc.
- *
- *                 www.amd.com
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal with the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- *  - Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimers.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimers in
- *    the documentation and/or other materials provided with the distribution.
- *  - Neither the names of <Name of Development Group, Name of Institution>,
- *    nor the names of its contributors may be used to endorse or promote
- *    products derived from this Software without specific prior written
- *    permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS WITH THE SOFTWARE.
- *
- */
-
-#include "image_load_bandwidth.h"
-#include "common/base_rocr_utils.h"
-#include "common/common.h"
-#include "common/hsatimer.h"
-#include "common/helper_funcs.h"
-#include "gtest/gtest.h"
-#include "hsa/hsa_ext_image.h"
-#include <stdio.h>
-#include <vector>
-
-// Constructor of the class
-ImageLoadBandwidth::ImageLoadBandwidth() :
-  BaseRocR() {
-  load_bandwidth_ = 0.0;
-  image_size_ = 0;
-
-  set_requires_profile (HSA_PROFILE_FULL);
-}
-
-// Destructor of the class
-ImageLoadBandwidth::~ImageLoadBandwidth() {
-
-}
-
-// Set up the environment
-void ImageLoadBandwidth::SetUp() {
-  hsa_agent_t* gpu_dev = gpu_device1();
-
-  set_kernel_file_name("load_2d_image.o");
-  set_kernel_name("&__OpenCL_load_2d_image_kernel");
-
-  if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
-    return;
-  }
-
-  //Create a queue with max number size
-  hsa_queue_t* q = main_queue();
-  rocrtst::CreateQueue(*gpu_dev, &q);
-
-  rocrtst::LoadKernelFromObjFile(this);
-
-  //Fill up part of aql
-  rocrtst::InitializeAQLPacket(this, &aql());
-  aql().setup = 0;
-  aql().setup |= 2 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
-
-  return;
-}
-
-// Run the test
-void ImageLoadBandwidth::Run() {
-  hsa_agent_t* gpu_dev = gpu_device1();
-  hsa_agent_t* cpu_dev = cpu_device();
-
-  hsa_status_t err;
-
-  if (!rocrtst::CheckProfile(this)) {
-    return;
-  }
-
-  hsa_ext_image_descriptor_t image_descriptor;
-  image_descriptor.geometry = HSA_EXT_IMAGE_GEOMETRY_2D;
-  image_descriptor.width = 256;
-  image_descriptor.height = 256;
-  image_descriptor.depth = 1;
-  image_descriptor.array_size = 0;
-  image_descriptor.format.channel_type =
-    HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8;
-  image_descriptor.format.channel_order = HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA;
-
-  hsa_ext_image_format_t image_format;
-  image_format.channel_type = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8;
-  image_format.channel_order = HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA;
-
-  // Check if device_ supports at least read only operation on image format
-  uint32_t capability_mask;
-  err = hsa_ext_image_get_capability(*gpu_dev, HSA_EXT_IMAGE_GEOMETRY_2D,
-                                     &image_format, &capability_mask);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  if (!(capability_mask & HSA_EXT_IMAGE_CAPABILITY_READ_ONLY)) {
-    ASSERT_FALSE(
-     "Device does not support read and write operation on this kind of image!");
-  }
-
-  // Get image info
-  hsa_ext_image_data_info_t image_info;
-  err = hsa_ext_image_data_get_info(*gpu_dev, &image_descriptor,
-                                    HSA_ACCESS_PERMISSION_RO, &image_info);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  image_size_ = image_info.size;
-
-  std::vector<double> time;
-
-  for (uint32_t i = 0; i < num_iteration(); i++) {
-#ifdef DEBUG
-    std::cout << ".";
-    fflush(stdout);
-#endif
-    // Allocate memory space for image
-    // Find the global region
-    err = hsa_amd_agent_iterate_memory_pools(*cpu_dev, rocrtst::FindGlobalPool,
-                                                                   &cpu_pool());
-    ASSERT_EQ(err, HSA_STATUS_INFO_BREAK);
-    uintptr_t ptr_temp = 0;
-    err = hsa_amd_memory_pool_allocate(cpu_pool(),
-                                       image_info.size + image_info.alignment,
-                                                        0, (void**) &ptr_temp);
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-    err = hsa_amd_agents_allow_access(1, gpu_dev, NULL, (void*) ptr_temp);
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-    // Align the image address
-    uintptr_t mul = ptr_temp / image_info.alignment;
-    void* ptr_image = (void*) ((mul + 1) * image_info.alignment);
-
-    // rocrtst::CommonCleanUp the image memory to 1
-    err = hsa_amd_memory_fill(ptr_image, 1, image_info.size);
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-    // Create image handle
-    hsa_ext_image_t image_handle;
-    err = hsa_ext_image_create(*gpu_dev, &image_descriptor, ptr_image,
-                               HSA_ACCESS_PERMISSION_RO, &image_handle);
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-    // Allocate and initialize the kernel argument
-    typedef struct args_t {
-      uint64_t arg0;
-      int* arg1;
-      int istart;
-      int iend;
-      int istep;
-    } args;
-
-    int local_out = 5;
-    int istart = 0;
-    int iend = 64;
-    int istep = 1;
-
-    args* kern_ptr = NULL;
-    err = hsa_amd_memory_pool_allocate(cpu_pool(), sizeof(args), 0,
-                                       (void**) &kern_ptr);
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-    kern_ptr->arg0 = image_handle.handle;
-    kern_ptr->arg1 = &local_out;
-    kern_ptr->istart = istart;
-    kern_ptr->iend = iend;
-    kern_ptr->istep = istep;
-
-    aql().kernarg_address = kern_ptr;
-
-    // Obtain the current queue write index
-    uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1);
-
-    void *q_base_addr = main_queue()->base_address;
-
-    // Write the aql packet at the calculated queue index address.
-    const uint32_t queue_mask = main_queue()->size - 1;
-    ((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask] = aql();
-
-    rocrtst::PerfTimer p_timer;
-    int id = p_timer.CreateTimer();
-    p_timer.StartTimer(id);
-
-    ((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask].header |=
-                     HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
-    hsa_signal_store_release(main_queue()->doorbell_signal, index);
-
-    // Wait on the dispatch signal until the kernel is finished.
-    while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
-                                     (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
-      ;
-
-    p_timer.StopTimer(id);
-
-    time.push_back(p_timer.ReadTimer(id));
-
-    hsa_signal_store_release(signal(), 1);
-
-    err = hsa_ext_image_destroy(*gpu_dev, image_handle);
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-    err = hsa_memory_deregister(ptr_image, image_info.size);
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-    hsa_amd_memory_pool_free((void*) ptr_temp);
-  }
-
-  // Calculte the mean load time
-  time.erase(time.begin());
-#ifdef DEBUG
-
-  for (uint32_t i = 0; i < time.size(); i++) {
-    std::cout << time[i] << std::endl;
-  }
-
-#endif
-  double mean_time = rocrtst::CalcMean(time);
-  load_bandwidth_ = image_size_ / mean_time / 1024 / 1024 / 1024;
-
-}
-
-void ImageLoadBandwidth::Close() {
-  hsa_status_t err;
-  err = rocrtst::CommonCleanUp(this);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-}
-
-void ImageLoadBandwidth::DisplayResults() const {
-  if (!rocrtst::CheckProfile(this)) {
-    return;
-  }
-
-  std::cout << "======================================"
-                         "======================================" << std::endl;
-  std::cout << " Image Size(bytes):              LoadBandwidth(GB/S):    "
-            << std::endl;
-  std::cout << " " << image_size_ << "                                "
-            << load_bandwidth_ << std::endl;
-}
-
diff --git a/rocrtst/suites/performance/image_store_bandwidth.cc b/rocrtst/suites/performance/image_store_bandwidth.cc
deleted file mode 100755
index ea30a620c4..0000000000
--- a/rocrtst/suites/performance/image_store_bandwidth.cc
+++ /dev/null
@@ -1,271 +0,0 @@
-/*
- * =============================================================================
- *   ROC Runtime Conformance Release License
- * =============================================================================
- * The University of Illinois/NCSA
- * Open Source License (NCSA)
- *
- * Copyright (c) 2017, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Developed by:
- *
- *                 AMD Research and AMD ROC Software Development
- *
- *                 Advanced Micro Devices, Inc.
- *
- *                 www.amd.com
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal with the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- *  - Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimers.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimers in
- *    the documentation and/or other materials provided with the distribution.
- *  - Neither the names of <Name of Development Group, Name of Institution>,
- *    nor the names of its contributors may be used to endorse or promote
- *    products derived from this Software without specific prior written
- *    permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS WITH THE SOFTWARE.
- *
- */
-
-#include "image_store_bandwidth.h"
-#include "common/base_rocr_utils.h"
-#include "common/common.h"
-#include "common/helper_funcs.h"
-#include "common/hsatimer.h"
-#include "gtest/gtest.h"
-#include "hsa/hsa_ext_image.h"
-#include <stdio.h>
-#include <vector>
-
-// Constructor of the class
-ImageStoreBandwidth::ImageStoreBandwidth() :
-  BaseRocR() {
-  store_bandwidth_ = 0.0;
-  store_bandwidth_ = 0.0;
-  image_size_ = 0;
-
-  set_requires_profile (HSA_PROFILE_FULL);
-}
-
-// Destructor of the class
-ImageStoreBandwidth::~ImageStoreBandwidth() {
-
-}
-
-// Set up the environment
-void ImageStoreBandwidth::SetUp() {
-
-  set_kernel_file_name("store_2d_image.o");
-  set_kernel_name("&__OpenCL_store_2d_image_kernel");
-
-  if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
-    return;
-  }
-
-  hsa_agent_t* gpu_dev = gpu_device1();
-
-  //Create a queue with max number size
-  hsa_queue_t* q = nullptr;
-  rocrtst::CreateQueue(*gpu_dev, &q);
-  set_main_queue(q);
-
-  rocrtst::LoadKernelFromObjFile(this);
-
-  //Fill up part of aql
-  rocrtst::InitializeAQLPacket(this, &aql());
-  aql().setup = 0;
-  aql().setup |= 2 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
-
-  return;
-}
-
-// Run the test
-void ImageStoreBandwidth::Run() {
-  hsa_status_t err;
-
-  if (!rocrtst::CheckProfile(this)) {
-    return;
-  }
-
-  hsa_agent_t* gpu_dev = gpu_device1();
-  hsa_agent_t* cpu_dev = cpu_device();
-
-  hsa_ext_image_descriptor_t image_descriptor;
-  image_descriptor.geometry = HSA_EXT_IMAGE_GEOMETRY_2D;
-  image_descriptor.width = 256;
-  image_descriptor.height = 256;
-  image_descriptor.depth = 1;
-  image_descriptor.array_size = 0;
-  image_descriptor.format.channel_type =
-    HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8;
-  image_descriptor.format.channel_order = HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA;
-
-  hsa_ext_image_format_t image_format;
-  image_format.channel_type = HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8;
-  image_format.channel_order = HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA;
-
-  // Check if device_ supports at least read only operation on image format
-  uint32_t capability_mask;
-  err = hsa_ext_image_get_capability(*gpu_dev, HSA_EXT_IMAGE_GEOMETRY_2D,
-                                     &image_format, &capability_mask);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  if (!(capability_mask & HSA_EXT_IMAGE_CAPABILITY_READ_ONLY)) {
-    std::cout << 
-     "Device does not support read and write operation on this kind of image!"
-        << std::endl;
-    ASSERT_NE(capability_mask & HSA_EXT_IMAGE_CAPABILITY_READ_ONLY, 0);
-  }
-
-  // Get image info
-  hsa_ext_image_data_info_t image_info;
-  err = hsa_ext_image_data_get_info(*gpu_dev, &image_descriptor,
-                                    HSA_ACCESS_PERMISSION_RW, &image_info);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  image_size_ = image_info.size;
-
-  std::vector<double> time;
-
-  for (uint32_t i = 0; i < num_iteration(); i++) {
-#ifdef DEBUG
-    std::cout << ".";
-    fflush(stdout);
-#endif
-    // Allocate memory space for image
-    err = hsa_amd_agent_iterate_memory_pools(*cpu_dev, rocrtst::FindGlobalPool,
-                                                                   &cpu_pool());
-    ASSERT_EQ(err, HSA_STATUS_INFO_BREAK);
-
-    uintptr_t ptr_temp = 0;
-    err = hsa_amd_memory_pool_allocate(cpu_pool(),
-                                       image_info.size + image_info.alignment,
-                                                         0, (void**) &ptr_temp);
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-    // Align the image address
-    uintptr_t mul = ptr_temp / image_info.alignment;
-    void* ptr_image = (void*) ((mul + 1) * image_info.alignment);
-
-    // rocrtst::CommonCleanUp the image memory to 0
-    err = hsa_amd_memory_fill(ptr_image, 0, image_info.size);
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-    // Create image handle
-    hsa_ext_image_t image_handle;
-    err = hsa_ext_image_create(*gpu_dev, &image_descriptor, ptr_image,
-                               HSA_ACCESS_PERMISSION_RO, &image_handle);
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-    // Allocate and initialize the kernel argument
-    typedef struct args_t {
-      uint64_t arg0;
-      int istart;
-      int iend;
-      int istep;
-    } args;
-
-    //int local_out = 5;
-    int istart = 0;
-    int iend = 64;
-    int istep = 1;
-
-    args* kern_ptr = NULL;
-    err = hsa_amd_memory_pool_allocate(cpu_pool(), sizeof(args), 0,
-                                       (void**) &kern_ptr);
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-    kern_ptr->arg0 = image_handle.handle;
-    kern_ptr->istart = istart;
-    kern_ptr->iend = iend;
-    kern_ptr->istep = istep;
-
-    aql().kernarg_address = kern_ptr;
-
-    // Obtain the current queue write index
-    uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1);
-
-    void *q_base_addr = main_queue()->base_address;
-    // Write the aql packet at the calculated queue index address.
-    const uint32_t queue_mask = main_queue()->size - 1;
-    ((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask] = aql();
-
-    rocrtst::PerfTimer p_timer;
-    int id = p_timer.CreateTimer();
-    p_timer.StartTimer(id);
-
-    ((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask].header |=
-                      HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
-    hsa_signal_store_release(main_queue()->doorbell_signal, index);
-
-    // Wait on the dispatch signal until the kernel is finished.
-    while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
-                                     (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
-      ;
-
-    p_timer.StopTimer(id);
-
-    time.push_back(p_timer.ReadTimer(id));
-
-    hsa_signal_store_release(signal(), 1);
-
-    err = hsa_ext_image_destroy(*gpu_dev, image_handle);
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-    err = hsa_memory_deregister(ptr_image, image_info.size);
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-    hsa_amd_memory_pool_free(reinterpret_cast<void*>(ptr_temp));
-  }
-
-  // Calculte the mean load time
-  time.erase(time.begin());
-#ifdef DEBUG
-
-  for (size_t i = 0; i < time.size(); i++) {
-    std::cout << time[i] << std::endl;
-  }
-
-#endif
-  double mean_time = rocrtst::CalcMean(time);
-  std::cout << "mean time: " << mean_time << std::endl;
-
-  store_bandwidth_ = image_size_ / mean_time / 1024 / 1024 / 1024;
-}
-
-void ImageStoreBandwidth::Close() {
-  hsa_status_t err;
-  err = rocrtst::CommonCleanUp(this);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-}
-
-void ImageStoreBandwidth::DisplayResults() const {
-  if (!rocrtst::CheckProfile(this)) {
-    return;
-  }
-
-  std::cout << "============================================="
-                                "===============================" << std::endl;
-
-  std::cout << " Image Size(bytes):              StoreBandwidth(GB/S):    "
-            << std::cout;
-  std::cout << " " << image_size_ << "                                "
-            << store_bandwidth_ << std::endl;
-}
-
diff --git a/rocrtst/suites/performance/kernels/cu_masking.brig b/rocrtst/suites/performance/kernels/cu_masking.brig
deleted file mode 100644
index bec66be1b1b2e09dbbc1edaafcd6373992e58be1..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1200
zcmb`GFKkm$6vof{LnvEAo~W3Zcp=?F!bHUcA&8D-X_r7k0J(&{>@{nXwr^RBLsTM=
zhzSI8asq*@Kp>C^1QLNjA`$$)_wMbQ&T^8k-?`s+?>YD0ciVlwmp*!S&<U)H%h)CV
zfg)=3U7eZZxogyxNj!Apl)MJ|lk8=lO*7ImLVmYLW;fw%ls`B*9@`D_gC~2PeOUSS
zC>xC7HLx|9_Rg~Dpg>T)t+U?b)o?md-UK^9c^h<Heh3eNKf3%Gc;htb(%ymhKt0>T
zqBw=+=?Haj0WQHOH~tKN1>b<y{{TN-{u}=5@`M5#KsB214%l_$HrxeIfodGWIq)%a
zd5R!k_jg?6J?}ScJ(1^^1VO;OIJt`^-i0@<mb78B(2n>$2dc|dyxYlR#){Zt9k+}i
zib_v+r&#qGR!cjmeR;9n(HyT$uW6CZ(88eFX5GT{o<-$-`}_jGX19`$u(S~W3wguh
zbRXqAIZsN@#S)9s7iho8b#C?T;{GA0M;X4ZCioh4a%z+_<Tn^?GV=BGU1Z8Tf$uXI
zN+6nG9q9gxQ=Z^Bmt028)><Je8)mH|r+KQpu=HMg(Xl*T$?|$xSCw@qniI08xTU|;
zTTp4yLi@1L!fW&A&Vv@-a^FgiT6(X(mpG)S&Ldy<>ch`j-H$#^<z4M-d9SP9oPyG-
Jbya`$_y_wKQj-7x

diff --git a/rocrtst/suites/performance/hsa_info.h b/rocrtst/suites/performance/kernels/dispatch_time_kernels.cl
similarity index 72%
rename from rocrtst/suites/performance/hsa_info.h
rename to rocrtst/suites/performance/kernels/dispatch_time_kernels.cl
index 39c33bc54c..7f7b5e08ba 100755
--- a/rocrtst/suites/performance/hsa_info.h
+++ b/rocrtst/suites/performance/kernels/dispatch_time_kernels.cl
@@ -43,43 +43,8 @@
  *
  */
 
-#ifndef __ROCRTST_SRC_HSA_INFO_H__
-#define __ROCRTST_SRC_HSA_INFO_H__
-
-#include "perf_common/perf_base.h"
-#include "common/base_rocr.h"
-#include "common/common.h"
-#include "common/os.h"
-#include "hsa/hsa.h"
-#include "hsa/hsa_ext_amd.h"
-#include <iostream>
-#include <map>
-#include <string>
-#include <vector>
-
-//@Brief: This is trying to replicate clinfo
-
-class HsaInfo: public rocrtst::BaseRocR, public PerfBase {
- public:
-  //@Brief: Constructor
-  HsaInfo();
-
-  //@Brief: Destructor
-  virtual ~HsaInfo();
-
-  //@Brief: Set up the environment for the test
-  virtual void SetUp();
-
-  //@Brief: Run the test case
-  virtual void Run();
-
-  //@Brief: Display  results we got
-  virtual void DisplayResults() const;
-
-  //@Brief: Clean up and close the runtime
-  virtual void Close();
-
-};
-
-#endif
-
+__kernel void
+empty_kernel(void) {
+  return;
+}
+ 
diff --git a/rocrtst/suites/performance/kernels/empty_kernel.hsail b/rocrtst/suites/performance/kernels/empty_kernel.hsail
deleted file mode 100755
index 9736e413a9..0000000000
--- a/rocrtst/suites/performance/kernels/empty_kernel.hsail
+++ /dev/null
@@ -1,12 +0,0 @@
-module &m:1:0:$full:$large:$default;
-extension "amd:gcn";
-extension "IMAGE";
-
-decl prog function &abort()();
-
-prog kernel &__Empty_kernel()
-{
-
-	ret;
-};
-
diff --git a/rocrtst/suites/performance/kernels/flush_latency.hsail b/rocrtst/suites/performance/kernels/flush_latency.hsail
deleted file mode 100755
index 21ed473d0c..0000000000
--- a/rocrtst/suites/performance/kernels/flush_latency.hsail
+++ /dev/null
@@ -1,88 +0,0 @@
-module &m:1:0:$full:$large:$default;
-
-/* Copyright 2014 HSA Foundation Inc.  All Rights Reserved.
- *
- * HSAF is granting you permission to use this software and documentation (if
- * any) (collectively, the "Materials") pursuant to the terms and conditions
- * of the Software License Agreement included with the Materials.  If you do
- * not have a copy of the Software License Agreement, contact the  HSA Foundation for a copy.
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
- * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE.
- */
-
-extension "amd:gcn";
-extension "IMAGE";
-
-decl prog function &abort()();
-
-
-/**
- * @brief Hsail kernel to benchmark READ accesses to system memory.
- * The kernel is given a input buffer from which each each thread will
- * read. The thread will read from multiple locations of the input buffer.
- * The locations to read from is determined by the work-item Id, the function
- * being work-item Id modulo total number of work-items in the global work grid.
- * So given a global work grid of 16 work-items the reads by a thread with absolute
- * id 4 would be 4, 20, 36, 52, etc.
- *
- * @NOTE: A constraint imposed by the kernel is that the buffer size be large
- * enough to support 16 reads by each thread. So a dispatch of 8 work-items
- * should allocate enough buffer for 8 * 16 * sizeof(uint32_t).
- *
- * @param bufStart beginning byte address of user buffer in system memory
- * from which kernel threads could read
- *
- * @param bufEnd byte address that follows the end of user buffer. Accessing
- * memory at bufEnd is illegal
- *
- * @param addrStep size by which to increment byte address following each read
- * operation. The value represents total number of work-items * sizeof(uint32_t)
- *
- * @param outAddr argument that is passed by the user to be updated with values
- * read by the kernel threads. This is ensure compiler and finalizer do not eliminate
- * code because the values being read are not used in any meaningfule way.
- *
- */
-prog kernel &main(kernarg_u64 %outAddr) {
-
-  pragma  "AMD RTI", "ARGSTART:__SysMemLoad";
-  pragma  "AMD RTI", "version:3:1:104";
-  pragma  "AMD RTI", "device:generic";
-  pragma  "AMD RTI", "uniqueid:1024";
-  pragma  "AMD RTI", "function:1:0";
-  pragma  "AMD RTI", "memory:64bitABI";
-  pragma  "AMD RTI", "uavid:8";
-  pragma  "AMD RTI", "privateid:8";
-  pragma  "AMD RTI", "ARGEND:__SysMemLoad";
-
-  ld_kernarg_u64    $d0, [%outAddr];
-
-  // Compute the absolute id of current thread
-  // and shift it by two to get index into user
-  // buffer to access for Read operation
-  workitemflatabsid_u32  $s0;
-  shl_u32         $s0, $s0, 2;
-  cvt_u64_u32     $d4, $s0;
-
-  // Add index to base address of user buffer to obtain
-  // effective address for access
-  add_u64         $d0, $d0, $d4;
-
-  mov_u32         $s2, 1;
-
-  st_global_u32   $s2, [$d0];
-
-};
-
diff --git a/rocrtst/suites/performance/kernels/flush_latency_base.hsail b/rocrtst/suites/performance/kernels/flush_latency_base.hsail
deleted file mode 100755
index 015614252e..0000000000
--- a/rocrtst/suites/performance/kernels/flush_latency_base.hsail
+++ /dev/null
@@ -1,88 +0,0 @@
-module &m:1:0:$base:$large:$default;
-
-/* Copyright 2014 HSA Foundation Inc.  All Rights Reserved.
- *
- * HSAF is granting you permission to use this software and documentation (if
- * any) (collectively, the "Materials") pursuant to the terms and conditions
- * of the Software License Agreement included with the Materials.  If you do
- * not have a copy of the Software License Agreement, contact the  HSA Foundation for a copy.
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
- * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE.
- */
-
-extension "amd:gcn";
-extension "IMAGE";
-
-decl prog function &abort()();
-
-
-/**
- * @brief Hsail kernel to benchmark READ accesses to system memory.
- * The kernel is given a input buffer from which each each thread will
- * read. The thread will read from multiple locations of the input buffer.
- * The locations to read from is determined by the work-item Id, the function
- * being work-item Id modulo total number of work-items in the global work grid.
- * So given a global work grid of 16 work-items the reads by a thread with absolute
- * id 4 would be 4, 20, 36, 52, etc.
- *
- * @NOTE: A constraint imposed by the kernel is that the buffer size be large
- * enough to support 16 reads by each thread. So a dispatch of 8 work-items
- * should allocate enough buffer for 8 * 16 * sizeof(uint32_t).
- *
- * @param bufStart beginning byte address of user buffer in system memory
- * from which kernel threads could read
- *
- * @param bufEnd byte address that follows the end of user buffer. Accessing
- * memory at bufEnd is illegal
- *
- * @param addrStep size by which to increment byte address following each read
- * operation. The value represents total number of work-items * sizeof(uint32_t)
- *
- * @param outAddr argument that is passed by the user to be updated with values
- * read by the kernel threads. This is ensure compiler and finalizer do not eliminate
- * code because the values being read are not used in any meaningfule way.
- *
- */
-prog kernel &main(kernarg_u64 %outAddr) {
-
-  pragma  "AMD RTI", "ARGSTART:__SysMemLoad";
-  pragma  "AMD RTI", "version:3:1:104";
-  pragma  "AMD RTI", "device:generic";
-  pragma  "AMD RTI", "uniqueid:1024";
-  pragma  "AMD RTI", "function:1:0";
-  pragma  "AMD RTI", "memory:64bitABI";
-  pragma  "AMD RTI", "uavid:8";
-  pragma  "AMD RTI", "privateid:8";
-  pragma  "AMD RTI", "ARGEND:__SysMemLoad";
-
-  ld_kernarg_u64    $d0, [%outAddr];
-
-  // Compute the absolute id of current thread
-  // and shift it by two to get index into user
-  // buffer to access for Read operation
-  workitemflatabsid_u32  $s0;
-  shl_u32         $s0, $s0, 2;
-  cvt_u64_u32     $d4, $s0;
-
-  // Add index to base address of user buffer to obtain
-  // effective address for access
-  add_u64         $d0, $d0, $d4;
-
-  mov_u32         $s2, 1;
-
-  st_global_u32   $s2, [$d0];
-
-};
-
diff --git a/rocrtst/suites/performance/kernels/load_2d_image.hsail b/rocrtst/suites/performance/kernels/load_2d_image.hsail
deleted file mode 100755
index 637c14b273..0000000000
--- a/rocrtst/suites/performance/kernels/load_2d_image.hsail
+++ /dev/null
@@ -1,109 +0,0 @@
-module &m:1:0:$full:$large:$default;
-extension "amd:gcn";
-extension "IMAGE";
-
-decl prog function &abort()();
-
-prog kernel &__OpenCL_load_2d_image_kernel(
-	kernarg_rwimg %input,
-	kernarg_u64 %result,
-	kernarg_u32 %istart,
-	kernarg_u32 %iend,
-	kernarg_u32 %istep)
-{
-	pragma  "AMD RTI", "ARGSTART:__OpenCL_load_2d_image_kernel";
-	pragma  "AMD RTI", "version:3:1:104";
-	pragma  "AMD RTI", "device:generic";
-	pragma  "AMD RTI", "uniqueid:1024";
-	pragma  "AMD RTI", "function:1:0";
-	pragma  "AMD RTI", "memory:64bitABI";
-	pragma  "AMD RTI", "uavid:8";
-	pragma  "AMD RTI", "privateid:8";
-	pragma  "AMD RTI", "ARGEND:__OpenCL_load_2d_image_kernel";
-
-@__OpenCL_load_2d_image_kernel_entry:
-	// BB#0:                                // %entry
-	workitemabsid_u32	$s0, 1;
-	workitemabsid_u32	$s1, 0;
-	ld_kernarg_rwimg $d5, [%input];
-	ld_kernarg_u32 $s2, [%istart];
-	ld_kernarg_u32 $s3, [%iend];
-	ld_kernarg_u32 $s4, [%istep];
-
-    add_u32 $s9, 0, 0; // reset s9 to zero
-@loop:
-    add_u32 $s2, $s2, $s4;
-	
-	ldimage_v4_2d_u32_rwimg_u32	($s5, $s6, $s7, $s8), $d5, ($s1, $s0); //(coordWidth, coordHeight)
-    add_u32 $s9, $s9, $s5;
-
-    //force to retrieve different image elements
-	add_u32 $s1, $s1, 64;
-	and_b32 $s1, $s1, 255;
-	add_u32 $s0, $s0, 64;
-	and_b32 $s0, $s0, 255;
-	
-	ldimage_v4_2d_u32_rwimg_u32	($s5, $s6, $s7, $s8), $d5, ($s1, $s0);
-    add_u32 $s9, $s9, $s6;
-	
-	//force to retrieve different image elements
-	add_u32 $s1, $s1, 64;
-	and_b32 $s1, $s1, 255;
-	add_u32 $s0, $s0, 64;
-	and_b32 $s0, $s0, 255;
-
-	ldimage_v4_2d_u32_rwimg_u32	($s5, $s6, $s7, $s8), $d5, ($s1, $s0);
-    add_u32 $s9, $s9, $s7;
-	
-	//force to retrieve different image elements
-	add_u32 $s1, $s1, 64;
-	and_b32 $s1, $s1, 255;
-	add_u32 $s0, $s0, 64;
-	and_b32 $s0, $s0, 255;
-
-	ldimage_v4_2d_u32_rwimg_u32	($s5, $s6, $s7, $s8), $d5, ($s1, $s0);
-    add_u32 $s9, $s9, $s8;
-	
-	//force to retrieve different image elements
-	add_u32 $s1, $s1, 64;
-	and_b32 $s1, $s1, 255;
-	add_u32 $s0, $s0, 64;
-	and_b32 $s0, $s0, 255;
-
-	ldimage_v4_2d_u32_rwimg_u32	($s5, $s6, $s7, $s8), $d5, ($s1, $s0);
-    add_u32 $s9, $s9, $s5;
-	
-	//force to retrieve different image elements
-	add_u32 $s1, $s1, 64;
-	and_b32 $s1, $s1, 255;
-	add_u32 $s0, $s0, 64;
-	and_b32 $s0, $s0, 255;
-	
-	ldimage_v4_2d_u32_rwimg_u32	($s5, $s6, $s7, $s8), $d5, ($s1, $s0);
-    add_u32 $s9, $s9, $s6;
-	
-	//force to retrieve different image elements
-	add_u32 $s1, $s1, 64;
-	and_b32 $s1, $s1, 255;
-	add_u32 $s0, $s0, 64;
-	and_b32 $s0, $s0, 255;
-
-	ldimage_v4_2d_u32_rwimg_u32	($s5, $s6, $s7, $s8), $d5, ($s1, $s0);
-    add_u32 $s9, $s9, $s7;
-		
-	//force to retrieve different image elements
-	add_u32 $s1, $s1, 64;
-	and_b32 $s1, $s1, 255;
-	add_u32 $s0, $s0, 64;
-	and_b32 $s0, $s0, 255;
-	
-	ldimage_v4_2d_u32_rwimg_u32	($s5, $s6, $s7, $s8), $d5, ($s1, $s0);
-	ld_kernarg_align(8)_width(all)_u64	$d4, [%result];
-    add_u32 $s9, $s9, $s8;
-	
-	st_u32 $s9, [$d4];
-
-//loop until we hit condition
-    cmp_lt_b1_u32 $c0, $s2, $s3;
-    cbr_b1 $c0, @loop;
-};
diff --git a/rocrtst/suites/performance/kernels/simple_kernel.hsail b/rocrtst/suites/performance/kernels/simple_kernel.hsail
deleted file mode 100755
index 063f9ece3c..0000000000
--- a/rocrtst/suites/performance/kernels/simple_kernel.hsail
+++ /dev/null
@@ -1,37 +0,0 @@
-module &m:1:0:$full:$large:$default;
-extension "amd:gcn";
-extension "IMAGE";
-
-decl prog function &abort()();
-
-
-/* This function takes in 2 memory locations, one storing a number of 
- iterations to execute, and the other a place to store a result.
- The function iterates through a loop "iteration" times, and stores
- the number of iterations executed in the "results" location. 
- A successful run is when the value stored in %iteration is the 
- same as the value store in %results.
-*/
-
-prog kernel &__simple_kernel(
-	kernarg_u64 %iteration,
-	kernarg_u64 %results)
-{
-        ret;
-	ld_kernarg_align(8)_width(all)_u64 $d1, [%iteration];
-	ld_kernarg_align(8)_width(all)_u64 $d2, [%results];
-
-	ld_global_u32 $s1, [$d1];
-	mov_u32 $s2, 0;
-
-
-@loop:
-        add_u32 $s2, $s2, 1;
-	cmp_lt_b1_u32 $c0, $s2, $s1;
-	cbr_b1 $c0, @loop;
-
-	st_global_u32 $s2, [$d2];
-	
-	ret;
-};
-
diff --git a/rocrtst/suites/performance/kernels/simple_kernel_base.hsail b/rocrtst/suites/performance/kernels/simple_kernel_base.hsail
deleted file mode 100755
index 0ee7207b2a..0000000000
--- a/rocrtst/suites/performance/kernels/simple_kernel_base.hsail
+++ /dev/null
@@ -1,28 +0,0 @@
-module &m:1:0:$base:$large:$default;
-extension "amd:gcn";
-extension "IMAGE";
-
-decl prog function &abort()();
-
-prog kernel &__simple_kernel(
-	kernarg_u64 %iteration,
-	kernarg_u64 %results)
-{
-	
-	ld_kernarg_align(8)_width(all)_u64 $d1, [%iteration];
-	ld_kernarg_align(8)_width(all)_u64 $d2, [%results];
-
-	ld_global_u32 $s1, [$d1];
-	mov_u32 $s2, 0;
-
-
-@loop:
-        add_u32 $s2, $s2, 1;
-	cmp_lt_b1_u32 $c0, $s2, $s1;
-	cbr_b1 $c0, @loop;
-
-	st_global_u32 $s2, [$d2];
-	
-	ret;
-};
-
diff --git a/rocrtst/suites/performance/kernels/store_2d_image.hsail b/rocrtst/suites/performance/kernels/store_2d_image.hsail
deleted file mode 100755
index b24bdebb14..0000000000
--- a/rocrtst/suites/performance/kernels/store_2d_image.hsail
+++ /dev/null
@@ -1,105 +0,0 @@
-module &m:1:0:$full:$large:$default;
-extension "amd:gcn";
-extension "IMAGE";
-
-decl prog function &abort()();
-
-prog kernel &__OpenCL_store_2d_image_kernel(
-    kernarg_rwimg %output,
-	kernarg_u32 %istart,
-	kernarg_u32 %iend,
-	kernarg_u32 %istep)
-{
-	pragma  "AMD RTI", "ARGSTART:__OpenCL_store_2d_image_kernel";
-	pragma  "AMD RTI", "version:3:1:104";
-	pragma  "AMD RTI", "device:generic";
-	pragma  "AMD RTI", "uniqueid:1024";
-	pragma  "AMD RTI", "function:1:0";
-	pragma  "AMD RTI", "memory:64bitABI";
-	pragma  "AMD RTI", "uavid:8";
-	pragma  "AMD RTI", "privateid:8";
-	pragma  "AMD RTI", "ARGEND:__OpenCL_store_2d_image_kernel";
-
-@__OpenCL_store_2d_image_kernel_entry:
-	// BB#0:                                // %entry
-	workitemabsid_u32	$s0, 1;
-	workitemabsid_u32	$s1, 0;
-    ld_kernarg_rwimg $d5, [%output];
-	ld_kernarg_u32 $s2, [%istart];
-	ld_kernarg_u32 $s3, [%iend];
-	ld_kernarg_u32 $s4, [%istep];
-	
-	mov_b32	$s5, 0;
-@loop:
-    add_u32 $s2, $s2, $s4;
-    add_u32 $s5, $s5, 1;
-	stimage_v4_2d_u32_rwimg_u32	($s5, $s5, $s5, $s5), $d5, ($s1, $s0);
-    
-	//force to retrieve different image elements
-	add_u32 $s1, $s1, 64;
-	and_b32 $s1, $s1, 255;
-	add_u32 $s0, $s0, 64;
-	and_b32 $s0, $s0, 255;
-
-    add_u32 $s5, $s5, $s2;
-	stimage_v4_2d_u32_rwimg_u32	($s5, $s5, $s5, $s5), $d5, ($s1, $s0);
-	
-	//force to retrieve different image elements
-	add_u32 $s1, $s1, 64;
-	and_b32 $s1, $s1, 255;
-	add_u32 $s0, $s0, 64;
-	and_b32 $s0, $s0, 255;
-
-    add_u32 $s5, $s5, $s2;
-	stimage_v4_2d_u32_rwimg_u32	($s5, $s5, $s5, $s5), $d5, ($s1, $s0);
-	
-	//force to retrieve different image elements
-	add_u32 $s1, $s1, 64;
-	and_b32 $s1, $s1, 255;
-	add_u32 $s0, $s0, 64;
-	and_b32 $s0, $s0, 255;
-
-    add_u32 $s5, $s5, $s2;
-	stimage_v4_2d_u32_rwimg_u32	($s5, $s5, $s5, $s5), $d5, ($s1, $s0);
-	
-	//force to retrieve different image elements
-	add_u32 $s1, $s1, 64;
-	and_b32 $s1, $s1, 255;
-	add_u32 $s0, $s0, 64;
-	and_b32 $s0, $s0, 255;
-
-    add_u32 $s5, $s5, $s2;
-	stimage_v4_2d_u32_rwimg_u32	($s5, $s5, $s5, $s5), $d5, ($s1, $s0);
-	
-	//force to retrieve different image elements
-	add_u32 $s1, $s1, 64;
-	and_b32 $s1, $s1, 255;
-	add_u32 $s0, $s0, 64;
-	and_b32 $s0, $s0, 255;
-
-    add_u32 $s5, $s5, $s2;
-	stimage_v4_2d_u32_rwimg_u32	($s5, $s5, $s5, $s5), $d5, ($s1, $s0);
-	
-	//force to retrieve different image elements
-	add_u32 $s1, $s1, 64;
-	and_b32 $s1, $s1, 255;
-	add_u32 $s0, $s0, 64;
-	and_b32 $s0, $s0, 255;
-
-    add_u32 $s5, $s5, $s2;
-	stimage_v4_2d_u32_rwimg_u32	($s5, $s5, $s5, $s5), $d5, ($s1, $s0);
-	
-	//force to retrieve different image elements
-	add_u32 $s1, $s1, 64;
-	and_b32 $s1, $s1, 255;
-	add_u32 $s0, $s0, 64;
-	and_b32 $s0, $s0, 255;
-
-    add_u32 $s5, $s5, $s2;
-	stimage_v4_2d_u32_rwimg_u32	($s5, $s5, $s5, $s5), $d5, ($s1, $s0);
-
-//loop until we hit condition
-    cmp_lt_b1_u32 $c0, $s2, $s3;
-	cbr_b1 $c0, @loop;
-	ret;
-};
diff --git a/rocrtst/suites/performance/kernels/sysMemRead.hsail b/rocrtst/suites/performance/kernels/sysMemRead.hsail
deleted file mode 100755
index bfdb35de7c..0000000000
--- a/rocrtst/suites/performance/kernels/sysMemRead.hsail
+++ /dev/null
@@ -1,237 +0,0 @@
-module &m:1:0:$full:$large:$default;
-
-/* Copyright 2014 HSA Foundation Inc.  All Rights Reserved.
- *
- * HSAF is granting you permission to use this software and documentation (if
- * any) (collectively, the "Materials") pursuant to the terms and conditions
- * of the Software License Agreement included with the Materials.  If you do
- * not have a copy of the Software License Agreement, contact the  HSA Foundation for a copy.
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
- * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE.
- */
-
-extension "amd:gcn";
-extension "IMAGE";
-
-decl prog function &abort()();
-
-
-/**
- * @brief Hsail kernel to benchmark READ accesses to system memory.
- * The kernel is given a input buffer from which each each thread will
- * read. The thread will read from multiple locations of the input buffer.
- * The locations to read from is determined by the work-item Id, the function
- * being work-item Id modulo total number of work-items in the global work grid.
- * So given a global work grid of 16 work-items the reads by a thread with absolute
- * id 4 would be 4, 20, 36, 52, etc.
- *
- * @NOTE: A constraint imposed by the kernel is that the buffer size be large
- * enough to support 16 reads by each thread. So a dispatch of 8 work-items
- * should allocate enough buffer for 8 * 16 * sizeof(uint32_t).
- *
- * @param bufStart beginning byte address of user buffer in system memory
- * from which kernel threads could read
- *
- * @param bufEnd byte address that follows the end of user buffer. Accessing
- * memory at bufEnd is illegal
- *
- * @param addrStep size by which to increment byte address following each read
- * operation. The value represents total number of work-items * sizeof(uint32_t)
- *
- * @param outAddr argument that is passed by the user to be updated with values
- * read by the kernel threads. This is ensure compiler and finalizer do not eliminate
- * code because the values being read are not used in any meaningfule way.
- *
- */
-prog kernel &__SysMemLoad(kernarg_u64 %bufStart,
-                          kernarg_u64 %bufEnd,
-                          kernarg_u64 %addrStep,
-                          kernarg_u64 %outAddr) {
-
-  pragma  "AMD RTI", "ARGSTART:__SysMemLoad";
-  pragma  "AMD RTI", "version:3:1:104";
-  pragma  "AMD RTI", "device:generic";
-  pragma  "AMD RTI", "uniqueid:1024";
-  pragma  "AMD RTI", "function:1:0";
-  pragma  "AMD RTI", "memory:64bitABI";
-  pragma  "AMD RTI", "uavid:8";
-  pragma  "AMD RTI", "privateid:8";
-  pragma  "AMD RTI", "ARGEND:__SysMemLoad";
-
-  // Retrieve the values of input arguments
-  // bufStart refers to the starting byte address
-  // bufEnd refers to the end of byte address
-  // addrStep refers to the product of total number
-  // of work-items in the grid * sizeof(uint32_t)
-  ld_kernarg_u64    $d0, [%bufStart];
-  ld_kernarg_u64    $d1, [%bufEnd];
-  ld_kernarg_u64    $d2, [%addrStep];
-  ld_kernarg_u64    $d3, [%outAddr];
-
-  // Compute the absolute id of current thread
-  // and shift it by two to get index into user
-  // buffer to access for Read operation
-  workitemflatabsid_u32  $s0;
-  shl_u32         $s0, $s0, 2;
-  cvt_u64_u32     $d4, $s0;
-
-  // Add index to base address of user buffer to obtain
-  // effective address for access
-  add_u64         $d0, $d0, $d4;
-  add_u64         $d3, $d3, $d4;
-
-  // Initialize thread's read accumulator to zero
-  mov_u32         $s2, 0;
-
-@loop:
-
-  // Read sixteeen values with a stride that is
-  // determined by the total number of work-items
-  // in the global grid
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  // Update output buffer with values read
-  // from input buffer
-  st_global_u32   $s2, [$d3];
-
-};
-
diff --git a/rocrtst/suites/performance/kernels/sysMemRead_base.hsail b/rocrtst/suites/performance/kernels/sysMemRead_base.hsail
deleted file mode 100755
index 264a194c92..0000000000
--- a/rocrtst/suites/performance/kernels/sysMemRead_base.hsail
+++ /dev/null
@@ -1,237 +0,0 @@
-module &m:1:0:$base:$large:$default;
-
-/* Copyright 2014 HSA Foundation Inc.  All Rights Reserved.
- *
- * HSAF is granting you permission to use this software and documentation (if
- * any) (collectively, the "Materials") pursuant to the terms and conditions
- * of the Software License Agreement included with the Materials.  If you do
- * not have a copy of the Software License Agreement, contact the  HSA Foundation for a copy.
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
- * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE.
- */
-
-extension "amd:gcn";
-extension "IMAGE";
-
-decl prog function &abort()();
-
-
-/**
- * @brief Hsail kernel to benchmark READ accesses to system memory.
- * The kernel is given a input buffer from which each each thread will
- * read. The thread will read from multiple locations of the input buffer.
- * The locations to read from is determined by the work-item Id, the function
- * being work-item Id modulo total number of work-items in the global work grid.
- * So given a global work grid of 16 work-items the reads by a thread with absolute
- * id 4 would be 4, 20, 36, 52, etc.
- *
- * @NOTE: A constraint imposed by the kernel is that the buffer size be large
- * enough to support 16 reads by each thread. So a dispatch of 8 work-items
- * should allocate enough buffer for 8 * 16 * sizeof(uint32_t).
- *
- * @param bufStart beginning byte address of user buffer in system memory
- * from which kernel threads could read
- *
- * @param bufEnd byte address that follows the end of user buffer. Accessing
- * memory at bufEnd is illegal
- *
- * @param addrStep size by which to increment byte address following each read
- * operation. The value represents total number of work-items * sizeof(uint32_t)
- *
- * @param outAddr argument that is passed by the user to be updated with values
- * read by the kernel threads. This is ensure compiler and finalizer do not eliminate
- * code because the values being read are not used in any meaningfule way.
- *
- */
-prog kernel &__SysMemLoad(kernarg_u64 %bufStart,
-                          kernarg_u64 %bufEnd,
-                          kernarg_u64 %addrStep,
-                          kernarg_u64 %outAddr) {
-
-  pragma  "AMD RTI", "ARGSTART:__SysMemLoad";
-  pragma  "AMD RTI", "version:3:1:104";
-  pragma  "AMD RTI", "device:generic";
-  pragma  "AMD RTI", "uniqueid:1024";
-  pragma  "AMD RTI", "function:1:0";
-  pragma  "AMD RTI", "memory:64bitABI";
-  pragma  "AMD RTI", "uavid:8";
-  pragma  "AMD RTI", "privateid:8";
-  pragma  "AMD RTI", "ARGEND:__SysMemLoad";
-
-  // Retrieve the values of input arguments
-  // bufStart refers to the starting byte address
-  // bufEnd refers to the end of byte address
-  // addrStep refers to the product of total number
-  // of work-items in the grid * sizeof(uint32_t)
-  ld_kernarg_u64    $d0, [%bufStart];
-  ld_kernarg_u64    $d1, [%bufEnd];
-  ld_kernarg_u64    $d2, [%addrStep];
-  ld_kernarg_u64    $d3, [%outAddr];
-
-  // Compute the absolute id of current thread
-  // and shift it by two to get index into user
-  // buffer to access for Read operation
-  workitemflatabsid_u32  $s0;
-  shl_u32         $s0, $s0, 2;
-  cvt_u64_u32     $d4, $s0;
-
-  // Add index to base address of user buffer to obtain
-  // effective address for access
-  add_u64         $d0, $d0, $d4;
-  add_u64         $d3, $d3, $d4;
-
-  // Initialize thread's read accumulator to zero
-  mov_u32         $s2, 0;
-
-@loop:
-
-  // Read sixteeen values with a stride that is
-  // determined by the total number of work-items
-  // in the global grid
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  ld_global_u32   $s1, [$d0];
-  add_u32         $s2, $s1, $s2;
-  add_u64         $d0, $d0, $d2;
-
-  // Update output buffer with values read
-  // from input buffer
-  st_global_u32   $s2, [$d3];
-
-};
-
diff --git a/rocrtst/suites/performance/kernels/sysMemWrite.hsail b/rocrtst/suites/performance/kernels/sysMemWrite.hsail
deleted file mode 100755
index 97a83e6105..0000000000
--- a/rocrtst/suites/performance/kernels/sysMemWrite.hsail
+++ /dev/null
@@ -1,105 +0,0 @@
-module &m:1:0:$full:$large:$default;
-extension "amd:gcn";
-extension "IMAGE";
-
-decl prog function &abort()();
-
-prog kernel &__SysMemStore(kernarg_u64 %bufStart,
-                           kernarg_u64 %bufEnd,
-                           kernarg_u64 %addrStep,
-                           kernarg_u64 %deadArg) {
-
-  // Directives for Compiler
-  pragma  "AMD RTI", "ARGSTART:__SysMemStore";
-  pragma  "AMD RTI", "version:3:1:104";
-  pragma  "AMD RTI", "device:generic";
-  pragma  "AMD RTI", "uniqueid:1024";
-  pragma  "AMD RTI", "function:1:0";
-  pragma  "AMD RTI", "memory:64bitABI";
-  pragma  "AMD RTI", "uavid:8";
-  pragma  "AMD RTI", "privateid:8";
-  pragma  "AMD RTI", "ARGEND:__SysMemStore";
-
-  // Retrieve the values of input arguments
-  // bufStart refers to the starting byte address
-  // bufEnd refers to the end of byte address
-  // addrStep refers to the product of total number
-  // of work-items in the grid * sizeof(uint32_t)
-  ld_kernarg_u64     $d0, [%bufStart];
-  ld_kernarg_u64     $d1, [%bufEnd];
-  ld_kernarg_u64     $d2, [%addrStep];
-  ld_kernarg_u64     $d3, [%deadArg];
-
-  // Compute the absolute id of current thread
-  // and shift it by two to get index into user
-  // buffer to access for Write operation
-  workitemflatabsid_u32  $s0;
-  shl_u32            $s0, $s0, 2;
-
-  // Convert the thread id into a 64-bit number
-  // and add it to the starting address of user
-  // buffer to obtain effective address for access
-  cvt_u64_u32     $d4, $s0;
-  add_u64         $d0, $d0, $d4;
-
-
-@loop:
-
-  // Write sixteeen values with a stride that is
-  // determined by the total number of work-items
-  // in the global grid
-
-  st_global_u32   $s0, [$d0];
-  add_u64         $d0, $d0, $d2;
-
-  st_global_u32   $s0, [$d0];
-  add_u64         $d0, $d0, $d2;
-
-  st_global_u32   $s0, [$d0];
-  add_u64         $d0, $d0, $d2;
-
-  st_global_u32   $s0, [$d0];
-  add_u64         $d0, $d0, $d2;
-
-  st_global_u32   $s0, [$d0];
-  add_u64         $d0, $d0, $d2;
-
-  st_global_u32   $s0, [$d0];
-  add_u64         $d0, $d0, $d2;
-
-  st_global_u32   $s0, [$d0];
-  add_u64         $d0, $d0, $d2;
-
-  st_global_u32   $s0, [$d0];
-  add_u64         $d0, $d0, $d2;
-
-  st_global_u32   $s0, [$d0];
-  add_u64         $d0, $d0, $d2;
-
-  st_global_u32   $s0, [$d0];
-  add_u64         $d0, $d0, $d2;
-
-  st_global_u32   $s0, [$d0];
-  add_u64         $d0, $d0, $d2;
-
-  st_global_u32   $s0, [$d0];
-  add_u64         $d0, $d0, $d2;
-
-  st_global_u32   $s0, [$d0];
-  add_u64         $d0, $d0, $d2;
-
-  st_global_u32   $s0, [$d0];
-  add_u64         $d0, $d0, $d2;
-
-  st_global_u32   $s0, [$d0];
-  add_u64         $d0, $d0, $d2;
-
-  st_global_u32   $s0, [$d0];
-  add_u64         $d0, $d0, $d2;
-
-  // Loop until we hit end of buffer [%bufEnd]
-  cmp_lt_b1_u64   $c0, $d0, $d1;
-  cbr_b1          $c0, @loop;
-
-};
-
diff --git a/rocrtst/suites/performance/kernels/sysMemWrite_base.hsail b/rocrtst/suites/performance/kernels/sysMemWrite_base.hsail
deleted file mode 100755
index e2f304fe1d..0000000000
--- a/rocrtst/suites/performance/kernels/sysMemWrite_base.hsail
+++ /dev/null
@@ -1,105 +0,0 @@
-module &m:1:0:$base:$large:$default;
-extension "amd:gcn";
-extension "IMAGE";
-
-decl prog function &abort()();
-
-prog kernel &__SysMemStore(kernarg_u64 %bufStart,
-                           kernarg_u64 %bufEnd,
-                           kernarg_u64 %addrStep,
-                           kernarg_u64 %deadArg) {
-
-  // Directives for Compiler
-  pragma  "AMD RTI", "ARGSTART:__SysMemStore";
-  pragma  "AMD RTI", "version:3:1:104";
-  pragma  "AMD RTI", "device:generic";
-  pragma  "AMD RTI", "uniqueid:1024";
-  pragma  "AMD RTI", "function:1:0";
-  pragma  "AMD RTI", "memory:64bitABI";
-  pragma  "AMD RTI", "uavid:8";
-  pragma  "AMD RTI", "privateid:8";
-  pragma  "AMD RTI", "ARGEND:__SysMemStore";
-
-  // Retrieve the values of input arguments
-  // bufStart refers to the starting byte address
-  // bufEnd refers to the end of byte address
-  // addrStep refers to the product of total number
-  // of work-items in the grid * sizeof(uint32_t)
-  ld_kernarg_u64     $d0, [%bufStart];
-  ld_kernarg_u64     $d1, [%bufEnd];
-  ld_kernarg_u64     $d2, [%addrStep];
-  ld_kernarg_u64     $d3, [%deadArg];
-
-  // Compute the absolute id of current thread
-  // and shift it by two to get index into user
-  // buffer to access for Write operation
-  workitemflatabsid_u32  $s0;
-  shl_u32            $s0, $s0, 2;
-
-  // Convert the thread id into a 64-bit number
-  // and add it to the starting address of user
-  // buffer to obtain effective address for access
-  cvt_u64_u32     $d4, $s0;
-  add_u64         $d0, $d0, $d4;
-
-
-@loop:
-
-  // Write sixteeen values with a stride that is
-  // determined by the total number of work-items
-  // in the global grid
-
-  st_global_u32   $s0, [$d0];
-  add_u64         $d0, $d0, $d2;
-
-  st_global_u32   $s0, [$d0];
-  add_u64         $d0, $d0, $d2;
-
-  st_global_u32   $s0, [$d0];
-  add_u64         $d0, $d0, $d2;
-
-  st_global_u32   $s0, [$d0];
-  add_u64         $d0, $d0, $d2;
-
-  st_global_u32   $s0, [$d0];
-  add_u64         $d0, $d0, $d2;
-
-  st_global_u32   $s0, [$d0];
-  add_u64         $d0, $d0, $d2;
-
-  st_global_u32   $s0, [$d0];
-  add_u64         $d0, $d0, $d2;
-
-  st_global_u32   $s0, [$d0];
-  add_u64         $d0, $d0, $d2;
-
-  st_global_u32   $s0, [$d0];
-  add_u64         $d0, $d0, $d2;
-
-  st_global_u32   $s0, [$d0];
-  add_u64         $d0, $d0, $d2;
-
-  st_global_u32   $s0, [$d0];
-  add_u64         $d0, $d0, $d2;
-
-  st_global_u32   $s0, [$d0];
-  add_u64         $d0, $d0, $d2;
-
-  st_global_u32   $s0, [$d0];
-  add_u64         $d0, $d0, $d2;
-
-  st_global_u32   $s0, [$d0];
-  add_u64         $d0, $d0, $d2;
-
-  st_global_u32   $s0, [$d0];
-  add_u64         $d0, $d0, $d2;
-
-  st_global_u32   $s0, [$d0];
-  add_u64         $d0, $d0, $d2;
-
-  // Loop until we hit end of buffer [%bufEnd]
-  cmp_lt_b1_u64   $c0, $d0, $d1;
-  cbr_b1          $c0, @loop;
-
-};
-
diff --git a/rocrtst/suites/performance/image_store_bandwidth.h b/rocrtst/suites/performance/kernels/test_case_template_kernels.cl
similarity index 72%
rename from rocrtst/suites/performance/image_store_bandwidth.h
rename to rocrtst/suites/performance/kernels/test_case_template_kernels.cl
index 6de0d9f860..b7408570f5 100755
--- a/rocrtst/suites/performance/image_store_bandwidth.h
+++ b/rocrtst/suites/performance/kernels/test_case_template_kernels.cl
@@ -43,40 +43,12 @@
  *
  */
 
-#ifndef __ROCRTST_SRC_INC_IMAGE_STORE_BANDWIDTH_H__
-#define __ROCRTST_SRC_INC_IMAGE_STORE_BANDWIDTH_H__
-
-#include "perf_common/perf_base.h"
-#include "common/base_rocr.h"
-#include "hsa/hsa.h"
-
-class ImageStoreBandwidth: public rocrtst::BaseRocR, public PerfBase {
- public:
-  //@Brief: Constructor
-  ImageStoreBandwidth();
-
-  //@Brief: Destructor
-  ~ImageStoreBandwidth();
-
-  //@Brief: Set up the test environment
-  virtual void SetUp();
-
-  //@Brief: Run the actual testing
-  virtual void Run();
-
-  //@Brief: Clean up the test environment
-  virtual void Close();
-
-  //@Brief: Display  results
-  virtual void DisplayResults() const;
-
- private:
-  //@Brief: Image Store Bandwidth
-  double store_bandwidth_;
-
-  //@Brief: Image size
-  size_t image_size_;
-};
-
-#endif //__ROCRTST_SRC_INC_IMAGE_STORE_BANDWIDTH_H__
-
+// Element-wise square: dstArray[i] = srcArray[i]^2 for work-items i < sz.
+__kernel void
+square(__global int *dstArray, __global const int *srcArray, const int sz) {
+  // Keep the id as size_t and reject sz <= 0 explicitly: comparing an
+  // unsigned id with a negative int would convert sz to a huge bound.
+  const size_t id = get_global_id(0);
+  if (sz > 0 && id < (size_t)sz)
+    dstArray[id] = srcArray[id] * srcArray[id];
+}
diff --git a/rocrtst/suites/performance/kernels/test_kernel.hsail b/rocrtst/suites/performance/kernels/test_kernel.hsail
deleted file mode 100755
index 7c8587b213..0000000000
--- a/rocrtst/suites/performance/kernels/test_kernel.hsail
+++ /dev/null
@@ -1,53 +0,0 @@
-module &m:1:0:$full:$large:$default;
-extension "amd:gcn";
-extension "IMAGE";
-
-decl prog function &abort()();
-
-prog kernel &__OpenCL_vec_assign_kernel(
-	kernarg_u64 %buf,
-	kernarg_u32 %num)
-{
-	pragma  "AMD RTI", "ARGSTART:__OpenCL_vec_assign_kernel";
-	pragma  "AMD RTI", "version:3:1:104";
-	pragma  "AMD RTI", "device:generic";
-	pragma  "AMD RTI", "uniqueid:1024";
-	pragma  "AMD RTI", "function:1:0";
-	pragma  "AMD RTI", "memory:64bitABI";
-	pragma  "AMD RTI", "uavid:8";
-	pragma  "AMD RTI", "privateid:8";
-	pragma  "AMD RTI", "ARGEND:__OpenCL_vec_assign_kernel";
-
-@__OpenCL_vec_assign_kernel_entry:
-	// BB#0:                                // %entry
-	ld_kernarg_align(8)_width(all)_u64	$d0, [%buf];
-	ld_global_u32	$s1, [$d0];
-	ld_kernarg_align(4)_width(all)_u32	$s0, [%num];
-	cmp_ge_b1_s32	$c0, $s1, $s0;
-	cbr_b1	$c0, @BB0_4;
-	// BB#1:                                // %while.body.lr.ph
-	workitemabsid_u32	$s1, 0;
-	cmp_eq_b1_s32	$c0, $s1, 0;
-	cbr_b1	$c0, @BB0_2;
-
-@BB0_3:
-	// %while.cond.backedge
-	ld_global_u32	$s1, [$d0];
-	cmp_lt_b1_s32	$c0, $s1, $s0;
-	cbr_b1	$c0, @BB0_3;
-	br	@BB0_4;
-
-@BB0_2:
-	// %while.cond.backedge.us
-	ld_global_u32	$s1, [$d0];
-	add_u32	$s1, $s1, 1;
-	st_global_u32	$s1, [$d0];
-	ld_global_u32	$s1, [$d0];
-	cmp_lt_b1_s32	$c0, $s1, $s0;
-	cbr_b1	$c0, @BB0_2;
-
-@BB0_4:
-	// %while.end
-	ret;
-};
-
diff --git a/rocrtst/suites/performance/kernels/transpose_kernel.hsail b/rocrtst/suites/performance/kernels/transpose_kernel.hsail
deleted file mode 100755
index b29b30d8b7..0000000000
--- a/rocrtst/suites/performance/kernels/transpose_kernel.hsail
+++ /dev/null
@@ -1,108 +0,0 @@
-module &m:1:0:$full:$large:$default;
-extension "amd:gcn";
-extension "IMAGE";
-
-decl prog function &abort()();
-
-prog kernel &__OpenCL_matrixTranspose_kernel(
-	kernarg_u64 %__global_offset_0,
-	kernarg_u64 %__global_offset_1,
-	kernarg_u64 %__global_offset_2,
-	kernarg_u64 %__printf_buffer,
-	kernarg_u64 %__vqueue_pointer,
-	kernarg_u64 %__aqlwrap_pointer,
-	kernarg_u64 %inBuf,
-	kernarg_u64 %outBuf,
-	kernarg_u64 %localBuf,
-	kernarg_u32 %blockSize,
-	kernarg_u32 %width,
-	kernarg_u32 %height)
-{
-	pragma  "AMD RTI", "ARGSTART:__OpenCL_matrixTranspose_kernel";
-	pragma  "AMD RTI", "version:3:1:104";
-	pragma  "AMD RTI", "device:generic";
-	pragma  "AMD RTI", "uniqueid:1024";
-	pragma  "AMD RTI", "memory:private:0";
-	pragma  "AMD RTI", "memory:region:0";
-	pragma  "AMD RTI", "memory:local:0";
-	pragma  "AMD RTI", "value:__global_offset_0:u64:1:1:0";
-	pragma  "AMD RTI", "value:__global_offset_1:u64:1:1:16";
-	pragma  "AMD RTI", "value:__global_offset_2:u64:1:1:32";
-	pragma  "AMD RTI", "pointer:__printf_buffer:u8:1:1:48:uav:7:1:RW:0:0:0";
-	pragma  "AMD RTI", "value:__vqueue_pointer:u64:1:1:64";
-	pragma  "AMD RTI", "value:__aqlwrap_pointer:u64:1:1:80";
-	pragma  "AMD RTI", "pointer:inBuf:u32:1:1:96:uav:7:4:RW:0:1:0";
-	pragma  "AMD RTI", "pointer:outBuf:u32:1:1:112:uav:7:4:RW:0:1:0";
-	pragma  "AMD RTI", "pointer:localBuf:u32:1:1:128:l:7:4:RW:0:0:0";
-	pragma  "AMD RTI", "value:blockSize:u32:1:1:144";
-	pragma  "AMD RTI", "value:width:u32:1:1:160";
-	pragma  "AMD RTI", "value:height:u32:1:1:176";
-	pragma  "AMD RTI", "function:1:0";
-	pragma  "AMD RTI", "memory:64bitABI";
-	pragma  "AMD RTI", "privateid:8";
-	pragma  "AMD RTI", "enqueue_kernel:0";
-	pragma  "AMD RTI", "kernel_index:0";
-	pragma  "AMD RTI", "reflection:0:size_t";
-	pragma  "AMD RTI", "reflection:1:size_t";
-	pragma  "AMD RTI", "reflection:2:size_t";
-	pragma  "AMD RTI", "reflection:3:size_t";
-	pragma  "AMD RTI", "reflection:4:size_t";
-	pragma  "AMD RTI", "reflection:5:size_t";
-	pragma  "AMD RTI", "reflection:6:uint*";
-	pragma  "AMD RTI", "reflection:7:uint*";
-	pragma  "AMD RTI", "reflection:8:uint*";
-	pragma  "AMD RTI", "reflection:9:uint";
-	pragma  "AMD RTI", "reflection:10:uint";
-	pragma  "AMD RTI", "reflection:11:uint";
-	pragma  "AMD RTI", "ARGEND:__OpenCL_matrixTranspose_kernel";
-
-@__OpenCL_matrixTranspose_kernel_entry:
-	// BB#0:                                // %entry
-	workitemid_u32	$s0, 1;
-	ld_kernarg_align(4)_width(all)_u32	$s1, [%blockSize];
-	workitemid_u32	$s2, 0;
-	mad_u32	$s3, $s2, $s1, $s0;
-	cvt_u64_u32	$d1, $s3;
-	workitemabsid_u32	$s3, 0;
-	cvt_u64_u32	$d0, $s3;
-	ld_kernarg_align(8)_width(all)_u64	$d2, [%__global_offset_0];
-	add_u64	$d0, $d0, $d2;
-	workitemabsid_u32	$s5, 1;
-	workgroupid_u32	$s4, 0;
-	workgroupid_u32	$s3, 1;
-	shl_u64	$d1, $d1, 2;
-	mad_u32	$s3, $s3, $s1, $s2;
-	mad_u32	$s4, $s4, $s1, $s0;
-	cvt_u64_u32	$d2, $s5;
-	ld_kernarg_align(8)_width(all)_u64	$d3, [%__global_offset_1];
-	cvt_u32_u64	$s5, $d0;
-	add_u64	$d0, $d2, $d3;
-	cvt_u32_u64	$s6, $d0;
-	ld_kernarg_align(4)_width(all)_u32	$s7, [%width];
-	ld_kernarg_align(8)_width(all)_u64	$d0, [%localBuf];
-	ld_kernarg_align(4)_width(all)_u32	$s8, [%height];
-	mad_u32	$s3, $s4, $s8, $s3;
-	add_u64	$d1, $d0, $d1;
-	cvt_u32_u64	$s4, $d1;
-	mad_u32	$s5, $s6, $s7, $s5;
-	cvt_u64_u32	$d1, $s5;
-	shl_u64	$d2, $d1, 2;
-	ld_kernarg_align(8)_width(all)_u64	$d1, [%outBuf];
-	ld_kernarg_align(8)_width(all)_u64	$d3, [%inBuf];
-	add_u64	$d2, $d3, $d2;
-	ld_global_align(4)_u32	$s5, [$d2];
-	st_group_align(4)_u32	$s5, [$s4];
-	cvt_u64_u32	$d2, $s3;
-	shl_u64	$d2, $d2, 2;
-	add_u64	$d1, $d1, $d2;
-	mad_u32	$s0, $s0, $s1, $s2;
-	cvt_u64_u32	$d2, $s0;
-	shl_u64	$d2, $d2, 2;
-	add_u64	$d0, $d0, $d2;
-	cvt_u32_u64	$s0, $d0;
-	barrier;
-	ld_group_align(4)_u32	$s0, [$s0];
-	st_global_align(4)_u32	$s0, [$d1];
-	ret;
-};
-
diff --git a/rocrtst/suites/performance/kernels/vector_copy.hsail b/rocrtst/suites/performance/kernels/vector_copy.hsail
deleted file mode 100755
index 79c2bb0708..0000000000
--- a/rocrtst/suites/performance/kernels/vector_copy.hsail
+++ /dev/null
@@ -1,34 +0,0 @@
-module &m:1:0:$full:$large:$default;
-extension "amd:gcn";
-extension "IMAGE";
-
-decl prog function &abort()();
-
-prog kernel &__vector_copy_kernel(
-	kernarg_u64 %a,
-	kernarg_u64 %b)
-{
-	pragma  "AMD RTI", "ARGSTART:__vector_copy_kernel";
-	pragma  "AMD RTI", "version:3:1:104";
-	pragma  "AMD RTI", "device:generic";
-	pragma  "AMD RTI", "uniqueid:1024";
-	pragma  "AMD RTI", "function:1:0";
-	pragma  "AMD RTI", "memory:64bitABI";
-	pragma  "AMD RTI", "uavid:8";
-	pragma  "AMD RTI", "privateid:8";
-	pragma  "AMD RTI", "ARGEND:__vector_copy_kernel";
-
-@__vector_copy_kernel_entry:
-	// BB#0:                                // %entry
-	workitemabsid_u32	$s0, 0;
-	cvt_s64_s32	$d0, $s0;
-	shl_u64	$d0, $d0, 2;
-	ld_kernarg_align(8)_width(all)_u64	$d1, [%b];
-	add_u64	$d1, $d1, $d0;
-	ld_kernarg_align(8)_width(all)_u64	$d2, [%a];
-	add_u64	$d0, $d2, $d0;
-	ld_global_u32	$s0, [$d0];
-	st_global_u32	$s0, [$d1];
-	ret;
-};
-
diff --git a/rocrtst/suites/performance/kernels/vector_copy_base.hsail b/rocrtst/suites/performance/kernels/vector_copy_base.hsail
deleted file mode 100755
index 6a3a1572d8..0000000000
--- a/rocrtst/suites/performance/kernels/vector_copy_base.hsail
+++ /dev/null
@@ -1,64 +0,0 @@
-module &m:1:0:$base:$large:$default;
-
-////////////////////////////////////////////////////////////////////////////////
-//
-// The University of Illinois/NCSA
-// Open Source License (NCSA)
-//
-// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
-//
-// Developed by:
-//
-//                 AMD Research and AMD HSA Software Development
-//
-//                 Advanced Micro Devices, Inc.
-//
-//                 www.amd.com
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to
-// deal with the Software without restriction, including without limitation
-// the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following conditions:
-//
-//  - Redistributions of source code must retain the above copyright notice,
-//    this list of conditions and the following disclaimers.
-//  - Redistributions in binary form must reproduce the above copyright
-//    notice, this list of conditions and the following disclaimers in
-//    the documentation and/or other materials provided with the distribution.
-//  - Neither the names of Advanced Micro Devices, Inc,
-//    nor the names of its contributors may be used to endorse or promote
-//    products derived from this Software without specific prior written
-//    permission.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
-// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-// DEALINGS WITH THE SOFTWARE.
-//
-////////////////////////////////////////////////////////////////////////////////
-
-
-decl prog function &abort()();
-
-prog kernel &__vector_copy_kernel(
-	kernarg_u64 %in,
-	kernarg_u64 %out)
-{
-@__vector_copy_kernel_entry:
-	// BB#0:                                // %entry
-	workitemabsid_u32	$s0, 0;
-	cvt_s64_s32	$d0, $s0;
-	shl_u64	$d0, $d0, 2;
-	ld_kernarg_align(8)_width(all)_u64	$d1, [%out];
-	add_u64	$d1, $d1, $d0;
-	ld_kernarg_align(8)_width(all)_u64	$d2, [%in];
-	add_u64	$d0, $d2, $d0;
-	ld_global_u32	$s0, [$d0];
-	st_global_u32	$s0, [$d1];
-	ret;
-};
diff --git a/rocrtst/suites/performance/kernels/vector_copy_full.hsail b/rocrtst/suites/performance/kernels/vector_copy_full.hsail
deleted file mode 100755
index 07872eeac3..0000000000
--- a/rocrtst/suites/performance/kernels/vector_copy_full.hsail
+++ /dev/null
@@ -1,64 +0,0 @@
-module &m:1:0:$full:$large:$default;
-
-////////////////////////////////////////////////////////////////////////////////
-//
-// The University of Illinois/NCSA
-// Open Source License (NCSA)
-//
-// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved.
-//
-// Developed by:
-//
-//                 AMD Research and AMD HSA Software Development
-//
-//                 Advanced Micro Devices, Inc.
-//
-//                 www.amd.com
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to
-// deal with the Software without restriction, including without limitation
-// the rights to use, copy, modify, merge, publish, distribute, sublicense,
-// and/or sell copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following conditions:
-//
-//  - Redistributions of source code must retain the above copyright notice,
-//    this list of conditions and the following disclaimers.
-//  - Redistributions in binary form must reproduce the above copyright
-//    notice, this list of conditions and the following disclaimers in
-//    the documentation and/or other materials provided with the distribution.
-//  - Neither the names of Advanced Micro Devices, Inc,
-//    nor the names of its contributors may be used to endorse or promote
-//    products derived from this Software without specific prior written
-//    permission.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
-// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-// DEALINGS WITH THE SOFTWARE.
-//
-////////////////////////////////////////////////////////////////////////////////
-
-
-decl prog function &abort()();
-
-prog kernel &__vector_copy_kernel(
-	kernarg_u64 %in,
-	kernarg_u64 %out)
-{
-@__vector_copy_kernel_entry:
-	// BB#0:                                // %entry
-	workitemabsid_u32	$s0, 0;
-	cvt_s64_s32	$d0, $s0;
-	shl_u64	$d0, $d0, 2;
-	ld_kernarg_align(8)_width(all)_u64	$d1, [%out];
-	add_u64	$d1, $d1, $d0;
-	ld_kernarg_align(8)_width(all)_u64	$d2, [%in];
-	add_u64	$d0, $d2, $d0;
-	ld_global_u32	$s0, [$d0];
-	st_global_u32	$s0, [$d1];
-	ret;
-};
diff --git a/rocrtst/suites/performance/main.cc b/rocrtst/suites/performance/main.cc
old mode 100644
new mode 100755
index b5a14ed4eb..36bab14d5a
--- a/rocrtst/suites/performance/main.cc
+++ b/rocrtst/suites/performance/main.cc
@@ -43,238 +43,85 @@
  *
  */
 
-#include "cp_process_time.h"
-#include "cu_masking.h"
-#include "device_load_bandwidth.h"
-#include "device_store_bandwidth.h"
-#include "dispatch_time.h"
-#include "flush_latency.h"
 #include "gtest/gtest.h"
-#include "hsa_info.h"
-#include "image_bandwidth.h"
-#include "image_load_bandwidth.h"
-#include "image_store_bandwidth.h"
-#include "matrix_transpose.h"
-#include "memory_copy.h"
-#include "memory_allocation.h"
-#include "memory_async_copy.h"
-#include "queue_concurrency.h"
-#include "queue_create_destroy_latency.h"
-#include "system_load_bandwidth.h"
-#include "system_store_bandwidth.h"
-#include "vector_copy.h"
+#include "suites/performance/dispatch_time.h"
+#include "suites/performance/memory_async_copy.h"
+#include "suites/performance/test_case_template.h"
+#include "suites/performance/main.h"
+#include "suites/test_common/test_common.h"
 
-/**
- * Try to order tests from fastest running to slowest running.
- */
+static uint32_t sRocrTstOptVerbosity = 1;
+static uint32_t sRocrTestOptIterations = 0;
 
-// DisplayResultsResults HSA system information first.
-TEST(rocrtst, Feature_Hsa_Info) {
-  HsaInfo hi;
-  hi.SetUp();
-  hi.Run();
-  hi.Close();
+static void RunTest(TestBase *test) {  // Common driver shared by the TESTs.
+  test->set_verbosity(sRocrTstOptVerbosity);
+  // Zero (the initial value) leaves the test's own iteration count in place.
+  if (sRocrTestOptIterations) {
+    test->set_num_iteration(sRocrTestOptIterations);
+  }
+  test->DisplayTestInfo();
+  test->SetUp();
+  test->Run();
+  test->DisplayResults();
+  test->Close();
+
+  return;
 }
 
-// Requires HSA_PFOFILE_FULL
-TEST(rocrtst, Perf_Image_Store_Bandwidth) {
-  ImageStoreBandwidth isb;
-  isb.SetUp();
-  isb.Run();
-  isb.DisplayResults();
-  isb.Close();
+// TEST ENTRY TEMPLATE:
+// TEST(rocrtst, Perf_<test name>) {
+//  <Test Implementation class> <test_obj>;
+//
+//  // Copy and modify implementation of RunTest() if you need to deviate
+//  // from the standard pattern implemented there.
+//  RunTest(&<test_obj>);
+// }
+
+TEST(rocrtst, Test_Example) {
+  TestExample tst;
+  RunTest(&tst);  // Uses the common RunTest() sequence unchanged.
 }
 
-// Requires HSA_PFOFILE_FULL
-TEST(rocrtst, Perf_Image_Load_Bandwidth) {
-  ImageLoadBandwidth ilb;
-  ilb.SetUp();
-  ilb.Run();
-  ilb.DisplayResults();
-  ilb.Close();
+TEST(rocrtst, Perf_Memory_Async_Copy) {
+  MemoryAsyncCopy mac;
+  // To run the full test, uncomment this line:
+  //  mac.set_full_test(true);
+  // To test only one path, add lines like these:
+  //  mac.set_src_pool(<src pool id>);
+  //  mac.set_dst_pool(<dst pool id>);
+  // By default, the test covers copies to and from the CPU and one GPU, and
+  // to and from one GPU and another GPU.
+  RunTest(&mac);
 }
 
-// Requires HSA_PFOFILE_FULL
-TEST(rocrtst, Perf_Image_Bandwidth) {
-  ImageBandwidth ib;
-  ib.SetUp();
-  ib.Run();
-  ib.DisplayResults();
-  ib.Close();
-}
-
-// Requires HSA_PFOFILE_FULL
-TEST(rocrtst, Perf_Queue_Concurrency) {
-  QueueConcurrency mc;
-  mc.SetUp();
-  mc.Run();
-  mc.DisplayResults();
-  mc.Close();
-}
-
-TEST(rocrtst, Feature_Cu_Masking) {
-  CuMasking cm;
-  cm.SetUp();
-  cm.Run();
-  cm.Close();
-}
-
-TEST(rocrtst, Perf_Flush_Latency) {
-  FlushLatency fl;
-  fl.SetUp();
-  fl.Run();
-  fl.DisplayResults();
-  fl.Close();
-}
-
-// This test apparently has some sort of memory bounds overwrite
-// issue with the out_data_ buffer. Commenting out the free of
-// out_data_ avoids the problem. Left uncommented, a crash will
-// occur immediately or some time after.
-TEST(rocrtst, DISABLED_Perf_Device_Memory_Store_Bandwidth) {
-  DeviceStoreBandwidth slb;
-  slb.SetUp();
-  slb.Run();
-  slb.DisplayResults();
-  slb.Close();
-}
-
-// This test apparently has some sort of memory bounds overwrite
-// issue with the out_data_ buffer. Commenting out the free of
-// out_data_ avoids the problem. Left uncommented, a crash will
-// occur immediately or some time after.
-TEST(rocrtst, DISABLED_Perf_Device_Memory_Load_Bandwidth) {
-  DeviceLoadBandwidth slb;
-  slb.SetUp();
-  slb.Run();
-  slb.DisplayResults();
-  slb.Close();
-}
 TEST(rocrtst, Perf_Dispatch_Time_Single_SpinWait) {
-  DispatchTime dt;
-  dt.set_num_iteration(100);
-  dt.UseDefaultSignal(true);
-  dt.LaunchSingleKernel(true);
-  dt.SetUp();
-  dt.Run();
-  dt.DisplayResults();
-  dt.Close();
+  DispatchTime dt(true, true);  // presumably (default signal, single kernel)
+  RunTest(&dt);
 }
 
 TEST(rocrtst, Perf_Dispatch_Time_Single_Interrupt) {
-  DispatchTime dt;
-  dt.UseDefaultSignal(false);
-  dt.LaunchSingleKernel(true);
-  dt.SetUp();
-  dt.Run();
-  dt.DisplayResults();
-  dt.Close();
+  DispatchTime dt(false, true);  // presumably (default signal, single kernel)
+  RunTest(&dt);
 }
 
 TEST(rocrtst, Perf_Dispatch_Time_Multi_SpinWait) {
-  DispatchTime dt;
-  dt.UseDefaultSignal(true);
-  dt.LaunchSingleKernel(false);
-  dt.SetUp();
-  dt.Run();
-  dt.DisplayResults();
-  dt.Close();
+  DispatchTime dt(true, false);  // presumably (default signal, single kernel)
+  RunTest(&dt);
 }
 
 TEST(rocrtst, Perf_Dispatch_Time_Multi_Interrupt) {
-  DispatchTime dt;
-  dt.UseDefaultSignal(false);
-  dt.LaunchSingleKernel(false);
-  dt.SetUp();
-  dt.Run();
-  dt.DisplayResults();
-  dt.Close();
+  DispatchTime dt(false, false);  // presumably (default signal, single kernel)
+  RunTest(&dt);
 }
-TEST(rocrtst, DISABLED_Perf_CpProcessTime) {
-  CpProcessTime cpt;
-  cpt.set_num_iteration(10);
-  cpt.SetUp();
-  cpt.Run();
-  cpt.DisplayResults();
-  cpt.Close();
-}
-
-TEST(rocrtst, Perf_Memory_Allocation) {
-  MemoryAllocation ma(10);
-  ma.SetUp();
-  ma.Run();
-  ma.DisplayResults();
-  ma.Close();
-}
-
-#if MEM_POOL_FILL_BUG
-TEST(rocrtst, Perf_Queue_Latency) {
-  QueueLatency ql;
-  ql.set_num_iteration(10);
-  ql.SetUp();
-  ql.Run();
-  ql.DisplayResults();
-  ql.Close();
-}
-
-TEST(rocrtst, Perf_System_Memory_Load_Bandwidth) {
-  SystemLoadBandwidth slb;
-  slb.SetUp();
-  slb.Run();
-  slb.DisplayResults();
-  slb.Close();
-}
-
-TEST(rocrtst, Perf_System_Memory_Store_Bandwidth) {
-  SystemStoreBandwidth ssb;
-  ssb.SetUp();
-  ssb.Run();
-  ssb.DisplayResults();
-  ssb.Close();
-}
-
-TEST(rocrtst, Perf_Memory_Copy) {
-  MemoryCopy mc;
-  mc.set_num_iteration(10);
-  mc.SetUp();
-  mc.Run();
-  mc.DisplayResults();
-  mc.Close();
-}
-
-#endif
-
-#if 0
-// These tests were not complete. Needs research/work.
-TEST(rocrtst, Feature_Vector_Copy) {
-  VectorCopy vc;
-  vc.SetUp();
-  vc.Run();
-  vc.Close();
-}
-
-TEST(rocrtst, Perf_Matrix_Transpose) {
-  MatrixTranspose mt;
-  mt.SetUp();
-  mt.Run();
-  mt.DisplayResults();
-  mt.Close();
-}
-
-#endif
-
-//#if NEED_TO_MAKE_BATCH
-TEST(rocrtst, Perf_Memory_Async_Copy) {
-  MemoryAsyncCopy mac;
-  mac.set_num_iteration(10);
-  mac.SetUp();
-  mac.Run();
-  mac.DisplayResults();
-  mac.Close();
-}
-//#endif
 
 int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
+  // opts is handed pointers to the file-scope statics that RunTest() reads.
+  RocrtstOptions opts(&sRocrTstOptVerbosity, &sRocrTestOptIterations);
+  // A non-zero result from ProcessCmdline() exits before any test is run.
+  if (ProcessCmdline(&opts, argc, argv)) {
+    return 1;
+  }
+
   return RUN_ALL_TESTS();
 }
diff --git a/rocrtst/suites/performance/matrix_transpose.cc b/rocrtst/suites/performance/matrix_transpose.cc
deleted file mode 100755
index 714b534ae6..0000000000
--- a/rocrtst/suites/performance/matrix_transpose.cc
+++ /dev/null
@@ -1,289 +0,0 @@
-/*
- * =============================================================================
- *   ROC Runtime Conformance Release License
- * =============================================================================
- * The University of Illinois/NCSA
- * Open Source License (NCSA)
- *
- * Copyright (c) 2017, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Developed by:
- *
- *                 AMD Research and AMD ROC Software Development
- *
- *                 Advanced Micro Devices, Inc.
- *
- *                 www.amd.com
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal with the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- *  - Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimers.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimers in
- *    the documentation and/or other materials provided with the distribution.
- *  - Neither the names of <Name of Development Group, Name of Institution>,
- *    nor the names of its contributors may be used to endorse or promote
- *    products derived from this Software without specific prior written
- *    permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS WITH THE SOFTWARE.
- *
- */
-
-#include "matrix_transpose.h"
-#include "common/base_rocr_utils.h"
-#include "common/common.h"
-#include "common/helper_funcs.h"
-#include "common/hsatimer.h"
-#include "hsa/hsa.h"
-#include "hsa/hsa_ext_amd.h"
-#include "hsa/hsa_ext_finalize.h"
-#include "gtest/gtest.h"
-#include <stdlib.h>
-#include <algorithm>
-
-static const unsigned int NUM_BLOCK_SIZES = 2;
-static const unsigned int blockSizes[NUM_BLOCK_SIZES] = {8, 16};
-static const unsigned int NUM_MATRIX_DIMS = 2;
-static const unsigned int matrixDims[NUM_MATRIX_DIMS] = {1024, 64};
-
-MatrixTranspose::MatrixTranspose(void) :
-  BaseRocR() {
-  in_buffer_sys_ = NULL;
-  out_buffer_sys_ = NULL;
-  in_buffer_ = NULL;
-  out_buffer_ = NULL;
-  width_ = 0;
-  height_ = 0;
-  buf_size_ = 0;
-  block_size_ = 0;
-  time_mean_ = 0.0;
-}
-
-MatrixTranspose::~MatrixTranspose(void) {
-
-}
-
-void MatrixTranspose::SetUp(void) {
-  hsa_status_t err;
-
-  InitializeData();
-
-  set_kernel_file_name("transpose_kernel.o");
-  set_kernel_name("&__OpenCL_matrixTranspose_kernel");
-
-  if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
-    return;
-  }
-
-  hsa_agent_t* gpu_dev = gpu_device1();
-  hsa_agent_t* cpu_dev = cpu_device();
-
-  err = hsa_amd_agent_iterate_memory_pools(*cpu_dev, rocrtst::FindGlobalPool,
-                                                                  &cpu_pool());
-  ASSERT_EQ(err, HSA_STATUS_INFO_BREAK);
-
-  err = hsa_amd_memory_pool_allocate(cpu_pool(), buf_size_, 0,
-                                     (void**) &in_buffer_);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  err = hsa_amd_memory_pool_allocate(cpu_pool(), buf_size_, 0,
-                                     (void**) &out_buffer_);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  err = hsa_amd_agents_allow_access(1, gpu_dev, NULL, in_buffer_);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  err = hsa_amd_agents_allow_access(1, gpu_dev, NULL, out_buffer_);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  // Create a queue
-  hsa_queue_t* q = nullptr;
-  rocrtst::CreateQueue(*gpu_dev, &q);
-  set_main_queue(q);
-
-  rocrtst::LoadKernelFromObjFile(this);
-
-  // Fill up aql packet
-  rocrtst::InitializeAQLPacket(this, &aql());
-  aql().setup = 0;
-  aql().setup |= 2 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
-  aql().workgroup_size_x = block_size_;
-  aql().workgroup_size_y = block_size_;
-  aql().grid_size_x = width_;
-  aql().grid_size_y = height_;
-  aql().group_segment_size = sizeof(uint) * block_size_ * block_size_;
-
-  // Debug
-#ifdef DEBUG
-  std::cout << "workgroup size: " << block_size_ << ", " << block_size_
-            << ", " << 1 << std::endl;
-  std::cout << "grid size: " << aql().grid_size_x << ", " <<
-            aql().grid_size_y << ", " << aql().grid_size_z << std::endl;
-  std::cout << "group segment size: " << aql().group_segment_size << std::endl;
-#endif
-}
-
-void MatrixTranspose::Run(void) {
-  hsa_status_t err;
-  hsa_agent_t* gpu_dev = gpu_device1();
-
-  if (!rocrtst::CheckProfile(this)) {
-    return;
-  }
-
-  // Allocate kernel parameter
-  typedef struct args_t {
-    uint* offset_0;
-    uint* offset_1;
-    uint* offset_2;
-    uint* printf_buffer;
-    uint* vqueue_buffer;
-    uint* aqlwrap_pointer;
-
-    uint* in_buf;
-    uint* out_buf;
-    uint* local_buf;
-    uint iblock_size;
-    uint iwidth;
-    uint iheight;
-  } args;
-
-  args* kern_ptr = NULL;
-  err = hsa_amd_memory_pool_allocate(cpu_pool(), sizeof(args), 0,
-                                     (void**) &kern_ptr);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  err = hsa_amd_agents_allow_access(1, gpu_dev, NULL, kern_ptr);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  kern_ptr->offset_0 = 0;
-  kern_ptr->offset_1 = 0;
-  kern_ptr->offset_2 = 0;
-  kern_ptr->printf_buffer = 0;
-  kern_ptr->vqueue_buffer = 0;
-  kern_ptr->aqlwrap_pointer = 0;
-
-  kern_ptr->in_buf = in_buffer_sys_;
-  kern_ptr->out_buf = out_buffer_sys_;
-  kern_ptr->local_buf = 0;
-  kern_ptr->iblock_size = block_size_;
-  kern_ptr->iwidth = width_;
-  kern_ptr->iheight = height_;
-
-  aql().kernarg_address = kern_ptr;
-
-  //Obtain the current queue write index.
-  uint64_t idx = hsa_queue_add_write_index_relaxed(main_queue(), 1);
-
-  ((hsa_kernel_dispatch_packet_t*)(main_queue()->base_address))[idx] = aql();
-
-  rocrtst::PerfTimer p_timer;
-  int id = p_timer.CreateTimer();
-  p_timer.StartTimer(id);
-
-  ((hsa_kernel_dispatch_packet_t*)(main_queue()->base_address))[idx].header |=
-                     HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
-
-  hsa_signal_store_release(main_queue()->doorbell_signal, idx);
-
-  //Wait on the dispatch signal until the kernel is finished.
-  hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
-                                       (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE);
-  p_timer.StopTimer(id);
-
-  hsa_amd_profiling_dispatch_time_t dispatch_time;
-  err = hsa_amd_profiling_get_dispatch_time(*gpu_dev, signal(), &dispatch_time);
-
-  uint64_t stamp = dispatch_time.end - dispatch_time.start;
-  uint64_t freq;
-
-  err = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freq);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  std::cout << "Kernel time is: " <<
-            (double) stamp / (double) freq * 1000.0 << std::endl;
-  hsa_signal_store_release(signal(), 1);
-
-
-  // Verify Results
-  VerifyResults (out_buffer_sys_);
-
-  // Abandon the first result which is warm up
-
-  time_mean_ = p_timer.ReadTimer(id); //rocrtst::CalcMean(timer);
-}
-
-void MatrixTranspose::DisplayResults(void) const {
-  if (!rocrtst::CheckProfile(this)) {
-    return;
-  }
-
-  std::cout << "============================================" << std::endl;
-  std::cout << "Matrix Transpose Mean Time:       " << time_mean_ << std::endl;
-
-  return;
-}
-
-void MatrixTranspose::Close(void) {
-  hsa_status_t err;
-  err = rocrtst::CommonCleanUp(this);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-}
-
-void MatrixTranspose::InitializeData(void) {
-  // int openTest = 1;
-  block_size_ = 16; //blockSizes[openTest % NUM_BLOCK_SIZES];
-  width_ = 1920; //matrixDims[openTest / NUM_BLOCK_SIZES];
-  height_ = width_;
-
-  buf_size_ = width_ * height_ * sizeof(uint);
-
-  in_buffer_sys_ = (uint*) aligned_alloc(256, buf_size_);
-
-  SetData (in_buffer_sys_);
-  out_buffer_sys_ = (uint*) aligned_alloc(256, buf_size_);
-
-  FillData(out_buffer_sys_, 0xdeadbeef);
-
-  return;
-}
-
-void MatrixTranspose::SetData(uint* buffer) {
-  for (unsigned int i = 0; i < height_; i++) {
-    for (unsigned int j = 0; j < width_; j++) {
-      *(buffer + i * width_ + j) = i * width_ + j;
-    }
-  }
-}
-
-void MatrixTranspose::FillData(uint* buffer, unsigned int val) {
-  for (unsigned int i = 0; i < width_ * height_; i++) {
-    buffer[i] = val;
-  }
-}
-
-void MatrixTranspose::VerifyResults(uint* buffer) {
-  bool err = false;
-
-  for (unsigned int i = 0; (i < width_) && !err; i++) {
-    for (unsigned int j = 0; (j < height_) && !err; j++) {
-      ASSERT_EQ(*(buffer + i * height_ + j), j * width_ + i);
-    }
-  }
-
-  std::cout << "PASSED!" << std::endl;
-}
diff --git a/rocrtst/suites/performance/matrix_transpose.h b/rocrtst/suites/performance/matrix_transpose.h
deleted file mode 100755
index 8b90060c41..0000000000
--- a/rocrtst/suites/performance/matrix_transpose.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * =============================================================================
- *   ROC Runtime Conformance Release License
- * =============================================================================
- * The University of Illinois/NCSA
- * Open Source License (NCSA)
- *
- * Copyright (c) 2017, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Developed by:
- *
- *                 AMD Research and AMD ROC Software Development
- *
- *                 Advanced Micro Devices, Inc.
- *
- *                 www.amd.com
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal with the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- *  - Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimers.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimers in
- *    the documentation and/or other materials provided with the distribution.
- *  - Neither the names of <Name of Development Group, Name of Institution>,
- *    nor the names of its contributors may be used to endorse or promote
- *    products derived from this Software without specific prior written
- *    permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS WITH THE SOFTWARE.
- *
- */
-
-#ifndef __ROCRTST_SRC_MATRIX_TRANSPOSE_H__
-#define __ROCRTST_SRC_MATRIX_TRANSPOSE_H__
-
-#include "perf_common/perf_base.h"
-#include "common/base_rocr.h"
-#include "hsa/hsa.h"
-
-class MatrixTranspose: public rocrtst::BaseRocR, public PerfBase {
-
- public:
-  //@Brief: Default Constructor
-  MatrixTranspose();
-
-  //@Brief: Destructor
-  ~MatrixTranspose();
-
-  //@Brief: Override SetUp function
-  virtual void SetUp();
-
-  //@Brief: Run the measurement
-  virtual void Run();
-
-  //@Brief: Clean up and Close
-  virtual void Close();
-
-  //@Brief: Display  results
-  virtual void DisplayResults() const;
-
- private:
-  //@Brief: Set up data
-  virtual void SetData(uint* buffer);
-
-  //@Brief: Fill Data
-  virtual void FillData(uint* buffer, unsigned int val);
-
-  //@Brief: VerifyResults
-  virtual void VerifyResults(uint* buffer);
-
-  //@Brief: Initialize the object attribute
-  virtual void InitializeData();
-
-  uint* in_buffer_;
-  uint* out_buffer_;
-  uint* in_buffer_sys_;
-  uint* out_buffer_sys_;
-  unsigned int width_;
-  unsigned int height_;
-  unsigned int buf_size_;
-  unsigned int block_size_;
-  double time_mean_;
-
-  hsa_barrier_and_packet_t bpkt;
-};
-
-#endif //__ROCRTST_SRC_MATRIX_TRANSPOSE_H__
-
diff --git a/rocrtst/suites/performance/memory_allocation.cc b/rocrtst/suites/performance/memory_allocation.cc
deleted file mode 100755
index a2beb60000..0000000000
--- a/rocrtst/suites/performance/memory_allocation.cc
+++ /dev/null
@@ -1,198 +0,0 @@
-/*
- * =============================================================================
- *   ROC Runtime Conformance Release License
- * =============================================================================
- * The University of Illinois/NCSA
- * Open Source License (NCSA)
- *
- * Copyright (c) 2017, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Developed by:
- *
- *                 AMD Research and AMD ROC Software Development
- *
- *                 Advanced Micro Devices, Inc.
- *
- *                 www.amd.com
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal with the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- *  - Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimers.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimers in
- *    the documentation and/or other materials provided with the distribution.
- *  - Neither the names of <Name of Development Group, Name of Institution>,
- *    nor the names of its contributors may be used to endorse or promote
- *    products derived from this Software without specific prior written
- *    permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS WITH THE SOFTWARE.
- *
- */
-
-#include "memory_allocation.h"
-#include "common/base_rocr_utils.h"
-#include "common/common.h"
-#include "hsa/hsa.h"
-#include "gtest/gtest.h"
-#include <algorithm>
-
-MemoryAllocation::MemoryAllocation(uint32_t num_iters) :
-  BaseRocR(), allocation_time_ {0.0}, mem_pool_flag_(0) {
-  ptr = NULL;
-}
-
-MemoryAllocation::~MemoryAllocation() {
-
-}
-
-const char* MemoryAllocation::Str[16] = {"64K", "128K", "256K", "512K", "1M",
-                                         "2M", "4M", "8M", "16M", "32M",
-                                         "64M", "128M", "256M", "512M", "1G",
-                                         "2G" 
-                                        };
-const size_t MemoryAllocation::Size[16] = {64*1024, 128*1024,
-                                           256*1024,512*1024, 1024*1024,
-                                           2048*1024, 4096*1024, 8*1024*1024,
-                                           16*1024*1024, 32*1024*1024,
-                                           64*1024*1024, 128*1024*1024,
-                                           256 * 1024*1024, 512*1024*1024,
-                                           1024*1024*1024,
-                                           (size_t)2*1024*1024*1024
-                                          };
-
-void MemoryAllocation::SetUp() {
-  hsa_status_t err;
-
-  if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
-    return;
-  }
-
-  hsa_agent_t* cpu_dev = cpu_device();
-
-  err = hsa_amd_agent_iterate_memory_pools(*cpu_dev, rocrtst::FindGlobalPool,
-                                                                  &cpu_pool());
-
-  EXPECT_EQ(err, HSA_STATUS_INFO_BREAK);
-
-  if (err != HSA_STATUS_INFO_BREAK) {
-    std::cout << "Unable to find global pool. Test will not be run."
-              << std::endl;
-    return;
-  }
-
-  //At this point, cpu_pool() should be in the global segment
-  err = hsa_amd_memory_pool_get_info(cpu_pool(),
-         (hsa_amd_memory_pool_info_t) HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS,
-                                                             &mem_pool_flag_);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-}
-
-void MemoryAllocation::Run() {
-
-  if (!rocrtst::CheckProfile(this)) {
-    return;
-  }
-
-  if (cpu_pool().handle == 0) {
-    return;
-  }
-
-  size_t iterations = RealIterationNum();
-  hsa_status_t err;
-
-  //Iterate over the different data size
-  for (int i = 0; i < 16; i++) {
-    std::vector<double> time;
-
-    for (uint32_t it = 0; it < iterations; it++) {
-#if DEBUG
-      std::cout << "." << std::flush;
-#endif
-
-      rocrtst::PerfTimer allocation_timer;
-      int index = allocation_timer.CreateTimer();
-
-      allocation_timer.StartTimer(index);
-      err = hsa_amd_memory_pool_allocate(cpu_pool(), Size[i], 0, &ptr);
-      allocation_timer.StopTimer(index);
-      ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-      //Free the memory which was allocated
-      err = hsa_amd_memory_pool_free(ptr);
-      ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-      ptr = NULL;
-
-      // PUsh the results back to vector time
-      time.push_back(allocation_timer.ReadTimer(index));
-    }
-
-#if DEBUG
-    std::cout << std::endl;
-#endif
-
-    //Get mean copy time and store to the array
-    allocation_time_[i] = GetMeanTime(time);
-  }
-}
-
-size_t MemoryAllocation::RealIterationNum() {
-  return num_iteration() * 1.2 + 1;
-}
-
-double MemoryAllocation::GetMeanTime(std::vector<double>& vec) {
-  std::sort(vec.begin(), vec.end());
-
-  vec.erase(vec.begin());
-  vec.erase(vec.begin(), vec.begin() + num_iteration() * 0.1);
-  vec.erase(vec.begin() + num_iteration(), vec.end());
-
-  double mean = 0.0;
-  int num = vec.size();
-
-  for (int it = 0; it < num; it++) {
-    mean += vec[it];
-  }
-
-  mean /= num;
-  return mean;
-}
-
-void MemoryAllocation::DisplayResults() const {
-
-  if (!rocrtst::CheckProfile(this)) {
-    return;
-  }
-
-  fprintf(stdout, "==============================================\n");
-  fprintf(stdout, "  Data Size  Allocation_time   BandWidth(GB/s)\n");
-
-  for (int i = 0; i < 16; i++) {
-    fprintf(stdout, "  %9s  %15.6f   %15.6f\n", Str[i], allocation_time_[i],
-            2 * Size[i] / allocation_time_[i] / 1024 / 1024 / 1024);
-  }
-
-  fprintf(stdout, "==============================================\n");
-
-  return;
-}
-
-void MemoryAllocation::Close() {
-  hsa_status_t err;
-  err = rocrtst::CommonCleanUp(this);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-  return;
-}
diff --git a/rocrtst/suites/performance/memory_allocation.h b/rocrtst/suites/performance/memory_allocation.h
deleted file mode 100755
index 1c39b1b2d3..0000000000
--- a/rocrtst/suites/performance/memory_allocation.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * =============================================================================
- *   ROC Runtime Conformance Release License
- * =============================================================================
- * The University of Illinois/NCSA
- * Open Source License (NCSA)
- *
- * Copyright (c) 2017, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Developed by:
- *
- *                 AMD Research and AMD ROC Software Development
- *
- *                 Advanced Micro Devices, Inc.
- *
- *                 www.amd.com
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal with the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- *  - Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimers.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimers in
- *    the documentation and/or other materials provided with the distribution.
- *  - Neither the names of <Name of Development Group, Name of Institution>,
- *    nor the names of its contributors may be used to endorse or promote
- *    products derived from this Software without specific prior written
- *    permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS WITH THE SOFTWARE.
- *
- */
-
-#ifndef __ROCRTST_SRC_MEMORY_MEM_ALLOCATION_H__
-#define __ROCRTST_SRC_MEMORY_MEM_ALLOCATION_H__
-
-#include "perf_common/perf_base.h"
-#include "common/base_rocr.h"
-#include "common/hsatimer.h"
-#include "hsa/hsa.h"
-#include <vector>
-
-class MemoryAllocation: public rocrtst::BaseRocR, public PerfBase {
-
- public:
-  //@Brief: Constructor for test case of MemoryAllocation
-  MemoryAllocation(uint32_t num_iters = 100);
-
-  //@Brief: Destructor for test case of MemoryAllocation
-  virtual ~MemoryAllocation();
-
-  //@Brief: Set up the environment for the test
-  virtual void SetUp();
-
-  //@Brief: Execute the test
-  virtual void Run();
-
-  //@Brief: Display  results
-  virtual void DisplayResults() const;
-
-  //@Brief: Clean up and close the environment
-  virtual void Close();
-
- protected:
-  //@Brief: Pointer to the memory space which is allocated by HSA Memory
-  // allocation API
-  void* ptr;
-
-  //@Brief: Array to store the timers results for each data size
-  double allocation_time_[16];
-
- private:
-  //@Brief: Define allocated data size and corresponding string
-  static const size_t Size[16];
-  static const char* Str[16];
-
-  uint32_t mem_pool_flag_;
-
-  //@Brief: Get the actual iteration number
-  size_t RealIterationNum();
-
-  //@Brief: Get mean execution time
-  double GetMeanTime(std::vector<double>& vec);
-
-};
-#endif
diff --git a/rocrtst/suites/performance/memory_async_copy.cc b/rocrtst/suites/performance/memory_async_copy.cc
old mode 100644
new mode 100755
index a6d2112ecf..3c55aad265
--- a/rocrtst/suites/performance/memory_async_copy.cc
+++ b/rocrtst/suites/performance/memory_async_copy.cc
@@ -43,153 +43,236 @@
  *
  */
 
-#include "memory_async_copy.h"
+#include <vector>
+#include <algorithm>
+
+#include "common/base_rocr.h"
+#include "suites/test_common/test_base.h"
+#include "hsa/hsa.h"
+#include "hsa/hsa_ext_amd.h"
+#include "suites/performance/memory_async_copy.h"
 #include "common/base_rocr_utils.h"
 #include "gtest/gtest.h"
 
-const char* Str[20] = {"1k", "2K", "4K", "8K", "16K", "32K", "64K", "128K",
-                       "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M",
-                                               "64M", "128M", "256M", "512M"
-                      };
-const size_t Size[20] = {1024, 2 * 1024, 4 * 1024, 8 * 1024, 16 * 1024, 32
-                         * 1024, 64 * 1024, 128 * 1024, 256 * 1024, 512 * 1024,
-                         1024 * 1024, 2048 * 1024, 4096 * 1024, 8 * 1024 * 1024,
-                         16 * 1024 * 1024, 32 * 1024 * 1024, 64 * 1024 * 1024,
-                         128 * 1024 * 1024, 256 * 1024 * 1024, 512 * 1024 * 1024
-                        };
+#define RET_IF_HSA_ERR(err) { /* log a failed hsa_status_t and return it */ \
+  if ((err) != HSA_STATUS_SUCCESS) { \
+    const char* msg = nullptr;  /* may stay null if the lookup fails */ \
+    hsa_status_string(err, &msg); \
+    std::cout << "hsa api call failure at line " << __LINE__ << ", file: " << \
+                          __FILE__ << ". Call returned " << err << std::endl; \
+    std::cout << (msg ? msg : "(no status description)") << std::endl; \
+    return (err); \
+  } \
+}
+
+static const int kNumGranularity = 20;
+const char* Str[kNumGranularity] = {"1k", "2K", "4K", "8K", "16K", "32K",
+    "64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M",
+                                               "64M", "128M", "256M", "512M"};
+
+const size_t Size[kNumGranularity] = {
+    1024, 2*1024, 4*1024, 8*1024, 16*1024, 32*1024, 64*1024, 128*1024,
+    256*1024, 512*1024, 1024*1024, 2048*1024, 4096*1024, 8*1024*1024,
+    16*1024*1024, 32*1024*1024, 64*1024*1024, 128*1024*1024, 256*1024*1024,
+    512*1024*1024};
+
+static const int kMaxCopySize = Size[kNumGranularity - 1];
+
+MemoryAsyncCopy::MemoryAsyncCopy(void) :
+    TestBase() {
+  static_assert(sizeof(Size)/sizeof(size_t) == kNumGranularity,
+      "kNumGranularity does not match size of arrays");
 
-MemoryAsyncCopy::MemoryAsyncCopy() :
-  BaseRocR() {
-//  argc_ = argc;
-//  argv_ = argv;
-  bench_mark_mode_ = false;
-  verification_ = false;
   agent_index_ = 0;
-  region_index_ = 0;
+  pool_index_ = 0;
   tran_.clear();
-  agent_info_.clear();
-  region_info_.clear();
-  node_info_.clear();
+  agent_info()->clear();
+  pool_info()->clear();
+  node_info()->clear();
   verified_ = true;
+  src_pool_id_ = -1;
+  dst_pool_id_ = -1;
+  do_full_test_ = false;
+  set_num_iteration(10);  // Default value
+  set_title("Asynchronous Memory Copy Bandwidth");
+  set_description("This test measures bandwidth to/from Host from/to GPU "
+      "and Peer to Peer using hsa_amd_memory_async_copy() to copy buffers "
+      "of various length from memory pool to another.");
 }
 
-MemoryAsyncCopy::~MemoryAsyncCopy() {
-  size_t size = tran_.size();
+MemoryAsyncCopy::~MemoryAsyncCopy(void) {
+  for (PoolInfo *p : pool_info_) {
+    delete p;
+  }
 
-  if (size != 0) {
-    for (size_t i = 0; i < size; i++) {
-      if (tran_.at(i).dep_signal != nullptr)
-        ;
-
-      delete[] tran_.at(i).dep_signal;
-    }
+  for (AgentInfo *a : agent_info_) {
+    delete a;
   }
 }
 
-void MemoryAsyncCopy::SetUp() {
+void MemoryAsyncCopy::SetUp(void) {
+  TestBase::SetUp();
 
-  if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
-    return;
-  }
   FindTopology();
 
-  ParseArgument();
+  if (verbosity() >= VERBOSE_STANDARD) {
+    PrintTopology();
+  }
+  ConstructTransactionList();
   return;
 }
 
-void MemoryAsyncCopy::Run() {
-  if (bench_mark_mode_)
-    if (verification_) {
-      RunBenchmarkWithVerification();
-    }
-    else {
-      RunBenchmark();
-    }
-  else {
-    RunNormal();
+void MemoryAsyncCopy::Run(void) {
+  TestBase::Run();
+
+  for (Transaction t : tran_) {
+    RunBenchmarkWithVerification(&t);
   }
 }
 
-void MemoryAsyncCopy::FindSystemRegion() {
+void MemoryAsyncCopy::FindSystemPool(void) {
   hsa_status_t err;
 
   err = hsa_iterate_agents(rocrtst::FindCPUDevice, &cpu_agent_);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+  ASSERT_EQ(HSA_STATUS_INFO_BREAK, err);
 
   err = hsa_amd_agent_iterate_memory_pools(cpu_agent_, rocrtst::FindGlobalPool,
-        &sys_region_);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+        &sys_pool_);
+  ASSERT_EQ(HSA_STATUS_INFO_BREAK, err);
 }
 
-void MemoryAsyncCopy::AcquireAccess(hsa_agent_t agent,
+static hsa_status_t AcquireAccess(hsa_agent_t agent,
                                     hsa_amd_memory_pool_t pool, void* ptr) {
   hsa_status_t err;
 
   hsa_amd_memory_pool_access_t access;
   err = hsa_amd_agent_memory_pool_get_info(agent, pool,
-        HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+                              HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access);
 
-  ASSERT_NE(HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED, access);
+  RET_IF_HSA_ERR(err);
 
-  if (HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT == access) {
+  if (access == HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) {
+    return HSA_STATUS_ERROR;
+  }
+
+  if (access == HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT) {
     err = hsa_amd_agents_allow_access(1, &agent, NULL, ptr);
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+    RET_IF_HSA_ERR(err);
+  }
+
+  return err;
+}
+
+static hsa_agent_t *
+AcquireAsyncCopyAccess(
+         void *dst_ptr, hsa_amd_memory_pool_t dst_pool, hsa_agent_t *dst_ag,
+         void *src_ptr, hsa_amd_memory_pool_t src_pool, hsa_agent_t *src_ag) {
+  if (AcquireAccess(*src_ag, dst_pool, dst_ptr) != HSA_STATUS_SUCCESS) {
+    if (AcquireAccess(*dst_ag, src_pool, src_ptr) == HSA_STATUS_SUCCESS) {
+      return dst_ag;
+    } else {
+      return nullptr;
+    }
+  } else {
+    return src_ag;
   }
 }
 
-void MemoryAsyncCopy::RunBenchmarkWithVerification() {
+void MemoryAsyncCopy::RunBenchmarkWithVerification(Transaction *t) {
   hsa_status_t err;
   void* ptr_src;
   void* ptr_dst;
 
-  transaction& t = tran_.at(0);
-  size_t size = t.size * 1024;
+  size_t size = t->max_size * 1024;
 
-  FindSystemRegion();
+  hsa_amd_memory_pool_t src_pool =  pool_info_[t->src]->pool_;
+  hsa_agent_t dst_agent = pool_info_[t->dst]->owner_agent_info()->agent();
+  hsa_amd_memory_pool_t dst_pool = pool_info_[t->dst]->pool_;
 
-  err = hsa_amd_memory_pool_allocate(region_info_[t.src].region_, size, 0,
-                                     &ptr_src);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+  hsa_agent_t src_agent = pool_info_[t->src]->owner_agent_info()->agent();
 
-  err = hsa_amd_memory_pool_allocate(region_info_[t.dst].region_, size, 0,
+  if (verbosity() >= VERBOSE_STANDARD) {
+    printf("Executing Copy Path: From Pool %d To Pool %d ", t->src, t->dst);
+    switch (t->type) {
+      case H2D:
+        printf("(Host-To-Device)\n");
+        break;
+
+      case D2H:
+        printf("(Device-To-Host)\n");
+        break;
+
+      case P2P:
+        printf("(Peer-To-Peer)\n");
+        break;
+
+      default:
+        printf("**Unexpected path**\n");
+        return;
+    }
+  }
+
+  err = hsa_amd_memory_pool_allocate(src_pool, size, 0, &ptr_src);
+  ASSERT_EQ(HSA_STATUS_SUCCESS, err);
+
+  err = hsa_amd_memory_pool_allocate(dst_pool, size, 0,
                                      &ptr_dst);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+  ASSERT_EQ(HSA_STATUS_SUCCESS, err);
 
   // rocrtst::CommonCleanUp data
   void* host_ptr_src = NULL;
   void* host_ptr_dst = NULL;
-  err = hsa_amd_memory_pool_allocate(sys_region_, size, 0,
-                                     (void**) &host_ptr_src);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-  err = hsa_amd_memory_pool_allocate(sys_region_, size, 0,
-                                     (void**) &host_ptr_dst);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+  err = hsa_amd_memory_pool_allocate(sys_pool_, size, 0,
+                                     reinterpret_cast<void**>(&host_ptr_src));
+  ASSERT_EQ(HSA_STATUS_SUCCESS, err);
+  err = hsa_amd_memory_pool_allocate(sys_pool_, size, 0,
+                                     reinterpret_cast<void**>(&host_ptr_dst));
+  ASSERT_EQ(HSA_STATUS_SUCCESS, err);
 
-  memset(host_ptr_src, 1, size);
-  memset(host_ptr_dst, 0, size);
+  err = hsa_amd_memory_fill(host_ptr_src, 1, size/sizeof(uint32_t));
+  ASSERT_EQ(HSA_STATUS_SUCCESS, err);
+
+  err = hsa_amd_memory_fill(host_ptr_dst, 0, size/sizeof(uint32_t));
+  ASSERT_EQ(HSA_STATUS_SUCCESS, err);
 
   hsa_signal_t s;
   err = hsa_signal_create(1, 0, NULL, &s);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+  ASSERT_EQ(HSA_STATUS_SUCCESS, err);
 
-  AcquireAccess(region_info_[t.src].owner_agent_, sys_region_, host_ptr_src);
-  AcquireAccess(cpu_agent_, region_info_[t.src].region_, ptr_src);
+  // **** First copy from the system buffer source to the test source pool
+  // Acquire the appropriate access; prefer GPU agent over CPU where there
+  // is a choice.
+  hsa_agent_t *cpy_ag = nullptr;
+  cpy_ag = AcquireAsyncCopyAccess(ptr_src, src_pool, &src_agent, host_ptr_src,
+                                                     sys_pool_, &cpu_agent_);
+  if (cpy_ag == nullptr) {  // neither agent could be granted access
+    std::cout << "Agents " << t->src << " and " << t->dst <<
+                             " cannot access each other's pool." << std::endl;
+  }
+  ASSERT_NE(cpy_ag, nullptr);
 
-  err = hsa_amd_memory_async_copy(ptr_src, region_info_[t.src].owner_agent_,
-                                  host_ptr_src, cpu_agent_, size, 0, NULL, s);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+  err = hsa_amd_memory_async_copy(ptr_src, *cpy_ag, host_ptr_src, *cpy_ag,
+                                                            size, 0, NULL, s);
+  ASSERT_EQ(HSA_STATUS_SUCCESS, err);
 
   while (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_LT, 1, uint64_t(-1),
                                    HSA_WAIT_STATE_ACTIVE))
-    ;
+  {}
 
   int iterations = RealIterationNum();
 
-  AcquireAccess(region_info_[t.dst].owner_agent_, region_info_[t.src].region_,
-                ptr_src);
+  // **** Next, copy from the test source pool to the test destination pool
+  // Prefer a gpu agent to a cpu agent
 
-  for (int i = 0; i < 20; i++) {
+  cpy_ag = AcquireAsyncCopyAccess(ptr_dst, dst_pool, &dst_agent, ptr_src,
+                                                        src_pool, &src_agent);
+  if (cpy_ag == nullptr) {  // neither owner agent could be granted access
+    std::cout << "Owner agents for pools " << t->src << " and " <<
+                   t->dst << " cannot access each other's pool." << std::endl;
+  }
+  ASSERT_NE(cpy_ag, nullptr);
+
+  for (int i = 0; i < kNumGranularity; i++) {
     if (Size[i] > size) {
       break;
     }
@@ -197,500 +280,179 @@ void MemoryAsyncCopy::RunBenchmarkWithVerification() {
     std::vector<double> time;
 
     for (int it = 0; it < iterations; it++) {
-#if DEBUG
-      std::cout << ".";
-      std::cout.flush();
-#endif
-      // Check access to memory pool region
-      AcquireAccess(region_info_[t.src].owner_agent_,
-                    region_info_[t.dst].region_, ptr_dst);
+      if (verbosity() >= VERBOSE_PROGRESS) {
+        std::cout << ".";
+        std::cout.flush();
+      }
 
-      hsa_signal_store_relaxed(t.signal, 1);
+      hsa_signal_store_relaxed(t->signal, 1);
 
       rocrtst::PerfTimer copy_timer;
       int index = copy_timer.CreateTimer();
 
       copy_timer.StartTimer(index);
-      err = hsa_amd_memory_async_copy(ptr_dst, region_info_[t.dst].owner_agent_,
-                                      ptr_src, region_info_[t.src].owner_agent_,
-                                                    Size[i], 0, NULL, t.signal);
-      ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+      err = hsa_amd_memory_async_copy(ptr_dst, *cpy_ag, ptr_src, *cpy_ag,
+                                                 Size[i], 0, NULL, t->signal);
+      ASSERT_EQ(HSA_STATUS_SUCCESS, err);
 
-      while (hsa_signal_wait_scacquire(t.signal, HSA_SIGNAL_CONDITION_LT, 1,
-                                       uint64_t(-1), HSA_WAIT_STATE_ACTIVE))
-        ;
+      while (hsa_signal_wait_scacquire(t->signal, HSA_SIGNAL_CONDITION_LT, 1,
+                                         uint64_t(-1), HSA_WAIT_STATE_ACTIVE))
+      {}
 
       copy_timer.StopTimer(index);
 
       hsa_signal_store_relaxed(s, 1);
 
-      AcquireAccess(region_info_[t.dst].owner_agent_, sys_region_,
+      err = AcquireAccess(dst_agent, sys_pool_,
                     host_ptr_dst);
-      AcquireAccess(cpu_agent_, region_info_[t.dst].region_, ptr_dst);
+      ASSERT_EQ(HSA_STATUS_SUCCESS, err);
+
 
       err = hsa_amd_memory_async_copy(host_ptr_dst, cpu_agent_, ptr_dst,
-                          region_info_[t.dst].owner_agent_, size, 0, NULL, s);
-      ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+                                                 dst_agent, size, 0, NULL, s);
+      ASSERT_EQ(HSA_STATUS_SUCCESS, err);
 
       while (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_LT, 1,
                                        uint64_t(-1), HSA_WAIT_STATE_ACTIVE))
-        ;
+      {}
 
-      err = hsa_memory_copy(host_ptr_dst, ptr_dst, size);
-      ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+      err = AcquireAccess(cpu_agent_, sys_pool_, host_ptr_dst);
+      ASSERT_EQ(HSA_STATUS_SUCCESS, err);
 
       if (memcmp(host_ptr_src, host_ptr_dst, Size[i])) {
         verified_ = false;
       }
-
       // Push the result back to vector time
+
       time.push_back(copy_timer.ReadTimer(index));
     }
 
-#if DEBUG
-    std::cout << std::endl;
-#endif
+    if (verbosity() >= VERBOSE_PROGRESS) {
+      std::cout << std::endl;
+    }
 
     // Get Min copy time
-    min_time_.push_back(GetMinTime(time));
+    t->min_time->push_back(*std::min_element(time.begin(), time.end()));
     // Get mean copy time and store to the array
-    benchmark_copy_time_.push_back(GetMeanTime(time));
+    t->benchmark_copy_time->push_back(GetMeanTime(&time));
   }
 
-  DisplayBenchmark();
+  err = hsa_signal_destroy(s);
+  ASSERT_EQ(HSA_STATUS_SUCCESS, err);
 }
 
-void MemoryAsyncCopy::RunBenchmark() {
-  hsa_status_t err;
-  void* ptr_src;
-  void* ptr_dst;
-
-  transaction& t = tran_.at(0);
-  size_t size = t.size * 1024;
-
-  FindSystemRegion();
-
-  err = hsa_amd_memory_pool_allocate(region_info_[t.src].region_, size, 0,
-                                     &ptr_src);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  err = hsa_amd_memory_pool_allocate(region_info_[t.dst].region_, size, 0,
-                                     &ptr_dst);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  // Check access to memory pool region
-  AcquireAccess(region_info_[t.src].owner_agent_, region_info_[t.dst].region_,
-                ptr_dst);
-  AcquireAccess(region_info_[t.dst].owner_agent_, region_info_[t.src].region_,
-                ptr_src);
-
-  int iterations = RealIterationNum();
-
-  for (int i = 0; i < 20; i++) {
-    if (Size[i] > size) {
-      break;
-    }
-
-    std::vector<double> time;
-
-    for (int it = 0; it < iterations; it++) {
-#if DEBUG
-      std::cout << ".";
-      std::cout.flush();
-#endif
-
-      hsa_signal_store_relaxed(t.signal, 1);
-
-      rocrtst::PerfTimer copy_timer;
-      int index = copy_timer.CreateTimer();
-
-      copy_timer.StartTimer(index);
-      err = hsa_amd_memory_async_copy(ptr_dst, region_info_[t.dst].owner_agent_,
-                                      ptr_src, region_info_[t.src].owner_agent_,
-                                                    Size[i], 0, NULL, t.signal);
-      ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-      while (hsa_signal_wait_scacquire(t.signal, HSA_SIGNAL_CONDITION_LT, 1,
-                                       uint64_t(-1), HSA_WAIT_STATE_ACTIVE))
-        ;
-
-      copy_timer.StopTimer(index);
-
-      // Push the result back to vector time
-      time.push_back(copy_timer.ReadTimer(index));
-    }
-
-#if DEBUG
-    std::cout << std::endl;
-#endif
-
-    // Get Min copy time
-    min_time_.push_back(GetMinTime(time));
-    // Get mean copy time and store to the array
-    benchmark_copy_time_.push_back(GetMeanTime(time));
-  }
-
-  DisplayBenchmark();
-}
-
-void MemoryAsyncCopy::RunNormal() {
-  int num_transaction = tran_.size();
-  hsa_status_t err;
-  std::vector<void*> ptr_src;
-  std::vector<void*> ptr_dst;
-
-  for (int i = 0; i < num_transaction; i++) {
-    void* ptr_src_temp;
-    void* ptr_dst_temp;
-    transaction& t = tran_[i];
-    hsa_amd_memory_pool_t region_src = region_info_[t.src].region_;
-    hsa_amd_memory_pool_t region_dst = region_info_[t.dst].region_;
-    size_t size = t.size * 1024;
-
-    // Allocate memory
-    err = hsa_amd_memory_pool_allocate(region_src, size, 0,
-                                       (void**) &ptr_src_temp);
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-    err = hsa_amd_memory_pool_allocate(region_dst, size, 0,
-                                       (void**) &ptr_dst_temp);
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-    AcquireAccess(region_info_[t.dst].owner_agent_, region_src, ptr_src_temp);
-    AcquireAccess(region_info_[t.src].owner_agent_, region_dst, ptr_dst_temp);
-
-    ptr_src.push_back(ptr_src_temp);
-    ptr_dst.push_back(ptr_dst_temp);
-  }
-
-  int iterations = RealIterationNum();
-  std::vector<double> time;
-
-  for (int i = 0; i < iterations; i++) {
-    for (int j = 0; j < num_transaction; j++) {
-      transaction& t = tran_[j];
-      hsa_signal_store_relaxed(t.signal, 1);
-    }
-
-    rocrtst::PerfTimer copy_timer;
-    int index = copy_timer.CreateTimer();
-    copy_timer.StartTimer(index);
-
-    for (int j = 0; j < num_transaction; j++) {
-      transaction& t = tran_[j];
-      err = hsa_amd_memory_async_copy(ptr_dst[j],
-             region_info_[t.dst].owner_agent_, ptr_src[j],
-             region_info_[t.src].owner_agent_, t.size * 1024, t.num_dep_signal,
-                                                        t.dep_signal, t.signal);
-    }
-
-    // Wait on the last transaction to finish
-    while (hsa_signal_wait_scacquire(tran_[num_transaction - 1].signal,
-              HSA_SIGNAL_CONDITION_LT, 1, uint64_t(-1), HSA_WAIT_STATE_ACTIVE))
-      ;
-
-    copy_timer.StopTimer(index);
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-    time.push_back(copy_timer.ReadTimer(index));
-  }
-
-  user_copy_time_ = GetMeanTime(time);
-  DisplayResults();
-}
-
-size_t MemoryAsyncCopy::RealIterationNum() {
+size_t MemoryAsyncCopy::RealIterationNum(void) {
   return num_iteration() * 1.2 + 1;
 }
 
-double MemoryAsyncCopy::GetMinTime(std::vector<double>& vec) {
-  std::sort(vec.begin(), vec.end());
-  return vec.at(0);
-}
-double MemoryAsyncCopy::GetMeanTime(std::vector<double>& vec) {
-  std::sort(vec.begin(), vec.end());
+double MemoryAsyncCopy::GetMeanTime(std::vector<double> *vec) {
+  std::sort(vec->begin(), vec->end());
 
-  vec.erase(vec.begin());
-  vec.erase(vec.begin(), vec.begin() + num_iteration() * 0.1);
-  vec.erase(vec.begin() + num_iteration(), vec.end());
+  vec->erase(vec->begin());
+  vec->erase(vec->begin(), vec->begin() + num_iteration() * 0.1);
+  vec->erase(vec->begin() + num_iteration(), vec->end());
 
   double mean = 0.0;
-  int num = vec.size();
+  int num = vec->size();
 
   for (int it = 0; it < num; it++) {
-    mean += vec[it];
+    mean += (*vec)[it];
   }
 
   mean /= num;
   return mean;
 }
 
-void MemoryAsyncCopy::DisplayResults() const {
-
+void MemoryAsyncCopy::DisplayResults(void) const {
   if (!rocrtst::CheckProfile(this)) {
     return;
   }
 
-  printf("================ User-Defined  Mode Result "
-         "===================================\n");
-  double band_width = (double) tran_.back().size / user_copy_time_ / 1024
-                      / 1024;
-  printf("  %zuKB                             %lf\n", tran_.back().size,
-         band_width);
+  TestBase::DisplayResults();
+
+  for (Transaction t : tran_) {
+    DisplayBenchmark(&t);
+    delete t.benchmark_copy_time;
+    delete t.min_time;
+  }
+
   return;
 }
 
-void MemoryAsyncCopy::DisplayBenchmark() {
-  transaction& t = tran_.at(0);
-  size_t size = t.size * 1024;
-  printf("================ Benchmark Mode Result "
-         "===================================\n");
+void MemoryAsyncCopy::DisplayBenchmark(Transaction *t) const {
+  size_t size = t->max_size * 1024;
+  printf("=========================== PATH: From Pool %d To Pool %d (",
+                                                              t->src, t->dst);
+
+  switch (t->type) {
+    case H2D:
+      printf("Host-To-Device) ===========================\n");
+      break;
+
+    case D2H:
+      printf("Device-To-Host) ===========================\n");
+      break;
+
+    case P2P:
+      printf("Peer-To-Peer) =============================\n");
+      break;
+
+    default:
+      ASSERT_EQ(t->type == H2D || t->type == D2H || t->type == P2P, true);
+  }
+  if (verified_) {
+    std::cout << "Verification: Pass" << std::endl;
+  } else {
+    std::cout << "Verification: Fail" << std::endl;
+  }
+
+  if (verbosity() < VERBOSE_STANDARD) {
+    return;
+  }
 
   printf("Data Size             Avg Time(us)         Avg BW(GB/s)"
-                              "          Min Time(us)         Peak BW(GB/s)\n");
+                           "          Min Time(us)          Peak BW(GB/s)\n");
 
   for (int i = 0; i < 20; i++) {
     if (Size[i] > size) {
       break;
     }
 
-    double band_width = (double) Size[i] / benchmark_copy_time_[i] / 1024 / 1024
-                        / 1024;
-    double peak_band_width = (double) Size[i] / min_time_[i] / 1024 / 1024
-                             / 1024;
-    printf("  %4s            %14lf        %14lf         %14lf         %14lf\n",
-          Str[i], benchmark_copy_time_[i] * 1e6, band_width, min_time_[i] * 1e6,
-           peak_band_width);
+    double band_width =
+    static_cast<double>(Size[i]/(*(t->benchmark_copy_time))[i]/1024/1024/1024);
+    double peak_band_width =
+       static_cast<double>(Size[i] / (*(t->min_time))[i]/ 1024 / 1024 / 1024);
+    printf(
+        "  %4s            %14lf        %14lf         %14lf         %14lf\n",
+       Str[i], (*(t->benchmark_copy_time))[i] * 1e6, band_width,
+                                  (*(t->min_time))[i] * 1e6, peak_band_width);
   }
 
-  if (verification_) {
-    if (verified_) {
-      std::cout << "Verification: Pass" << std::endl;
-    }
-    else {
-      std::cout << "Verification: Fail" << std::endl;
-    }
-  }
   return;
 }
 
 void MemoryAsyncCopy::Close() {
-  hsa_status_t err;
-  err = rocrtst::CommonCleanUp(this);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+  TestBase::Close();
 }
 
-void MemoryAsyncCopy::FindTopology() {
-  hsa_status_t err;
-  err = hsa_iterate_agents(AgentInfo, this);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-}
-
-void MemoryAsyncCopy::ParseArgument() {
-  bool print_help_info = false;
-  hsa_status_t err;
-
-  opterr = 0;
-  int c;
-  int src_region = 0;
-  int dst_region = 0;
-  size_t data_size = 512 * 1024;
-  size_t opt_num = 0;
-  char rec = 'n';
-
-  while ((c = getopt(argc_, argv_, "hbvs:f:t:i:r:")) != -1) {
-    switch (c) {
-      case 'h':
-        print_help_info = true;
-        break;
-
-      case 'f':
-        src_region = std::stoi(optarg);
-        opt_num++;
-        break;
-
-      case 't':
-        dst_region = std::stoi(optarg);
-        opt_num++;
-        break;
-
-      case 's':
-        data_size = std::stoi(optarg);
-        break;
-
-      case 'i':
-        set_num_iteration(std::stoi(optarg));
-        break;
-
-      case 'r':
-        rec = tolower(*optarg);
-        break;
-
-      case 'b':
-        bench_mark_mode_ = true;
-        break;
-
-      case 'v':
-        verification_ = true;
-        break;
-
-      case '?':
-        if (optopt == 'f' || optopt == 't' || optopt == 's' || optopt == 'i'
-            || optopt == 'r') {
-          std::cout << "Error: Option -f -t -s -i and -r ALL requires argument"
-                    << std::endl;
-          std::cout << help_info << std::endl;
-        }
-
-        ASSERT_NE("Error: Option -f -t -s -i and -r ALL requires argument", "");
-        break;
-
-      default:
-        std::cout << "Error: Please set option argument properly!" << std::endl;
-        std::cout << help_info << std::endl;
-        ASSERT_NE("Error: Please set option argument properly!", "");
-    }
-  }
-
-  //-h option has the highest priority
-  if (print_help_info) {
-    std::cout << help_info << std::endl;
-    PrintTopology();
-    ASSERT_NE("Exit on -h", "");
-  }
-
-  if (opt_num != 2) {
-    std::cout << "You must specify all of -f -t" << std::endl;
-    std::cout << help_info << std::endl;
-    PrintTopology();
-    ASSERT_NE("You must specify all of -f -t", "");
-  }
-
-  // Set transaction
-  transaction trans;
-  trans.src = src_region;
-  trans.dst = dst_region;
-  trans.size = data_size;
-  trans.num_dep_signal = 0;
-  trans.dep_signal = nullptr;
-  err = hsa_signal_create(1, 0, NULL, &trans.signal);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-  tran_.push_back(trans);
-
-  if (!bench_mark_mode_) {
-    while (rec != 'n') {
-      int dep = 0;
-      ;
-      std::cout
-          << "You will add another copy transaction, which will depends on "
-          "previous ones." << std::endl;
-      std::cout << "There are " << tran_.size() <<
-                         " copy transactions already, how many transactions"
-                                 " you want the new transaction depends on?"
-                << std::endl;
-      std::cin >> dep;
-      std::cout
-          << "Please specify which one you want to depend on, separate with "
-          "whitespace, index from 0:" << std::endl;
-      int* dep_ptr = new int[dep];
-
-      for (int i = 0; i < dep; i++) {
-        std::cin >> dep_ptr[i];
-      }
-
-      std::cout << "Please specify the dst memory pool:" << std::endl;
-      std::cin >> dst_region;
-      std::cout << "Please specify the src memory pool:" << std::endl;
-      std::cin >> src_region;
-      std::cout << "Please specify the data size:" << std::endl;
-      std::cin >> data_size;
-      std::cout << "Do you want to add more copy transaction: \"y\" or \"n\"?"
-                << std::endl;
-      char temp;
-      std::cin >> temp;
-      rec = tolower(temp);
-
-      transaction t;
-      t.dst = dst_region;
-      t.src = src_region;
-      t.size = data_size;
-      err = hsa_signal_create(1, 0, NULL, &t.signal);
-      ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-      t.num_dep_signal = dep;
-      hsa_signal_t* signal_ptr = nullptr;
-
-      if (dep != 0) {
-        signal_ptr = new hsa_signal_t[dep];
-      }
-
-      for (int i = 0; i < dep; i++) {
-        signal_ptr[i] = tran_.at(dep_ptr[i]).signal;
-      }
-
-      t.dep_signal = signal_ptr;
-      tran_.push_back(t);
-
-      delete[] dep_ptr;
-    }
-  }
-}
-
-void MemoryAsyncCopy::PrintTopology() {
-  size_t node_num = node_info_.size();
-
-  for (uint32_t i = 0; i < node_num; i++) {
-    node_info node = node_info_.at(i);
-    // Print agent info
-    std::cout << std::endl;
-    std::cout << "Agent #" << node.agent.index_ << ":" << std::endl;
-
-    if (HSA_DEVICE_TYPE_CPU == node.agent.device_type_)
-      std::cout << "Agent Device Type:                             CPU"
-                << std::endl;
-    else if (HSA_DEVICE_TYPE_GPU == node.agent.device_type_)
-      std::cout << "Agent Device Type:                             GPU"
-                << std::endl;
-
-    // Print region info
-    size_t region_num = node.region.size();
-
-    for (uint32_t j = 0; j < region_num; j++) {
-      std::cout << "    Memory Pool#" << node.region.at(j).index_ << ":"
-                << std::endl;
-      std::cout << "        max allocable size in KB: 		"
-                << node.region.at(j).allocable_size_ / 1024 << std::endl;
-      std::cout << "        is fine-grained: 			"
-                << node.region.at(j).is_fine_grained_ << std::endl;
-    }
-  }
-}
-
-#define RET_IF_MEM_ASYNC_ERR(err) { \
-  if ((err) != HSA_STATUS_SUCCESS) { \
-    std::cout << "hsa api call failure at line " << __LINE__ << ", file: " << \
-              __FILE__ << ". Call returned " << err << std::endl; \
-    return (err); \
-  } \
-}
-
-hsa_status_t RegionInfo(hsa_amd_memory_pool_t region, void* data) {
+static hsa_status_t GetPoolInfo(hsa_amd_memory_pool_t pool, void* data) {
   hsa_status_t err;
   MemoryAsyncCopy* ptr = reinterpret_cast<MemoryAsyncCopy*>(data);
-  // Query region segment, only report global one
+  // Query pool segment, only report global one
   hsa_amd_segment_t region_segment;
-  err = hsa_amd_memory_pool_get_info(region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT,
+  err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT,
                                      &region_segment);
-  RET_IF_MEM_ASYNC_ERR(err);
+  RET_IF_HSA_ERR(err);
 
-  if (HSA_AMD_SEGMENT_GLOBAL != region_segment) {
+  if (region_segment != HSA_AMD_SEGMENT_GLOBAL) {
     return HSA_STATUS_SUCCESS;
   }
 
-  // Check if the region is alloc allowed, if not, discard this region
+  // Check if the pool is alloc allowed, if not, discard this pool
   bool alloc_allowed = false;
-  err = hsa_amd_memory_pool_get_info(region,
+  err = hsa_amd_memory_pool_get_info(pool,
               HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &alloc_allowed);
-  RET_IF_MEM_ASYNC_ERR(err);
+  RET_IF_HSA_ERR(err);
 
   if (alloc_allowed != true) {
     return HSA_STATUS_SUCCESS;
@@ -698,56 +460,166 @@ hsa_status_t RegionInfo(hsa_amd_memory_pool_t region, void* data) {
 
   // Query the max allocable size
   size_t alloc_max_size = 0;
-  err = hsa_amd_memory_pool_get_info(region, HSA_AMD_MEMORY_POOL_INFO_SIZE,
+  err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SIZE,
                                      &alloc_max_size);
-  RET_IF_MEM_ASYNC_ERR(err);
+  RET_IF_HSA_ERR(err);
 
-  // Check if the region is fine-grained or coarse-grained
+  // Check if the pool is fine-grained or coarse-grained
   uint32_t global_flag = 0;
-  err = hsa_amd_memory_pool_get_info(region,
+  err = hsa_amd_memory_pool_get_info(pool,
                         HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag);
-  RET_IF_MEM_ASYNC_ERR(err);
+  RET_IF_HSA_ERR(err);
 
   bool is_fine_grained = HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED
                          & global_flag;
-  // ptr->region_info_.push_back(region_info(region, ptr->region_index_,
-  // region_segment, is_fine_grained, host_accessible, alloc_max_size));
 
-  ptr->region_info_.push_back(
-    region_info(region, ptr->region_index_, region_segment, is_fine_grained,
-                alloc_max_size, ptr->agent_info_.back().agent_));
+  int pool_i = ptr->pool_index();
+  int ag_ind = ptr->agent_index();
+  ptr->pool_info()->push_back(
+    new PoolInfo(pool, pool_i, region_segment, is_fine_grained,
+                                  alloc_max_size, ptr->agent_info()->back()));
 
-  // Construct node_info and push back to node_info_
-  ptr->node_info_[ptr->agent_index_].region.push_back(ptr->region_info_.back());
-  ptr->region_index_++;
+  // Append a copy of the new pool to the current agent's node_info_ entry
+  (*ptr->node_info())[ag_ind].pool.push_back(*ptr->pool_info()->back());
+  ptr->set_pool_index(pool_i + 1);
 
   return HSA_STATUS_SUCCESS;
 }
 
-hsa_status_t AgentInfo(hsa_agent_t agent, void* data) {
+static hsa_status_t GetAgentInfo(hsa_agent_t agent, void* data) {
   MemoryAsyncCopy* ptr = reinterpret_cast<MemoryAsyncCopy*>(data);
 
   hsa_status_t err;
   char name[64];
   err = hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, name);
-  RET_IF_MEM_ASYNC_ERR(err);
+  RET_IF_HSA_ERR(err);
 
   // Get device type
   hsa_device_type_t device_type;
   err = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &device_type);
-  RET_IF_MEM_ASYNC_ERR(err);
+  RET_IF_HSA_ERR(err);
 
-  ptr->agent_info_.push_back(agent_info(agent, ptr->agent_index_, device_type));
+  ptr->agent_info()->push_back(
+                       new AgentInfo(agent, ptr->agent_index(), device_type));
 
-  // Contruct an new node_info structure and push back to node_info_
-  node_info node;
-  node.agent = ptr->agent_info_.back();
-  ptr->node_info_.push_back(node);
-
-  err = hsa_amd_agent_iterate_memory_pools(agent, RegionInfo, ptr);
-  ptr->agent_index_++;
+  // Construct a new NodeInfo structure and push it back to node_info_
+  NodeInfo node;
+  node.agent = *ptr->agent_info()->back();
+  ptr->node_info()->push_back(node);
 
+  err = hsa_amd_agent_iterate_memory_pools(agent, GetPoolInfo, ptr);
+  ptr->set_agent_index(ptr->agent_index() + 1);
   return HSA_STATUS_SUCCESS;
 }
 
-#undef RET_IF_MEM_ASYNC_ERR
+// Enumerate every agent (and its memory pools), then find the system pool.
+void MemoryAsyncCopy::FindTopology() {
+  hsa_status_t err = hsa_iterate_agents(GetAgentInfo, this);
+
+  // Check the enumeration status first so the pool lookup is skipped on error.
+  ASSERT_EQ(HSA_STATUS_SUCCESS, err);
+  FindSystemPool();
+}
+
+void MemoryAsyncCopy::DisplayTestInfo(void) {
+  TestBase::DisplayTestInfo();  // nothing test-specific; forwards to TestBase
+}
+
+// Rebuild tran_: H2D and D2H for the first GPU, plus P2P pairs if available.
+void MemoryAsyncCopy::ConstructTransactionList(void) {
+  tran_.clear();
+
+  int cpu_pool_indx = -1;
+  int gpu1_pool_indx = -1;
+  int gpu2_pool_indx = -1;
+
+  auto push_trans = [&](int from_indx, int to_indx, TransType type) {
+    Transaction t;
+    // Create the signal before allocating so a failure here leaks nothing.
+    hsa_status_t err = hsa_signal_create(1, 0, NULL, &t.signal);
+    ASSERT_EQ(HSA_STATUS_SUCCESS, err);
+    t.src = from_indx;
+    t.dst = to_indx;
+    t.max_size = kMaxCopySize/1024;
+    t.type = type;
+    t.benchmark_copy_time = new  std::vector<double>;
+    t.min_time = new std::vector<double>;
+    tran_.push_back(t);
+  };
+
+  // Pick the first pool of the first CPU and of the first two GPUs
+  for (const NodeInfo& n : *node_info()) {
+    if (n.pool.empty()) {
+      continue;  // no pool was recorded for this agent; pool[0] is invalid
+    }
+    const int first_pool = n.pool[0].index_;
+    const hsa_device_type_t dev = n.agent.device_type();
+    if (cpu_pool_indx == -1 && dev == HSA_DEVICE_TYPE_CPU) {
+      cpu_pool_indx = first_pool;
+    } else if (gpu1_pool_indx == -1 && dev == HSA_DEVICE_TYPE_GPU) {
+      gpu1_pool_indx = first_pool;
+    } else if (gpu2_pool_indx == -1 && dev == HSA_DEVICE_TYPE_GPU) {
+      gpu2_pool_indx = first_pool;
+      break;
+    }
+  }
+
+  ASSERT_NE(cpu_pool_indx, -1);
+  ASSERT_NE(gpu1_pool_indx, -1);
+
+  push_trans(cpu_pool_indx, gpu1_pool_indx, H2D);
+  push_trans(gpu1_pool_indx, cpu_pool_indx, D2H);
+
+  if (do_full_test_) {
+    for (const NodeInfo& n : *node_info()) {
+      if (n.agent.device_type() == HSA_DEVICE_TYPE_CPU) {
+        continue;
+      }
+
+      for (const PoolInfo& p : n.pool) {
+        if (p.index_ == gpu1_pool_indx) {
+          continue;
+        }
+        push_trans(gpu1_pool_indx, p.index_, P2P);
+        push_trans(p.index_, gpu1_pool_indx, P2P);
+      }
+    }
+  } else {
+    if (gpu2_pool_indx != -1) {
+      push_trans(gpu1_pool_indx, gpu2_pool_indx, P2P);
+      push_trans(gpu2_pool_indx, gpu1_pool_indx, P2P);
+    }
+  }
+}
+
+// Print each discovered agent followed by the memory pools recorded
+// for it (pool index, allocable size in KB, fine-grained flag).
+void MemoryAsyncCopy::PrintTopology(void) {
+  for (const NodeInfo& entry : *node_info()) {
+    // Agent header
+    std::cout << std::endl;
+    std::cout << "Agent #" << entry.agent.index_ << ":" << std::endl;
+
+    // Device types other than CPU and GPU print no type line.
+    const hsa_device_type_t kind = entry.agent.device_type();
+    if (kind == HSA_DEVICE_TYPE_CPU) {
+      std::cout << "Agent Device Type:                             CPU"
+                << std::endl;
+    } else if (kind == HSA_DEVICE_TYPE_GPU) {
+      std::cout << "Agent Device Type:                             GPU"
+                << std::endl;
+    }
+
+    // One entry per pool recorded for this agent
+    for (const PoolInfo& p : entry.pool) {
+      std::cout << "    Memory Pool#" << p.index_ << ":"
+                << std::endl;
+      std::cout << "        max allocable size in KB: \t\t"
+                << p.allocable_size_ / 1024 << std::endl;
+      std::cout << "        is fine-grained: \t\t\t"
+                << p.is_fine_grained_ << std::endl;
+    }
+  }
+}
+
+#undef RET_IF_HSA_ERR
diff --git a/rocrtst/suites/performance/memory_async_copy.h b/rocrtst/suites/performance/memory_async_copy.h
index 001884a499..6cbf9d7913 100755
--- a/rocrtst/suites/performance/memory_async_copy.h
+++ b/rocrtst/suites/performance/memory_async_copy.h
@@ -43,199 +43,182 @@
  *
  */
 
-#ifndef __ROCRTST_SRC_MEMORY_ASYNC_COPY_H__
-#define __ROCRTST_SRC_MEMORY_ASYNC_COPY_H__
+#ifndef ROCRTST_SUITES_PERFORMANCE_MEMORY_ASYNC_COPY_H_
+#define ROCRTST_SUITES_PERFORMANCE_MEMORY_ASYNC_COPY_H_
+
+#include <vector>
+#include <algorithm>
 
-#include "perf_common/perf_base.h"
 #include "common/base_rocr.h"
-#include "common/common.h"
-#include "common/hsatimer.h"
 #include "hsa/hsa.h"
 #include "hsa/hsa_ext_amd.h"
-#include <unistd.h>
-#include <algorithm>
-#include <vector>
-#include <cctype>
+#include "suites/test_common/test_base.h"
 
-extern int mac_argc;
-extern char** mac_argv;
+typedef enum TransType {H2D = 0, D2H, P2P} TransType;
 
-typedef struct transaction {
+typedef struct Transaction {
   int src;
   int dst;
   hsa_signal_t signal;
-  size_t size;
-  size_t num_dep_signal;
-  hsa_signal_t* dep_signal;
-} transaction;
+  size_t max_size;  // Max. amount of kBytes to copy
+  TransType type;
+  // BenchMark copy time
+  std::vector<double> *benchmark_copy_time;
+  // Min time
+  std::vector<double> *min_time;
+} Transaction;
 
-typedef struct agent_info {
-  agent_info(hsa_agent_t agent, int index, hsa_device_type_t device_type) {
-    agent_ = agent;
-    index_ = index;
-    device_type_ = device_type;
-  }
-  agent_info() {
-  }
-  hsa_agent_t agent_;
-  int index_;
-  hsa_device_type_t device_type_;
-} agent_info;
+// Describes one HSA agent: its handle, an index and its device type.
+class AgentInfo {
+ public:
+    AgentInfo(hsa_agent_t agent, int index, hsa_device_type_t device_type)
+        : agent_(agent), index_(index), device_type_(device_type) {}
+
+    // Default state: zeroed handle, index -1, zero-valued device type.
+    AgentInfo() : agent_(), index_(-1), device_type_() {}
+
+    ~AgentInfo() {}
+    hsa_agent_t agent(void) const {return agent_;}
+    hsa_device_type_t device_type(void) const {return device_type_;}
+
+    hsa_agent_t agent_;
+    int index_;
+
+ private:
+    hsa_device_type_t device_type_;
+};
+
+// Describes one memory pool and points (without owning) at its agent's info.
+class PoolInfo {
+ public:
+    PoolInfo(hsa_amd_memory_pool_t pool, int index,
+               hsa_amd_segment_t segment, bool is_fine_graind, size_t size,
+               AgentInfo *agent_info)
+        : pool_(pool), index_(index), segment_(segment),
+          is_fine_grained_(is_fine_graind), allocable_size_(size),
+          owner_agent_info_(agent_info) {}
+    // Default state: zeroed handle/segment, index -1, size 0, no owner.
+    PoolInfo()
+        : pool_(), index_(-1), segment_(), is_fine_grained_(false),
+          allocable_size_(0), owner_agent_info_(nullptr) {}
+    ~PoolInfo() {}
+    AgentInfo* owner_agent_info(void) const {return owner_agent_info_;}
+    hsa_amd_memory_pool_t pool_;
+    int index_;
+    hsa_amd_segment_t segment_;
+    bool is_fine_grained_;
+    size_t allocable_size_;
+ private:
+    AgentInfo *owner_agent_info_;
+};
 
-typedef struct region_info {
-  region_info(hsa_amd_memory_pool_t region, int index,
-              hsa_amd_segment_t segment, bool is_fine_graind, size_t size,
-              hsa_agent_t agent) {
-    region_ = region;
-    index_ = index;
-    segment_ = segment;
-    is_fine_grained_ = is_fine_graind;
-    allocable_size_ = size;
-    owner_agent_ = agent;
-  }
-  region_info() {
-  }
-  hsa_amd_memory_pool_t region_;
-  int index_;
-  hsa_amd_segment_t segment_;
-  bool is_fine_grained_;
-  size_t allocable_size_;
-  hsa_agent_t owner_agent_;
-} region_info;
 
 // Used to print out topology info
-typedef struct node_info {
-  node_info() {
-  }
-  agent_info agent;
-  std::vector<region_info> region;
-} node_info;
+struct NodeInfo {
+  AgentInfo agent;             // the agent this node describes
+  std::vector<PoolInfo> pool;  // pools recorded for that agent
+};
 
-hsa_status_t AgentInfo(hsa_agent_t agent, void* data);
-hsa_status_t RegionInfo(hsa_amd_memory_pool_t region, void* data);
 
-class MemoryAsyncCopy: public rocrtst::BaseRocR, public PerfBase {
+class MemoryAsyncCopy : public TestBase {
  public:
   MemoryAsyncCopy();
 
-  //@Brief: Destructor for test case of MemoryAsyncCopy
+  // @Brief: Destructor for test case of MemoryAsyncCopy
   virtual ~MemoryAsyncCopy();
 
-  //@Brief: Setup the environment for measurement
+  // @Brief: Setup the environment for measurement
   virtual void SetUp();
 
-  //@Brief: Core measurement execution
+  // @Brief: Core measurement execution
   virtual void Run();
 
-  //@Brief: Clean up and retrive the resource
+  // @Brief: Clean up and retrieve the resource
   virtual void Close();
 
-  //@Brief: Display  results
+  // @Brief: Display  results
   virtual void DisplayResults() const;
 
+  // There are 3 levels of testing, from quickest/very specific to
+  // longest/most complete:
+  // 1. to and from a specified source to a specified target
+  // 2. to and from the cpu to 1 gpu, and to/from a gpu to another gpu
+  //    (if available)
+  // 3. to and from the cpu to 1 gpu and, to/from every gpu to every
+  //    other gpu
+  // The default is #2 above. If *both* a source and dest. are set for #1
+  // above, then that overrides both #2 and #3
+  void set_src_pool(int pool_id) {src_pool_id_ = pool_id;}
+  void set_dst_pool(int pool_id) {dst_pool_id_ = pool_id;}
+  void set_full_test(bool full_test) {do_full_test_ = full_test;}
+  int pool_index(void) const {return pool_index_;}
+  void set_pool_index(int i) {pool_index_ = i;}
+  int agent_index(void) const {return agent_index_;}
+  void set_agent_index(int i) {agent_index_ = i;}
+  std::vector<PoolInfo *> *pool_info(void) {return &pool_info_;}
+  std::vector<AgentInfo *> *agent_info(void) {return &agent_info_;}
+  std::vector<NodeInfo> *node_info(void) {return &node_info_;}
+
+  // @Brief: Display information about what this test does
+  virtual void DisplayTestInfo(void);
+
  private:
-  //@Brief: Get real iteration number
-  virtual size_t RealIterationNum();
+  // @Brief: Get real iteration number
+  virtual size_t RealIterationNum(void);
 
-  //@Brief: Get the mean copy time
-  virtual double GetMeanTime(std::vector<double>& vec);
+  // @Brief: Get the mean copy time
+  double GetMeanTime(std::vector<double>* vec);
 
-  //@Brief: Get the min copy time
-  virtual double GetMinTime(std::vector<double>& vec);
+  // @Brief: Find and print out the needed topology info
+  void FindTopology(void);
 
-  //@Brief: Find and print out the needed topology info
-  void FindTopology();
+  // @Brief: Run for Benchmark mode with verification
+  void RunBenchmarkWithVerification(Transaction *t);
 
-  //@Brief: Parse the argument and interact with the user
-  // to fill the vectors.
-  void ParseArgument();
+  // @Brief: Display Benchmark result
+  void DisplayBenchmark(Transaction *t) const;
 
-  //@Brief: Run for Benchmark mode
-  void RunBenchmark();
+  // @Brief: Print topology info
+  void PrintTopology(void);
 
-  //@Brief: Run for Benchmark mode with verification
-  void RunBenchmarkWithVerification();
+  void ConstructTransactionList(void);
 
-  //@Brief: Dispaly Benchmark result
-  void DisplayBenchmark();
+  // @Brief: Find system pool
+  void FindSystemPool(void);
 
-  //@Brief: Run user defined
-  void RunNormal();
-
-  //@Brief: Print topology info
-  void PrintTopology();
-
-  //@Brief: Find system region
-  void FindSystemRegion();
-
-  //@Brief: Check if agent and access memory pool, if so, set
-  //access to the agent, if not, exit
-  void AcquireAccess(hsa_agent_t agent, hsa_amd_memory_pool_t pool, void* ptr);
-
-  friend hsa_status_t AgentInfo(hsa_agent_t agent, void* data);
-  friend hsa_status_t RegionInfo(hsa_amd_memory_pool_t region, void* data);
-
- protected:
   // More variables declared for testing
-  std::vector<transaction> tran_;
+  std::vector<Transaction> tran_;
 
   // Variable used to store agent info, indexed by agent_index_
-  std::vector<agent_info> agent_info_;
+  std::vector<AgentInfo *> agent_info_;
 
-  // Variable used to store region info, indexed by region_index_
-  std::vector<region_info> region_info_;
+  // Variable used to store pool info, indexed by pool_index_
+  std::vector<PoolInfo *> pool_info_;
 
-  // Variable to store argument number
-  int argc_;
-
-  // Pointer to store address of argument text
-  char** argv_;
+  // To store node info
+  std::vector<NodeInfo> node_info_;
 
   // Variable to help count agent index
   int agent_index_;
 
   // Variable to help count region index
-  int region_index_;
-
-  // BenchMark mode by default
-  bool bench_mark_mode_;
-
-  // BenchMark copy time
-  std::vector<double> benchmark_copy_time_;
-
-  // Min time
-  std::vector<double> min_time_;
-
-  // User define copy time
-  double user_copy_time_;
+  int pool_index_;
 
   // Verification result
   bool verified_;
 
-  // If it needs verification
-  bool verification_;
-
-  // To store node info
-  std::vector<node_info> node_info_;
+  // Store the testing level
+  int src_pool_id_;
+  int dst_pool_id_;
+  bool do_full_test_;
 
   // System region
-  hsa_amd_memory_pool_t sys_region_;
+  hsa_amd_memory_pool_t sys_pool_;
 
   // CPU agent used for verification
   hsa_agent_t cpu_agent_;
 
-  constexpr const static char* help_info = 
-     MULTILINE(. / memory_async_copy - f source_region - t dst_region - s data_size_in_KB - r[y | n] - i iteration_number - b\n\
-      \n\
-      -h Help info \n\
-      -f Memory Pool where data copy from \n\
-      -t Memory Pool where data copy to \n\
-
-    -s Size of copy data, 256MB by default \n\
-        -r If wants to add more copy \n\
-        -i Iteration number for each copy \n\
-        -b Enable benchmark mode \n\
-        Note : -f - t must be specified\n);
+  rocrtst::PerfTimer copy_timer_;
 };
 
-#endif
+#endif  // ROCRTST_SUITES_PERFORMANCE_MEMORY_ASYNC_COPY_H_
diff --git a/rocrtst/suites/performance/memory_copy.cc b/rocrtst/suites/performance/memory_copy.cc
deleted file mode 100755
index a08306fe4c..0000000000
--- a/rocrtst/suites/performance/memory_copy.cc
+++ /dev/null
@@ -1,411 +0,0 @@
-/*
- * =============================================================================
- *   ROC Runtime Conformance Release License
- * =============================================================================
- * The University of Illinois/NCSA
- * Open Source License (NCSA)
- *
- * Copyright (c) 2017, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Developed by:
- *
- *                 AMD Research and AMD ROC Software Development
- *
- *                 Advanced Micro Devices, Inc.
- *
- *                 www.amd.com
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal with the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- *  - Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimers.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimers in
- *    the documentation and/or other materials provided with the distribution.
- *  - Neither the names of <Name of Development Group, Name of Institution>,
- *    nor the names of its contributors may be used to endorse or promote
- *    products derived from this Software without specific prior written
- *    permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS WITH THE SOFTWARE.
- *
- */
-
-#include "memory_copy.h"
-#include "common/base_rocr_utils.h"
-#include "common/common.h"
-#include "hsa/hsa.h"
-#include "gtest/gtest.h"
-#include <algorithm>
-
-MemoryCopy::MemoryCopy(size_t num) :
-  BaseRocR() {
-  ptr_src_ = NULL;
-  ptr_dst_ = NULL;
-  ptr_dev_src_ = NULL;
-  ptr_dev_dst_ = NULL;
-  device_region_.handle = 0;
-  set_requires_profile (HSA_PROFILE_BASE);
-}
-
-MemoryCopy::~MemoryCopy() {
-}
-
-const char* MemoryCopy::Str[16] = {"64K", "128K", "256K", "512K", "1M", "2M",
-                                   "4M", "8M", "16M", "32M", "64M", "128M",
-                                   "256M", "512M", "1G", "2G"
-                                  };
-const size_t MemoryCopy::Size[16] = {64*1024, 128*1024, 256*1024, 512*1024,
-                                     1024*1024, 2048*1024, 4096*1024,
-                                     8*1024*1024, 16*1024* 1024, 32*1024*1024,
-                                     64*1024*1024, 128*1024*1024, 256*1024*1024,
-                                     512*1024*1024, 1024*1024*1024,
-                                     (size_t)2*1024*1024* 1024
-                                    };
-
-
-void MemoryCopy::SetUp() {
-  hsa_status_t err;
-
-  if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
-    return;
-  }
-
-  hsa_agent_t* gpu_dev = gpu_device1();
-  hsa_agent_t* cpu_dev = cpu_device();
-
-  // Find system memory pool for kernarg allocation.
-  // hsa_amd_memory_pool_t sys_coarse_grained_pool;
-  err = hsa_amd_agent_iterate_memory_pools(*cpu_dev, rocrtst::FindGlobalPool,
-        &cpu_pool());
-  ASSERT_EQ(err, HSA_STATUS_INFO_BREAK);
-
-  ASSERT_NE(cpu_pool().handle, 0);
-
-  // Get local memory pool of the first GPU.
-  // hsa_amd_memory_pool_t gpu_pool_;
-  err = hsa_amd_agent_iterate_memory_pools(*gpu_dev, rocrtst::FindStandardPool,
-        &device_pool());
-  ASSERT_EQ(err, HSA_STATUS_INFO_BREAK);
-  ASSERT_NE(device_pool().handle, 0);
-
-  //Allocate buffers whose size is 2GB
-  err = hsa_amd_memory_pool_allocate(cpu_pool(), Size[12], 0, &ptr_src_);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  err = hsa_amd_memory_pool_allocate(cpu_pool(), Size[12], 0, &ptr_dst_);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  err = hsa_amd_memory_pool_allocate(device_pool(), Size[11], 0, &ptr_dev_src_);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  err = hsa_amd_memory_pool_allocate(device_pool(), Size[11], 0, &ptr_dev_dst_);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  //Assign the region ownership to GPU
-  err = hsa_memory_assign_agent(ptr_dev_src_, *gpu_dev,
-                                HSA_ACCESS_PERMISSION_RW);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  err = hsa_memory_assign_agent(ptr_dev_dst_, *gpu_dev,
-                                HSA_ACCESS_PERMISSION_RW);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  //rocrtst::CommonCleanUp the two buffer, src to 1 each byte and dst to 0
-  err = hsa_amd_memory_fill(ptr_src_, 1, Size[12]);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  //Check if the initialization is correct
-#if DEBUG
-  std::cout << "Value after setting source buffer is: "
-            << (int)((uint8_t*)ptr_src_)[0] << std::endl;
-#endif
-
-  return;
-}
-
-void MemoryCopy::Run() {
-  hsa_status_t err;
-
-  if (!rocrtst::CheckProfile(this)) {
-    return;
-  }
-
-  uint32_t iterations = RealIterationNum();
-
-  //Iteration over the different data size on system memory
-  for (int i = 0; i < 13; i++) {
-    std::vector<double> time;
-
-    for (uint32_t it = 0; it < iterations; it++) {
-#if DEBUG
-      std::cout << ".";
-      fflush(stdout);
-#endif
-
-      rocrtst::PerfTimer copy_timer;
-      int index = copy_timer.CreateTimer();
-
-      copy_timer.StartTimer(index);
-      err = hsa_memory_copy(ptr_dst_, ptr_src_, Size[i]);
-      copy_timer.StopTimer(index);
-      ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-      // Push the result back to vector time
-      time.push_back(copy_timer.ReadTimer(index));
-
-#if DEBUG
-      //Check if the data copied is correct
-      uint8_t* temp_ptr = (uint8_t*)ptr_dst_;
-
-      for (uint32_t j = 0; j < Size[i]; j++) {
-        ASSERT_EQ(temp_ptr[j], 1);
-      }
-
-#endif
-    }
-
-#if DEBUG
-    std::cout << std::endl;
-#endif
-
-    //Get mean copy time and store to the array
-    sys2sys_copy_time_.push_back(GetMeanTime(time));
-  }
-
-  //Copy from system memory to device memory
-  for (int i = 0; i < 12; i++) {
-    std::vector<double> time;
-
-    for (uint32_t it = 0; it < iterations; it++) {
-#if DEBUG
-      std::cout << ".";
-      fflush(stdout);
-#endif
-
-      rocrtst::PerfTimer copy_timer;
-      int index = copy_timer.CreateTimer();
-
-      copy_timer.StartTimer(index);
-      err = hsa_memory_copy(ptr_dev_src_, ptr_src_, Size[i]);
-      copy_timer.StopTimer(index);
-      ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-      // Push the result back to vector time
-      time.push_back(copy_timer.ReadTimer(index));
-
-#if DEBUG
-      //Check if the data copied is correct
-      uint8_t* temp_ptr = (uint8_t*)ptr_dst_;
-
-      for (uint32_t j = 0; j < Size[i]; j++) {
-        ASSERT_EQ(temp_ptr[j], 1);
-      }
-
-#endif
-    }
-
-#if DEBUG
-    std::cout << std::endl;
-#endif
-
-    //Get mean copy time and store to the array
-    sys2dev_copy_time_.push_back(GetMeanTime(time));
-  }
-
-  //Copy from device memory to device memory
-  for (int i = 0; i < 12; i++) {
-    std::vector<double> time;
-
-    for (uint32_t it = 0; it < iterations; it++) {
-#if DEBUG
-      std::cout << ".";
-      fflush(stdout);
-#endif
-
-      rocrtst::PerfTimer copy_timer;
-      int index = copy_timer.CreateTimer();
-
-      copy_timer.StartTimer(index);
-      err = hsa_memory_copy(ptr_dev_dst_, ptr_dev_src_, Size[i]);
-      copy_timer.StopTimer(index);
-      ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-      // Push the result back to vector time
-      time.push_back(copy_timer.ReadTimer(index));
-
-#if DEBUG
-      //Check if the data copied is correct
-      uint8_t* temp_ptr = (uint8_t*)ptr_dst_;
-
-      for (uint32_t j = 0; j < Size[i]; j++) {
-        ASSERT_EQ(temp_ptr[j], 1);
-      }
-
-#endif
-    }
-
-#if DEBUG
-    std::cout << std::endl;
-#endif
-
-    //Get mean copy time and store to the array
-    dev2dev_copy_time_.push_back(GetMeanTime(time));
-  }
-
-  //Copy from device memory to system memory
-  for (int i = 0; i < 12; i++) {
-    std::vector<double> time;
-
-    for (uint32_t it = 0; it < iterations; it++) {
-#if DEBUG
-      std::cout << ".";
-      fflush(stdout);
-#endif
-
-      rocrtst::PerfTimer copy_timer;
-      int index = copy_timer.CreateTimer();
-
-      copy_timer.StartTimer(index);
-      err = hsa_memory_copy(ptr_dst_, ptr_dev_src_, Size[i]);
-      copy_timer.StopTimer(index);
-      ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-      // Push the result back to vector time
-      time.push_back(copy_timer.ReadTimer(index));
-
-#if DEBUG
-      //Check if the data copied is correct
-      uint8_t* temp_ptr = (uint8_t*)ptr_dst_;
-
-      for (uint32_t j = 0; j < Size[i]; j++) {
-        if (temp_ptr[j] != 1) {
-          ASSERT_EQ(temp_ptr[j], 1);
-        }
-      }
-
-#endif
-    }
-
-#if DEBUG
-    std::cout << std::endl;
-#endif
-
-    //Get mean copy time and store to the array
-    dev2sys_copy_time_.push_back(GetMeanTime(time));
-  }
-}
-
-size_t MemoryCopy::RealIterationNum() {
-  return num_iteration() * 1.2 + 1;
-}
-
-double MemoryCopy::GetMeanTime(std::vector<double>& vec) {
-  std::sort(vec.begin(), vec.end());
-
-  vec.erase(vec.begin());
-  vec.erase(vec.begin(), vec.begin() + num_iteration() * 0.1);
-  vec.erase(vec.begin() + num_iteration(), vec.end());
-
-  double mean = 0.0;
-  int num = vec.size();
-
-  for (int it = 0; it < num; it++) {
-    //        printf("%f\n", vec[it]);
-    mean += vec[it];
-  }
-
-  mean /= num;
-  return mean;
-}
-
-void MemoryCopy::DisplayResults() const {
-
-  if (!rocrtst::CheckProfile(this)) {
-    return;
-  }
-
-  printf(
-    "================ System to System ==================================\n");
-  printf("  Data Size                      BandWidth(GB/s)\n");
-
-  //Output the BW of system memory to system memory
-  for (int i = 0; i < 13; i++) {
-    double band_width = (double) Size[i] / sys2sys_copy_time_[i] / 1024 / 1024
-                        / 1024 * 2;
-#ifdef DEBUG
-    printf("size: %zu      time: %f\n", Size[i], sys2sys_copy_time_[i]);
-#endif
-    printf("  %s                             %lf\n", Str[i], band_width);
-  }
-
-  printf(
-    "================ System to Device ===================================\n");
-
-  for (int i = 0; i < 12; i++) {
-    double band_width = (double) Size[i] / sys2dev_copy_time_[i] / 1024 / 1024
-                        / 1024 * 2;
-#ifdef DEBUG
-    printf("size: %zu      time: %f\n", Size[i], sys2dev_copy_time_[i]);
-#endif
-    printf("  %s                             %lf\n", Str[i], band_width);
-  }
-
-  printf(
-    "================ Device to Device ===================================\n");
-
-  for (int i = 0; i < 12; i++) {
-    double band_width = (double) Size[i] / dev2dev_copy_time_[i] / 1024 / 1024
-                        / 1024 * 2;
-#ifdef DEBUG
-    printf("size: %zu      time: %f\n", Size[i], dev2dev_copy_time_[i]);
-#endif
-    printf("  %s                             %lf\n", Str[i], band_width);
-  }
-
-  printf(
-    "================ Device to System ===================================\n");
-
-  for (int i = 0; i < 12; i++) {
-    double band_width = (double) Size[i] / dev2sys_copy_time_[i] / 1024 / 1024
-                        / 1024 * 2;
-#ifdef DEBUG
-    printf("size: %zu      time: %f\n", Size[i], dev2sys_copy_time_[i]);
-#endif
-    printf("  %s                             %lf\n", Str[i], band_width);
-  }
-
-  printf("===================================================\n");
-  return;
-}
-
-void MemoryCopy::Close() {
-  hsa_status_t err;
-
-  //Free the memory allocated
-  err = hsa_memory_free(ptr_src_);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  err = hsa_memory_free(ptr_dst_);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  ptr_src_ = NULL;
-  ptr_dst_ = NULL;
-
-  err = rocrtst::CommonCleanUp(this);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  return;
-}
diff --git a/rocrtst/suites/performance/memory_copy.h b/rocrtst/suites/performance/memory_copy.h
deleted file mode 100644
index f6f2deb1eb..0000000000
--- a/rocrtst/suites/performance/memory_copy.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * =============================================================================
- *   ROC Runtime Conformance Release License
- * =============================================================================
- * The University of Illinois/NCSA
- * Open Source License (NCSA)
- *
- * Copyright (c) 2017, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Developed by:
- *
- *                 AMD Research and AMD ROC Software Development
- *
- *                 Advanced Micro Devices, Inc.
- *
- *                 www.amd.com
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal with the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- *  - Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimers.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimers in
- *    the documentation and/or other materials provided with the distribution.
- *  - Neither the names of <Name of Development Group, Name of Institution>,
- *    nor the names of its contributors may be used to endorse or promote
- *    products derived from this Software without specific prior written
- *    permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS WITH THE SOFTWARE.
- *
- */
-
-#ifndef __ROCRTST_SRC_MEMORY_MEM_COPY_H__
-#define __ROCRTST_SRC_MEMORY_MEM_COPY_H__
-
-#include "common/base_rocr.h"
-#include "perf_common/perf_base.h"
-#include "hsa/hsa.h"
-#include "common/hsatimer.h"
-#include <vector>
-
-class MemoryCopy: public rocrtst::BaseRocR, public PerfBase {
-
- public:
-  //@Brief: Constructor for test case of MemoryCopy
-  MemoryCopy(size_t num = 100);
-
-  //@Brief: Destructor for test case of MemoryCopy
-  virtual ~MemoryCopy();
-
-  //@Brief: Setup the environment for measurement
-  virtual void SetUp();
-
-  //@Brief: Core measurement execution
-  virtual void Run();
-
-  //@Brief: Clean up and retrive the resource
-  virtual void Close();
-
-  //@Brief: Display  results
-  virtual void DisplayResults() const;
-
- private:
-  //@Brief: Define copy data size and corresponding string
-  static const size_t Size[16];
-  static const char* Str[16];
-
-  //@Brief: Get real iteration number
-  virtual size_t RealIterationNum();
-
-  //@Brief: Get the mean copy time
-  virtual double GetMeanTime(std::vector<double>& vec);
-
- protected:
-  //@Brief: More variables declared for testing
-  //@Brief: Source pointer from which data copy
-  void* ptr_src_;
-
-  //@Brief: Destination pointer to which data copy
-  void* ptr_dst_;
-
-  //@Brief: Pointer to device memory
-  void* ptr_dev_src_;
-  void* ptr_dev_dst_;
-
-  //@Brief: Array to store the timer results for each data size
-  std::vector<double> sys2sys_copy_time_;
-  std::vector<double> sys2dev_copy_time_;
-  std::vector<double> dev2sys_copy_time_;
-  std::vector<double> dev2dev_copy_time_;
-
-  //@Brief: Device memory region
-  hsa_region_t device_region_;
-};
-
-#endif
diff --git a/rocrtst/suites/performance/queue_concurrency.cc b/rocrtst/suites/performance/queue_concurrency.cc
deleted file mode 100755
index b127e35749..0000000000
--- a/rocrtst/suites/performance/queue_concurrency.cc
+++ /dev/null
@@ -1,284 +0,0 @@
-/*
- * =============================================================================
- *   ROC Runtime Conformance Release License
- * =============================================================================
- * The University of Illinois/NCSA
- * Open Source License (NCSA)
- *
- * Copyright (c) 2017, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Developed by:
- *
- *                 AMD Research and AMD ROC Software Development
- *
- *                 Advanced Micro Devices, Inc.
- *
- *                 www.amd.com
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal with the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- *  - Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimers.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimers in
- *    the documentation and/or other materials provided with the distribution.
- *  - Neither the names of <Name of Development Group, Name of Institution>,
- *    nor the names of its contributors may be used to endorse or promote
- *    products derived from this Software without specific prior written
- *    permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS WITH THE SOFTWARE.
- *
- */
-#include "queue_concurrency.h"
-#include "common/base_rocr_utils.h"
-#include "common/common.h"
-#include "common/helper_funcs.h"
-#include "common/hsatimer.h"
-#include "common/os.h"
-#include "hsa/hsa_ext_finalize.h"
-#include "gtest/gtest.h"
-
-#include <math.h>
-#include <thread>
-
-QueueConcurrency::QueueConcurrency() :
-  BaseRocR(), execution_time_(8) {
-  queue_num_ = 0;
-  std_time_ = 0.0;
-
-  set_enable_interrupt(true);
-  set_requires_profile (HSA_PROFILE_FULL);
-}
-
-QueueConcurrency::~QueueConcurrency() {
-}
-
-void QueueConcurrency::SetUp() {
-  hsa_status_t err;
-
-  set_kernel_file_name("test_kernel.o");
-  set_kernel_name("&__OpenCL_vec_assign_kernel");
-
-  if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
-    return;
-  }
- 
-  rocrtst::LoadKernelFromObjFile(this);
-
-  hsa_agent_t* gpu_dev = gpu_device1();
-
-  // Fill up part of aql pakcet which are the same cross the threads
-  rocrtst::InitializeAQLPacket(this, &aql());
-
-  // Create a queue
-  hsa_queue_t* q = main_queue();
-  rocrtst::CreateQueue(*gpu_dev, &q);
-
-  for (int i = 0; i < 2; i++) {
-    // Output of kernel
-    int output = 0;
-
-    // Iteration number
-    int iterations = 1024 * 1024; // * 1024;
-
-    struct ALIGNED_(16)
-    args_t {
-      void* arg0;
-      int arg1;
-    } local_args;
-
-    local_args.arg0 = (void*) &output;
-    local_args.arg1 = iterations;
-
-    err = hsa_memory_register(&local_args, sizeof(local_args));
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-    //Obtain the current queue write index.
-    uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1);
-
-    //Write the aql packet at the calculated queue index address.
-
-    const uint32_t queue_mask = main_queue()->size - 1;
-    hsa_kernel_dispatch_packet_t* pkt_addr =
-      (hsa_kernel_dispatch_packet_t*) (main_queue()->base_address);
-
-    (pkt_addr)[index & queue_mask] = aql();
-    (pkt_addr)[index & queue_mask].completion_signal = signal();
-    (pkt_addr)[index & queue_mask].kernarg_address = &local_args;
-
-    //Get timing stamp and ring the doorbell to dispatch the kernel.
-    rocrtst::PerfTimer p_timer;
-    int id = p_timer.CreateTimer();
-    p_timer.StartTimer(id);
-
-    //.type = HSA_PACKET_TYPE_DISPATCH;
-    (pkt_addr)[index & queue_mask].header |= HSA_PACKET_TYPE_KERNEL_DISPATCH
-        << HSA_PACKET_HEADER_TYPE;
-    hsa_signal_store_screlease(main_queue()->doorbell_signal, index);
-
-    //Wait on the dispatch signal until the kernel is finished.
-    while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
-                                     (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
-      ;
-
-    p_timer.StopTimer(id);
-    hsa_signal_store_screlease(signal(), 1);
-
-    if (1 == i) {
-      std_time_ = p_timer.ReadTimer(id);
-    }
-  }
-
-  //Destroy the queue
-  err = hsa_queue_destroy(main_queue());
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-}
-
-void QueueConcurrency::Run() {
-
-  if (!rocrtst::CheckProfile(this)) {
-    return;
-  }
-
-  // Launch 8 child threads
-  std::vector < std::thread > threads;
-
-  for (int i = 0; i < 8; i++) {
-    threads.push_back(std::thread(&QueueConcurrency::ThreadFunc, this, i));
-  }
-
-  // Wait for join
-  for (int i = 0; i < 8; i++) {
-    threads[i].join();
-  }
-
-  CalculateQueueNum();
-}
-
-void QueueConcurrency::CalculateQueueNum() {
-  for (int i = 0; i < 8; i++) {
-    double expected_time = execution_time_[0] / (1 << i);
-    double deviation = sqrt(
-                         (expected_time - execution_time_[i])
-                         * (expected_time - execution_time_[i]));
-
-    if (deviation < 0.1 * expected_time) {
-      queue_num_++;
-    }
-  }
-}
-
-void QueueConcurrency::DisplayResults() const {
-
-  if (!rocrtst::CheckProfile(this)) {
-    return;
-  }
-
-  for (int i = 0; i < 8; i++) {
-    std::cout << execution_time_[i] << std::endl;
-  }
-
-  std::cout << "Number of Concurrent Queue is: " << queue_num_ << std::endl;
-
-  ASSERT_EQ(queue_num_, 3);
-
-  return;
-}
-
-void QueueConcurrency::Close() {
-  hsa_status_t err;
-  err = rocrtst::CommonCleanUp(this);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-}
-
-void QueueConcurrency::ThreadFunc(int threadID) {
-  // Define local queue and signal
-  hsa_queue_t* queue;
-  hsa_signal_t signal;
-  hsa_status_t err;
-  hsa_agent_t* gpu_dev = gpu_device1();
-
-  // Create a signal
-  err = hsa_signal_create(1, 0, NULL, &signal);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  rocrtst::CreateQueue(*gpu_dev, &queue);
-
-  std::vector<double> time;
-
-  for (uint32_t i = 0; i < num_iteration(); i++) {
-    // Output of kernel
-    int output = 0;
-
-    // Iteration number
-    int iterations = 1024 * 1024 / (1 << threadID);
-
-    struct ALIGNED_(16)
-    args_t {
-      void* arg0;
-      int arg1;
-    } local_args;
-
-    local_args.arg0 = (void*) &output;
-    local_args.arg1 = iterations;
-
-    err = hsa_memory_register(&local_args, sizeof(local_args));
-    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-    //Obtain the current queue write index.
-    uint64_t index = hsa_queue_add_write_index_relaxed(queue, 1);
-
-    //Write the aql packet at the calculated queue index address.
-
-    const uint32_t queue_mask = queue->size - 1;
-    hsa_kernel_dispatch_packet_t* pkt_addr =
-      (hsa_kernel_dispatch_packet_t*) (queue->base_address);
-    (pkt_addr)[index & queue_mask] = aql();
-    (pkt_addr)[index & queue_mask].completion_signal = signal;
-    (pkt_addr)[index & queue_mask].kernarg_address = &local_args;
-
-    //Get timing stamp and ring the doorbell to dispatch the kernel.
-    rocrtst::PerfTimer p_timer;
-    int id = p_timer.CreateTimer();
-    p_timer.StartTimer(id);
-
-    //.type = HSA_PACKET_TYPE_DISPATCH;
-    (pkt_addr)[index & queue_mask].header |= HSA_PACKET_TYPE_KERNEL_DISPATCH
-        << HSA_PACKET_HEADER_TYPE;
-    hsa_signal_store_screlease(queue->doorbell_signal, index);
-
-    //Wait on the dispatch signal until the kernel is finished.
-    while (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1,
-                                     (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
-      ;
-
-    p_timer.StopTimer(id);
-    hsa_signal_store_screlease(signal, 1);
-
-    time.push_back(p_timer.ReadTimer(id));
-
-    EXPECT_EQ(output, iterations);
-
-    if (1 == i) {
-      execution_time_[threadID] = p_timer.ReadTimer(id);
-    }
-  }
-
-  time.erase(time.begin());
-  execution_time_[threadID] = rocrtst::CalcMean(time);
-  return;
-}
-
diff --git a/rocrtst/suites/performance/queue_create_destroy_latency.cc b/rocrtst/suites/performance/queue_create_destroy_latency.cc
deleted file mode 100755
index 28c4f9a160..0000000000
--- a/rocrtst/suites/performance/queue_create_destroy_latency.cc
+++ /dev/null
@@ -1,271 +0,0 @@
-/*
- * =============================================================================
- *   ROC Runtime Conformance Release License
- * =============================================================================
- * The University of Illinois/NCSA
- * Open Source License (NCSA)
- *
- * Copyright (c) 2017, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Developed by:
- *
- *                 AMD Research and AMD ROC Software Development
- *
- *                 Advanced Micro Devices, Inc.
- *
- *                 www.amd.com
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal with the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- *  - Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimers.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimers in
- *    the documentation and/or other materials provided with the distribution.
- *  - Neither the names of <Name of Development Group, Name of Institution>,
- *    nor the names of its contributors may be used to endorse or promote
- *    products derived from this Software without specific prior written
- *    permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS WITH THE SOFTWARE.
- *
- */
-
-#include "queue_create_destroy_latency.h"
-#include "common/hsatimer.h"
-#include "common/common.h"
-#include "common/base_rocr_utils.h"
-#include "common/helper_funcs.h"
-#include "hsa/hsa_ext_amd.h"
-#include "hsa/hsa_ext_finalize.h"
-#include "gtest/gtest.h"
-#include <stdio.h>
-
-static const int kGridDimension = 1024;
-
-// Construct the test case class
-QueueLatency::QueueLatency() :
-  BaseRocR() {
-  max_queue_ = 0;
-  in_ = NULL;
-  out_ = NULL;
-}
-
-// Destruct the test case claa
-QueueLatency::~QueueLatency() {
-
-}
-
-void QueueLatency::Close() {
-  hsa_memory_free (in_);
-  hsa_memory_free (out_);
-
-  hsa_status_t err;
-  err = rocrtst::CommonCleanUp(this);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-  return;
-}
-
-// Set up the environment
-void QueueLatency::SetUp() {
-  hsa_status_t err;
-
-  // We get hangs with vector_copy
-  set_kernel_file_name("vector_copy.o");
-  set_kernel_name("&__vector_copy_kernel");
-
-  if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
-    return;
-  }
-
-  hsa_agent_t* gpu_dev = gpu_device1();
-  hsa_agent_t* cpu_dev = cpu_device();
-
-  // Get the max queue which can be active for GPU device
-  err = hsa_agent_get_info(*gpu_dev, HSA_AGENT_INFO_QUEUES_MAX, &max_queue_);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  // Find system coarse grained region
-  err = hsa_amd_agent_iterate_memory_pools(*cpu_dev, rocrtst::FindGlobalPool,
-                                                                   &cpu_pool());
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  size_t pool_size;
-  err = hsa_amd_memory_pool_get_info(cpu_pool(), HSA_AMD_MEMORY_POOL_INFO_SIZE,
-                                                                    &pool_size);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  err = hsa_amd_memory_pool_allocate(cpu_pool(),
-                                     kGridDimension * kGridDimension * 4, 0,
-                                                                (void**) &in_);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  err = hsa_amd_memory_pool_allocate(cpu_pool(),
-                                     kGridDimension * kGridDimension * 4, 0,
-                                                               (void**) &out_);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  //rocrtst::LoadKernelFromObjFile(gpu_dev, "./"+ kernel_file_name() + ".o");
-  rocrtst::LoadKernelFromObjFile(this);
-
-  // Fill up the aql packet
-  rocrtst::InitializeAQLPacket(this, &aql());
-  aql().grid_size_x = kGridDimension * kGridDimension;
-
-  // rocrtst::CommonCleanUp vector memory and register them
-  //memset(in_, 1, kGridDimension*kGridDimension * 4);
-
-  err = hsa_amd_memory_fill(in_, 1, kGridDimension * kGridDimension * 4);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  return;
-}
-
-void QueueLatency::Run() {
-  hsa_agent_t* gpu_dev = gpu_device1();
-  hsa_status_t err;
-
-  if (!rocrtst::CheckProfile(this)) {
-    return;
-  }
-
-  // The outer for loop iterator represents the predefined queue number
-  // After creating a queue, launch a kernel to train the queue, then destroy
-  // TODO:Hardcode max_queue_ to 100
-  max_queue_ = 20;
-
-  for (uint32_t pre_defined_num = 0; pre_defined_num < max_queue_;
-       pre_defined_num++) {
-#ifdef DEBUG
-    std::cout << "Existing queue number: " << pre_defined_num << std::endl;
-#endif
-    // vector to store the creation and destruction time
-    std::vector<double> creation;
-    std::vector<double> destruction;
-    // Create pre_defined_num queues first
-    hsa_queue_t* q;
-
-    for (uint32_t i = 0; i < pre_defined_num; i++) {
-      q = main_queue();
-      rocrtst::CreateQueue(*gpu_dev, &q);
-
-      queues_.push_back(q);
-    }
-
-    for (uint32_t i = 0; i < num_iteration(); i++) {
-      rocrtst::PerfTimer p_timer;
-      int id = p_timer.CreateTimer();
-
-      uint32_t size = 0;
-      err = hsa_agent_get_info(*gpu_dev, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &size);
-      ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-      p_timer.StartTimer(id);
-      hsa_queue_t* q = main_queue();
-
-      err = hsa_queue_create(*gpu_dev, size, HSA_QUEUE_TYPE_MULTI, NULL, NULL,
-                             UINT32_MAX, UINT32_MAX, &q);
-      p_timer.StopTimer(id);
-      ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-      creation.push_back(p_timer.ReadTimer(id));
-
-      p_timer.ResetTimer(id);
-
-      // Launch a kernel to the currently created queue
-      // Allocate kernel parameter
-      typedef struct args_t {
-        void* in_buf;
-        void* out_buf;
-      } args;
-
-      args* kern_ptr = NULL;
-      err = hsa_amd_memory_pool_allocate(cpu_pool(), sizeof(args), 0,
-                                         (void**) &kern_ptr);
-      ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-      kern_ptr->in_buf = in_;
-      kern_ptr->out_buf = out_;
-
-      aql().kernarg_address = kern_ptr;
-
-      // Obtain the current queue write index.
-      uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1);
-
-      // Write the aql packet at the calculated queue index address.
-      const uint32_t queue_mask = main_queue()->size - 1;
-      ((hsa_kernel_dispatch_packet_t*) (main_queue()->base_address))[index
-          & queue_mask] = aql();
-
-      ((hsa_kernel_dispatch_packet_t*) (main_queue()->base_address))[index
-          & queue_mask].header |= HSA_PACKET_TYPE_KERNEL_DISPATCH
-                                  << HSA_PACKET_HEADER_TYPE; 
-      hsa_signal_store_screlease(main_queue()->doorbell_signal, index);
-
-      // Wait on the dispatch signal until the kernel is finished.
-      while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
-                                       (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
-        ;
-
-      hsa_signal_store_screlease(signal(), 1);
-
-      // Destroy the queue and record the timer
-      p_timer.StartTimer(id);
-      err = hsa_queue_destroy(main_queue());
-      p_timer.StopTimer(id);
-      ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-      destruction.push_back(p_timer.ReadTimer(id));
-
-    }
-
-#ifdef DEBUG
-    std::cout << std::endl;
-#endif
-
-    // Destroy the predefined queue
-    for (uint32_t i = 0; i < pre_defined_num; i++) {
-
-      ASSERT_EQ(queues_.size(), pre_defined_num);
-
-      err = hsa_queue_destroy(queues_[i]);
-      ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-    }
-
-    // Clear the queue vector
-    queues_.clear();
-
-    // Get the mean creation and detruction time and push back
-    double creation_mean = rocrtst::CalcMean(creation);
-    double destruction_mean = rocrtst::CalcMean(destruction);
-    construction_mean_.push_back(creation_mean);
-    destruction_mean_.push_back(destruction_mean);
-  }
-}
-
-void QueueLatency::DisplayResults() const {
-
-  if (!rocrtst::CheckProfile(this)) {
-    return;
-  }
-
-  printf("======================================================\n");
-  printf(" Existing queue#        Creation        Destroy\n");
-
-  for (uint32_t i = 0; i < max_queue_; i++) {
-    printf("      %d,         %fms          %fms\n", i,
-           construction_mean_[i] * 1e3, destruction_mean_[i] * 1e3);
-  }
-}
diff --git a/rocrtst/suites/performance/queue_create_destroy_latency.h b/rocrtst/suites/performance/queue_create_destroy_latency.h
deleted file mode 100755
index fba92f87e0..0000000000
--- a/rocrtst/suites/performance/queue_create_destroy_latency.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * =============================================================================
- *   ROC Runtime Conformance Release License
- * =============================================================================
- * The University of Illinois/NCSA
- * Open Source License (NCSA)
- *
- * Copyright (c) 2017, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Developed by:
- *
- *                 AMD Research and AMD ROC Software Development
- *
- *                 Advanced Micro Devices, Inc.
- *
- *                 www.amd.com
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal with the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- *  - Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimers.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimers in
- *    the documentation and/or other materials provided with the distribution.
- *  - Neither the names of <Name of Development Group, Name of Institution>,
- *    nor the names of its contributors may be used to endorse or promote
- *    products derived from this Software without specific prior written
- *    permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS WITH THE SOFTWARE.
- *
- */
-
-#ifndef __ROCRTST_SRC_INC_QUEUE_CREATE_DESTROY_LATENCY_H__
-#define __ROCRTST_SRC_INC_QUEUE_CREATE_DESTROY_LATENCY_H__
-
-#include "perf_common/perf_base.h"
-#include "common/base_rocr.h"
-#include "hsa/hsa.h"
-#include <vector>
-
-class QueueLatency: public rocrtst::BaseRocR, public PerfBase {
- public:
-  //@Brief: Constructor
-  QueueLatency();
-
-  //@Brief: Destructor
-  ~QueueLatency();
-
-  //@Brief: Set up the teset environment
-  virtual void SetUp();
-
-  //@Brief: Run the test
-  virtual void Run();
-
-  //@Brief: Clean up and close the test
-  virtual void Close();
-
-  //@Brief: Display  results
-  virtual void DisplayResults() const;
-
- private:
-  //@Brief: A vector to store the pointers to multiple queues
-  std::vector<hsa_queue_t*> queues_;
-
-  //@Brief: Variable to store the mean time for both queue construction
-  //  and destruction
-  std::vector<double> construction_mean_;
-  std::vector<double> destruction_mean_;
-
-  //@Brief: Variable to store the max number of queue which are active for
-  // device_
-  uint32_t max_queue_;
-
-  //@Brief: Pointer which points to original and destination vector memory
-  // space
-  uint8_t* in_;
-  uint8_t* out_;
-
-};
-
-#endif //__ROCRTST_SRC_INC_QUEUE_CREATE_DESTROY_LATENCY_H__
-
diff --git a/rocrtst/suites/performance/system_load_bandwidth.cc b/rocrtst/suites/performance/system_load_bandwidth.cc
deleted file mode 100755
index b0e1d1ed42..0000000000
--- a/rocrtst/suites/performance/system_load_bandwidth.cc
+++ /dev/null
@@ -1,281 +0,0 @@
-/*
- * =============================================================================
- *   ROC Runtime Conformance Release License
- * =============================================================================
- * The University of Illinois/NCSA
- * Open Source License (NCSA)
- *
- * Copyright (c) 2017, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Developed by:
- *
- *                 AMD Research and AMD ROC Software Development
- *
- *                 Advanced Micro Devices, Inc.
- *
- *                 www.amd.com
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal with the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- *  - Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimers.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimers in
- *    the documentation and/or other materials provided with the distribution.
- *  - Neither the names of <Name of Development Group, Name of Institution>,
- *    nor the names of its contributors may be used to endorse or promote
- *    products derived from this Software without specific prior written
- *    permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS WITH THE SOFTWARE.
- *
- */
-
-#include "system_load_bandwidth.h"
-#include "common/base_rocr_utils.h"
-#include "common/common.h"
-#include "common/helper_funcs.h"
-#include "common/hsatimer.h"
-#include "common/os.h"
-#include "gtest/gtest.h"
-#include <algorithm>
-
-#if 0
-static void initGlobalReadBuffer(uint32_t* in_data, uint32_t num_thrds,
-                                 uint32_t num_ops, uint32_t num_loops) {
-
-  // Populate input buffer with thread Id left shifted by 2.
-  uint32_t value = 0;
-  uint32_t val_idx = 0;
-
-  for (int idx1 = 0; idx1 < num_loops; idx1++) {
-    for (int idx2 = 0; idx2 < num_ops; idx2++) {
-      // Write the value to be read by each thread
-      for (int idx3 = 0; idx3 < num_thrds; idx3++) {
-        value = idx3 << 2;
-        in_data[val_idx++] = value;
-      }
-    }
-  }
-
-  return;
-}
-
-static bool verifyGlobalLoadKernel(uint32_t* data, uint32_t num_thrds,
-                  uint32_t scale, const char* kernel_name, bool print_debug) {
-
-  // Verify kernel operation i.e. validate the data in the output buffer.
-  bool valid = true;
-  uint32_t valid_value = 0;
-
-  for (int idx = 0; idx < num_thrds; idx++) {
-
-    valid_value = (idx << 2) * scale;
-
-    if (print_debug) {
-      std::cout << "Value expected = " << valid_value << std::endl;
-      std::cout << "Value of data = " << data[idx] << std::endl;
-    }
-
-    if (data[idx] != valid_value) {
-      std::cout << kernel_name << ": VALIDATION FAILED ! Bad index: " << idx
-                << std::endl;
-      std::cout << kernel_name << ": VALUE @ Bad index: " << data[idx]
-                << std::endl;
-      std::cout << std::endl;
-      break;
-    }
-  }
-
-#ifdef DEBUG
-  std::cout << kernel_name << ": Passed validation" << std::endl;
-  std::cout << std::endl;
-#endif
-
-  return true;
-}
-#endif
-
-// Constructor
-SystemLoadBandwidth::SystemLoadBandwidth() :
-  BaseRocR() {
-  set_group_size(0);
-  num_group_ = 0;
-  num_cus_ = 0;
-
-  kernel_loop_count_ = 0;
-  mean_ = 0.0;
-  data_size_ = 0;
-  set_enable_interrupt(0);
-}
-
-// Destructor
-SystemLoadBandwidth::~SystemLoadBandwidth() {
-}
-
-// Set up the test environment
-void SystemLoadBandwidth::SetUp() {
-  set_kernel_file_name("sysMemRead.o");
-  set_kernel_name("&__SysMemLoad");
-
-  if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
-    return;
-  }
- 
-  hsa_agent_t* gpu_dev = gpu_device1();
-  SetWorkItemNum();
-
-  //Create a queue with max number size
-  hsa_queue_t* q = main_queue();
-  rocrtst::CreateQueue(*gpu_dev, &q);
-
-  rocrtst::LoadKernelFromObjFile(this);
-
-  uint32_t total_work_items = num_cus_ * num_group_ * group_size();
-
-  //Fill up part of aql
-  rocrtst::InitializeAQLPacket(this, &aql());
-  aql().workgroup_size_x = group_size();
-  aql().grid_size_x = total_work_items;
-
-  return;
-}
-
-// Run the test
-void SystemLoadBandwidth::Run() {
-
-  if (!rocrtst::CheckProfile(this)) {
-    return;
-  }
-
-  uint32_t total_workitems = num_cus_ * num_group_ * group_size();
-  hsa_agent_t* gpu_dev = gpu_device1();
-  hsa_status_t err;
-
-  uint32_t ops_thrd = 32;
-  uint64_t addr_step = (uint64_t) total_workitems * sizeof(uint32_t);
-  uint64_t total_ops = (uint64_t) total_workitems * ops_thrd;
-  uint64_t in_data_size = (uint64_t) total_ops * sizeof(uint32_t);
-  //uint32_t *in_data = (uint32_t *)malloc(in_data_size);
-  err = hsa_amd_agent_iterate_memory_pools(*gpu_dev, rocrtst::FindStandardPool,
-                                                                &device_pool());
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  int32_t* in_data = NULL;
-  err = hsa_amd_memory_pool_allocate(device_pool(), in_data_size, 0,
-                                     (void**) &in_data);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-  memset(in_data, 0, in_data_size);
-  uint32_t out_data_size = total_workitems * sizeof(uint32_t);
-  //uint32_t *out_data = (uint32_t *)malloc(out_data_size);
-  uint32_t* out_data;
-  err = hsa_amd_memory_pool_allocate(device_pool(), out_data_size, 0,
-                                     (void**) &out_data);
-  memset(out_data, 0, out_data_size);
-
-  data_size_ = in_data_size;
-
-  // initGlobalReadBuffer (in_data, total_workitems, ops_thrd,
-  //                                                     kernel_loop_count_);
-
-  typedef struct local_args_t {
-    void* arg0;
-    void* arg1;
-    uint64_t arg2;
-    void* arg3;
-  } args;
-
-  args* kern_ptr = NULL;
-  err = hsa_amd_memory_pool_allocate(device_pool(), sizeof(args), 0,
-                                     (void**) &kern_ptr);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  // in_data is 32 bit ptr, so adding total_ops
-  kern_ptr->arg0 = in_data;
-  kern_ptr->arg1 = in_data + total_ops;
-  kern_ptr->arg2 = addr_step;
-  kern_ptr->arg3 = out_data;
-
-  aql().kernarg_address = kern_ptr;
-
-  std::vector<double> time;
-
-  int it = num_iteration() * 1.2 + 1;
-
-  void *q_base_addr = main_queue()->base_address;
-
-  for (int i = 0; i < it; i++) {
-    // Obtain the current queue write index
-    uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1);
-
-    // Write the aql packet at the calculated queue index address.
-    const uint32_t queue_mask = main_queue()->size - 1;
-    ((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask] = aql();
-
-    rocrtst::PerfTimer p_timer;
-    int id = p_timer.CreateTimer();
-    p_timer.StartTimer(id);
-
-    ((hsa_kernel_dispatch_packet_t*)q_base_addr)[index & queue_mask].header |=
-                     HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
-    hsa_signal_store_screlease(main_queue()->doorbell_signal, index);
-
-    // Wait on the dispatch signal until the kernel is finished.
-    while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
-                                     (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
-      ;
-
-    p_timer.StopTimer(id);
-
-#if DEBUG
-    std::cout << ".";
-    std::cout.flush();
-#endif
-
-    // Verify the results
-    // uint32_t scale = kernel_loop_count_ * ops_thrd;
-    //verifyGlobalLoadKernel(out_data, total_workitems, scale,
-    //                                           kernel_name_.c_str(), false);
-
-    time.push_back(p_timer.ReadTimer(id));
-
-    hsa_signal_store_screlease(signal(), 1);
-  }
-
-  time.erase(time.begin());
-  std::sort(time.begin(), time.end());
-  time.erase(time.begin() + num_iteration(), time.end());
-  mean_ = rocrtst::CalcMean(time);
-
-  return;
-
-}
-
-void SystemLoadBandwidth::Close() {
-  hsa_status_t err;
-  err = rocrtst::CommonCleanUp(this);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-}
-
-void SystemLoadBandwidth::DisplayResults() const {
-
-  if (!rocrtst::CheckProfile(this)) {
-    return;
-  }
-
-  std::cout << "=======================================" << std::endl;
-  std::cout << "System Load Bandwidth:     %f(GB/S)" <<
-            data_size_ / mean_ / 1024 / 1024 / 1024 << std::endl;
-}
diff --git a/rocrtst/suites/performance/system_load_bandwidth.h b/rocrtst/suites/performance/system_load_bandwidth.h
deleted file mode 100755
index 69d90be217..0000000000
--- a/rocrtst/suites/performance/system_load_bandwidth.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * =============================================================================
- *   ROC Runtime Conformance Release License
- * =============================================================================
- * The University of Illinois/NCSA
- * Open Source License (NCSA)
- *
- * Copyright (c) 2017, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Developed by:
- *
- *                 AMD Research and AMD ROC Software Development
- *
- *                 Advanced Micro Devices, Inc.
- *
- *                 www.amd.com
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal with the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- *  - Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimers.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimers in
- *    the documentation and/or other materials provided with the distribution.
- *  - Neither the names of <Name of Development Group, Name of Institution>,
- *    nor the names of its contributors may be used to endorse or promote
- *    products derived from this Software without specific prior written
- *    permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS WITH THE SOFTWARE.
- *
- */
-
-#ifndef __ROCRTST_SRC_INC_SYSTEM_LOAD_BANDWIDTH_H__
-#define __ROCRTST_SRC_INC_SYSTEM_LOAD_BANDWIDTH_H__
-
-#include "perf_common/perf_base.h"
-#include "common/base_rocr.h"
-#include "hsa/hsa.h"
-#include <stdio.h>
-
-class SystemLoadBandwidth: public rocrtst::BaseRocR, public PerfBase {
- public:
-  //@Brief: Constructor
-  SystemLoadBandwidth();
-
-  //@Brief: Destructor
-  ~SystemLoadBandwidth();
-
-  //@Brief: Set up the testing environment
-  virtual void SetUp();
-
-  //@Brief: Run the test case
-  virtual void Run();
-
-  //@Brief: Close and clean up  the test enrionment
-  virtual void Close();
-
-  //@Brief: Display  load bandwidth
-  virtual void DisplayResults() const;
-
-  //@Brief: Set work-item configuration
-  void SetWorkItemNum() {
-#ifdef INTERACTIVE
-    uint32_t tmp;
-    printf("Please input the number of CUs you want to try:\n");
-    scanf("%d", &num_cus_);
-
-    printf("Please input the number of groups you want to try:\n");
-    scanf("%d", &num_group_);
-
-    printf("Please input the size of each group:\n");
-    uint32_t sz = 0;
-    scanf("%d", &tmp);
-    set_group_size(tmp);
-
-    printf("Please input the number of kernel loop you want to try:\n");
-    scanf("%d", &kernel_loop_count_);
-#else
-    num_cus_ = 32;
-    num_group_ = 128;
-    set_group_size(256);
-    kernel_loop_count_ = 16;
-#endif
-    return;
-  }
-
- private:
-
-  //@Brief: number of group
-  uint32_t num_group_;
-
-  //@Brief: number of CUs
-  uint32_t num_cus_;
-
-  //@Brief: number of kernel loop
-  uint32_t kernel_loop_count_;
-
-  //@Brief: Mean execution time
-  double mean_;
-
-  //@Brief: data size for test
-  uint64_t data_size_;
-};
-
-#endif
-
diff --git a/rocrtst/suites/performance/system_store_bandwidth.cc b/rocrtst/suites/performance/system_store_bandwidth.cc
deleted file mode 100755
index d2e1cc5082..0000000000
--- a/rocrtst/suites/performance/system_store_bandwidth.cc
+++ /dev/null
@@ -1,243 +0,0 @@
-/*
- * =============================================================================
- *   ROC Runtime Conformance Release License
- * =============================================================================
- * The University of Illinois/NCSA
- * Open Source License (NCSA)
- *
- * Copyright (c) 2017, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Developed by:
- *
- *                 AMD Research and AMD ROC Software Development
- *
- *                 Advanced Micro Devices, Inc.
- *
- *                 www.amd.com
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal with the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- *  - Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimers.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimers in
- *    the documentation and/or other materials provided with the distribution.
- *  - Neither the names of <Name of Development Group, Name of Institution>,
- *    nor the names of its contributors may be used to endorse or promote
- *    products derived from this Software without specific prior written
- *    permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS WITH THE SOFTWARE.
- *
- */
-
-#include "system_store_bandwidth.h"
-#include "common/base_rocr_utils.h"
-#include "common/common.h"
-#include "common/helper_funcs.h"
-#include "common/hsatimer.h"
-#include "gtest/gtest.h"
-
-static bool verifyGlobalStoreKernel(uint32_t* data, uint32_t num_thrds,
-                                    uint32_t loop_cnt, uint32_t ops_loop,
-                                    const char* kernel_name,
-                                    bool print_debug) {
-
-  // Verify kernel operation i.e. validate the data in the output buffer.
-  for (uint32_t idx1 = 0; idx1 < loop_cnt; idx1++) {
-    for (uint32_t idx2 = 0; idx2 < ops_loop; idx2++) {
-      for (uint32_t idx3 = 0; idx3 < num_thrds; idx3++) {
-        if (data[idx3] != (idx3 << 2)) {
-          std::cout << kernel_name << ": VALIDATION FAILED ! Bad index: "
-                    << idx3 << std::endl;
-          std::cout << kernel_name << ": VALUE @ Bad index: " << data[idx3]
-                    << std::endl;
-          break;
-        }
-      }
-    }
-  }
-
-#ifdef DEBUG
-  std::cout << kernel_name << ": Passed validation" << std::endl;
-  std::cout << std::endl;
-#endif
-
-  return true;
-}
-
-// Constructor
-SystemStoreBandwidth::SystemStoreBandwidth() :
-  BaseRocR() {
-
-  set_group_size(0);
-  num_group_ = 0;
-  num_cus_ = 0;
-
-  kernel_loop_count_ = 0;
-  mean_ = 0.0;
-  data_size_ = 0;
-}
-
-// Destructor
-SystemStoreBandwidth::~SystemStoreBandwidth() {
-}
-
-// Set up the test environment
-void SystemStoreBandwidth::SetUp() {
-
-  set_kernel_file_name("sysMemWrite.o");
-  set_kernel_name("&__SysMemStore");
-
-  if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
-    return;
-  }
-  hsa_agent_t* gpu_dev = gpu_device1();
-
-  SetWorkItemNum();
-
-  //Create a queue with max number size
-  hsa_queue_t* q = nullptr;
-  rocrtst::CreateQueue(*gpu_dev, &q);
-  set_main_queue(q);
-
-  rocrtst::LoadKernelFromObjFile(this);
-
-  uint32_t total_work_items = num_cus_ * num_group_ * group_size();
-
-  //Fill up part of aql
-  rocrtst::InitializeAQLPacket(this, &aql());
-  aql().workgroup_size_x = group_size();
-  aql().grid_size_x = total_work_items;
-
-  return;
-}
-
-// Run the test
-void SystemStoreBandwidth::Run() {
-  hsa_status_t err;
-
-  if (!rocrtst::CheckProfile(this)) {
-    return;
-  }
-
-  uint32_t total_workitems = num_cus_ * num_group_ * group_size();
-  hsa_agent_t* gpu_dev = gpu_device1();
-
-  uint32_t ops_thrd = 16;
-  uint64_t addr_step = (uint64_t) total_workitems * sizeof(uint32_t);
-  uint64_t total_ops = (uint64_t) total_workitems * kernel_loop_count_
-                       * ops_thrd;
-  uint64_t in_data_size = (uint64_t) total_ops * sizeof(uint32_t);
-  err = hsa_amd_agent_iterate_memory_pools(*gpu_dev,
-                                   rocrtst::FindStandardPool, &device_pool());
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  uint32_t* in_data = NULL;
-  err = hsa_amd_memory_pool_allocate(device_pool(), in_data_size, 0,
-                                     (void**) &in_data);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  //memset(in_data, 0, in_data_size);
-  err = hsa_amd_memory_fill(in_data, 0, in_data_size);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  uint32_t out_data_size = total_workitems * sizeof(uint32_t);
-  uint32_t* out_data = NULL;
-  err = hsa_amd_memory_pool_allocate(device_pool(), out_data_size, 0,
-                                     (void**) &out_data);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-  //memset(out_data, 0, out_data_size);
-  err = hsa_amd_memory_fill(out_data, 0, out_data_size);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  data_size_ = in_data_size;
-
-  typedef struct local_args_t {
-    void* arg0;
-    void* arg1;
-    uint64_t arg2;
-    void* arg3;
-  } args;
-
-  // in_data is 32 bit ptr, so adding total_ops
-  args* kern_ptr = NULL;
-  err = hsa_amd_memory_pool_allocate(device_pool(), sizeof(args), 0,
-                                     (void**) &kern_ptr);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-  kern_ptr->arg0 = in_data;
-  kern_ptr->arg1 = in_data + total_ops;
-  kern_ptr->arg2 = addr_step;
-  kern_ptr->arg3 = out_data;
-
-  aql().kernarg_address = kern_ptr;
-
-  std::vector<double> time;
-  void *q_base_addr = main_queue()->base_address; 
-  for (uint32_t i = 0; i < num_iteration(); i++) {
-    // Obtain the current queue write index
-    uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1);
-
-    // Write the aql packet at the calculated queue index address.
-    const uint32_t queue_mask = main_queue()->size - 1;
-    ((hsa_kernel_dispatch_packet_t*)(q_base_addr))[index & queue_mask] = aql(); 
-
-    rocrtst::PerfTimer p_timer;
-    int id = p_timer.CreateTimer();
-    p_timer.StartTimer(id);
-
-    ((hsa_kernel_dispatch_packet_t*)(q_base_addr))[index & queue_mask].header |=
-      HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
-    hsa_signal_store_screlease(main_queue()->doorbell_signal, index);
-
-    // Wait on the dispatch signal until the kernel is finished.
-    while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_LT, 1,
-                                     (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE))
-      ;
-
-    p_timer.StopTimer(id);
-
-    // Verify the results
-    verifyGlobalStoreKernel(in_data, total_workitems, kernel_loop_count_,
-                                     ops_thrd, kernel_name().c_str(), false);
-
-    time.push_back(p_timer.ReadTimer(id));
-
-    hsa_signal_store_screlease(signal(), 1);
-  }
-
-  time.erase(time.begin());
-  mean_ = rocrtst::CalcMean(time);
-
-  return;
-}
-
-void SystemStoreBandwidth::Close() {
-  hsa_status_t err;
-  err = rocrtst::CommonCleanUp(this);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-  return;
-}
-
-void SystemStoreBandwidth::DisplayResults() const {
-
-  if (!rocrtst::CheckProfile(this)) {
-    return;
-  }
-
-  std::cout << "=======================================" << std::endl;
-  std::cout << "System Load Bandwidth:     %f(GB/S)"
-            << data_size_ / mean_ / 1024 / 1024 / 1024 << std::endl;
-}
diff --git a/rocrtst/suites/performance/system_store_bandwidth.h b/rocrtst/suites/performance/system_store_bandwidth.h
deleted file mode 100755
index 7327a0d5f1..0000000000
--- a/rocrtst/suites/performance/system_store_bandwidth.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * =============================================================================
- *   ROC Runtime Conformance Release License
- * =============================================================================
- * The University of Illinois/NCSA
- * Open Source License (NCSA)
- *
- * Copyright (c) 2017, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Developed by:
- *
- *                 AMD Research and AMD ROC Software Development
- *
- *                 Advanced Micro Devices, Inc.
- *
- *                 www.amd.com
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal with the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- *  - Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimers.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimers in
- *    the documentation and/or other materials provided with the distribution.
- *  - Neither the names of <Name of Development Group, Name of Institution>,
- *    nor the names of its contributors may be used to endorse or promote
- *    products derived from this Software without specific prior written
- *    permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS WITH THE SOFTWARE.
- *
- */
-
-#ifndef __ROCRTST_SRC_INC_SYSTEM_STORE_BANDWIDTH_H__
-#define __ROCRTST_SRC_INC_SYSTEM_STORE_BANDWIDTH_H__
-
-#include "perf_common/perf_base.h"
-#include "common/base_rocr.h"
-#include "hsa/hsa.h"
-#include <stdio.h>
-
-class SystemStoreBandwidth: public rocrtst::BaseRocR, public PerfBase {
- public:
-  //@Brief: Constructor
-  SystemStoreBandwidth();
-
-  //@Brief: Destructor
-  ~SystemStoreBandwidth();
-
-  //@Brief: Set up the testing environment
-  virtual void SetUp();
-
-  //@Brief: Run the test case
-  virtual void Run();
-
-  //@Brief: Close and clean up  the test enrionment
-  virtual void Close();
-
-  //@Brief: Display  load bandwidth
-  virtual void DisplayResults() const;
-
-  //@Brief: Set work-item configuration
-  void SetWorkItemNum() {
-#ifdef INTERACTIVE
-    uint32_t tmp;
-
-    printf("Please input the number of CUs you want to try:\n");
-    scanf("%d", &num_cus_);
-
-    printf("Please input the number of groups you want to try:\n");
-    scanf("%d", &num_group_);
-
-    printf("Please input the size of each group:\n");
-    scanf("%d", &tmp);
-    set_group_size(tmp);
-
-    printf("Please input the number of kernel loop you want to try:\n");
-    scanf("%d", &kernel_loop_count_);
-#else
-    num_cus_ = 32;
-    num_group_ = 128;
-    group_size_ = 256;
-    kernel_loop_count_ = 16;
-#endif
-    return;
-  }
-
- private:
-  //@Brief: number of work item in one group
-  uint32_t group_size_;
-
-  //@Brief: number of group
-  uint32_t num_group_;
-
-  //@Brief: number of CUs
-  uint32_t num_cus_;
-
-  //@Brief: number of kernel loop
-  uint32_t kernel_loop_count_;
-
-  //@Brief: Mean execution time
-  double mean_;
-
-  //@Brief: data size for test
-  uint64_t data_size_;
-};
-
-#endif
-
diff --git a/rocrtst/suites/performance/test_case_template.cc b/rocrtst/suites/performance/test_case_template.cc
new file mode 100755
index 0000000000..65f24ae7b0
--- /dev/null
+++ b/rocrtst/suites/performance/test_case_template.cc
@@ -0,0 +1,395 @@
+/*
+ * =============================================================================
+ *   ROC Runtime Conformance Release License
+ * =============================================================================
+ * The University of Illinois/NCSA
+ * Open Source License (NCSA)
+ *
+ * Copyright (c) 2017, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Developed by:
+ *
+ *                 AMD Research and AMD ROC Software Development
+ *
+ *                 Advanced Micro Devices, Inc.
+ *
+ *                 www.amd.com
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal with the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ *  - Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimers.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimers in
+ *    the documentation and/or other materials provided with the distribution.
+ *  - Neither the names of <Name of Development Group, Name of Institution>,
+ *    nor the names of its contributors may be used to endorse or promote
+ *    products derived from this Software without specific prior written
+ *    permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS WITH THE SOFTWARE.
+ *
+ */
+
+// The purpose of this test is to provide an example of the use of the
+// common RocrTest classes and utilities that are used in many examples.
+// It can be used as a template to start off with when writing new tests.
+// In many cases, the existing boilerplate code will be sufficient as is.
+// Otherwise, the boilerplate code can be either supplemented or replaced
+// by your own code in your example, as necessary.
+//
+// The comments provided are focused more on the use of the common rocrtst
+// utilities and boilerplate code, rather than the example app. itself.
+//
+// The boilerplate code includes code for:
+// * hsa initialization and clean up
+// * code to load pre-built kernels
+// * creating queues
+// * populating AQL packets
+// * checking for required profiles
+// * finding cpu and gpu agents (callbacks for common use cases)
+// * finding pools (having common requirements)
+// * allocating and setting kernel arguments
+// * somewhat standardized output
+// * handling additional command line arguments, beyond google-test arguments
+// * support for various levels of verbosity, controlled from command line arg
+// * support for building OpenCL kernels
+// * timer support
+//
+// Overview of RocrTst code organization:
+// Classes:
+// * class BaseRocR (base_rocr.h) -- base class for all rocrtst examples and
+//   tests. Most of the rocrtst common utilities act on BaseRocR objects
+//
+// * TestBase (test_base.h)  -- derives from BaseRocR and is the base class
+//   for all tests under <rocrtst root>/suites. The implementation in TestBase
+//   methods are typically actions that are required for most/all tests and
+//   should therefore be called from the derived implementations of the methods.
+//
+// Utilities:
+// * <rocrtst root>/common/base_rocr_utils.<cc/h> contains a set of utilities
+//   that act on BaseRocR objects.
+//
+// * <rocrtst root>/common/common.<cc/h> contain other non-BaseRocR utilities
+//
+// Special Files:
+// * main.cc -- The main google test file from which the tests are invoked.
+//     There should be an entry for each test to be run there.
+//
+// * kernels -- OpenCL kernel source files should go in the kernels directory
+//
+// * CMakeLists.txt -- Host code (*.cc and *.h files) should build without
+//     modifying the CMakeLists.txt file, if the files are placed in the
+//     "performance" directory. However, OpenCL kernels need an entry. For
+//     each kernel to be built, the bitcode libraries must be indicated before
+//     the call to "build_kernel()" is made. See existing code for examples.
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include "suites/performance/test_case_template.h"
+#include "common/base_rocr_utils.h"
+#include "common/common.h"
+#include "common/helper_funcs.h"
+#include "common/hsatimer.h"
+#include "gtest/gtest.h"
+#include "hsa/hsa.h"
+#include "hsa/hsa_ext_finalize.h"
+
+static const uint32_t kNumBufferElements = 256;
+
+#define RET_IF_HSA_ERR(err) { \
+  if ((err) != HSA_STATUS_SUCCESS) { \
+    const char* msg = 0; \
+    hsa_status_string(err, &msg); \
+    std::cout << "hsa api call failure at line " << __LINE__ << ", file: " << \
+                          __FILE__ << ". Call returned " << err << std::endl; \
+    std::cout << msg << std::endl; \
+    return (err); \
+  } \
+}
+
+// Many test cases want to perform an operation on memory sizes of various
+// granularities.
+#if 0
+static const int kNumGranularity = 20;
+const char* Str[kNumGranularity] = {"1k", "2K", "4K", "8K", "16K", "32K",
+    "64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M",
+                                               "64M", "128M", "256M", "512M"};
+
+const size_t Size[kNumGranularity] = {
+    1024, 2*1024, 4*1024, 8*1024, 16*1024, 32*1024, 64*1024, 128*1024,
+    256*1024, 512*1024, 1024*1024, 2048*1024, 4096*1024, 8*1024*1024,
+    16*1024*1024, 32*1024*1024, 64*1024*1024, 128*1024*1024, 256*1024*1024,
+    512*1024*1024};
+
+static const int kMaxCopySize = Size[kNumGranularity - 1];
+#endif
+TestExample::TestExample(void) :
+    TestBase() {
+  set_num_iteration(10);  // Number of iterations to execute of the main test;
+                          // This is a default value which can be overridden
+                          // on the command line.
+  set_title("Test Case Example");
+  set_description("Put a description of the test case here. Line breaks "
+      "will be taken care of on output, not here.");
+
+  set_kernel_file_name("test_case_template_kernels.hsaco");
+  set_kernel_name("square");  // kernel function name
+
+#if 0
+  // Set required profile to HSA_PROFILE_FULL or HSA_PROFILE_BASE if it
+  // matters for this test. If either profile is fine, then leave with
+  // default
+  set_requires_profile(<value>);
+#endif
+}
+
+TestExample::~TestExample(void) {
+}
+
+// Any 1-time setup involving member variables used in the rest of the test
+// should be done here.
+void TestExample::SetUp(void) {
+  hsa_status_t err;
+
+  // TestBase::SetUp() will set HSA_ENABLE_INTERRUPT if enable_interrupt() is
+  // true, and call hsa_init(). It also prints the SetUp header.
+  TestBase::SetUp();
+
+  // SetDefaultAgents(this) will assign the first CPU and GPU found on
+  // iterating through the agents and assign them to cpu_device_ and
+  // gpu_device1_, respectively (cpu_device() and gpu_device1()). These
+  // BaseRocR member variables are used in some utilities. Additionally,
+  // SetDefaultAgents() checks the profile of the gpu and compares this
+  // to any required profile.
+  //
+  // If SetDefaultAgents() is not used, if the profile of the target GPU
+  // matters for this test, it should be set with set_profile() and
+  // CheckProfileAndInform() should be called to check if it is the
+  // required profile
+  err = rocrtst::SetDefaultAgents(this);
+  ASSERT_EQ(HSA_STATUS_SUCCESS, err);
+
+  hsa_agent_t* gpu_dev = gpu_device1();
+
+  // Find and assign HSA_AMD_SEGMENT_GLOBAL pools for cpu, gpu and a kern_arg
+  // pool
+  err = rocrtst::SetPoolsTypical(this);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  // Create a queue
+  hsa_queue_t* q = nullptr;
+  rocrtst::CreateQueue(*gpu_dev, &q);
+  ASSERT_NE(q, nullptr);
+  set_main_queue(q);
+
+  err = rocrtst::LoadKernelFromObjFile(this);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  // Fill up the kernel packet (except header) with some values we've
+  // collected so far, and some reasonable default values; this should be after
+  // LoadKernelFromObjFile(). AllocAndSetKernArgs() will fill in the kern_args
+  err = rocrtst::InitializeAQLPacket(this, &aql());
+  ASSERT_EQ(HSA_STATUS_SUCCESS, err);
+
+  hsa_agent_t ag_list[2] = {*gpu_device1(), *cpu_device()};
+
+  // Allocate a few buffers for our example
+  err = hsa_amd_memory_pool_allocate(cpu_pool(),
+                                   kNumBufferElements*sizeof(uint32_t),
+                                   0, reinterpret_cast<void**>(&src_buffer_));
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  err = hsa_amd_agents_allow_access(2, ag_list, NULL, src_buffer_);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  // Initialize the source buffer
+  for (uint32_t i = 0; i < kNumBufferElements; ++i) {
+    reinterpret_cast<uint32_t *>(src_buffer_)[i] = i;
+  }
+
+  err = hsa_amd_memory_pool_allocate(cpu_pool(),
+                                   kNumBufferElements*sizeof(uint32_t),
+                                   0, reinterpret_cast<void**>(&dst_buffer_));
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  err = hsa_amd_agents_allow_access(2, ag_list, NULL, dst_buffer_);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  // Set up Kernel arguments
+  // See the meta-data for the compiled OpenCL kernel code to ascertain
+  // the sizes, padding and alignment required for kernel arguments.
+  // This can be seen by executing
+  // $ amdgcn-amd-amdhsa-readelf -aw ./test_case_template_kernels.hsaco
+  // The kernel code will expect the following arguments aligned as shown.
+//  typedef uint32_t uint4[4];
+  struct __attribute__((aligned(16))) local_args_t {
+    uint32_t* dstArray;
+    uint32_t* srcArray;
+    uint32_t size;
+    uint32_t pad;
+    uint64_t global_offset_x;
+    uint64_t global_offset_y;
+    uint64_t global_offset_z;
+  } local_args;
+
+  local_args.dstArray = reinterpret_cast<uint32_t *>(dst_buffer_);
+  local_args.srcArray = reinterpret_cast<uint32_t *>(src_buffer_);
+  local_args.size = kNumBufferElements;
+  local_args.global_offset_x = 0;
+  local_args.global_offset_y = 0;
+  local_args.global_offset_z = 0;
+
+  err = rocrtst::AllocAndSetKernArgs(this, &local_args, sizeof(local_args));
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  return;
+}
+
+// Combines header (low 16 bits) and setup (high 16 bits) into one 32-bit
+// word and writes it to the start of the given AQL packet with a single
+// release-ordered atomic store. The packet should live in queue memory.
+static inline void AtomicSetPacketHeader(uint16_t header, uint16_t setup,
+                                  hsa_kernel_dispatch_packet_t* queue_packet) {
+  uint32_t* const first_word = reinterpret_cast<uint32_t*>(queue_packet);
+  __atomic_store_n(first_word, header | (setup << 16), __ATOMIC_RELEASE);
+}
+
+// Pad the requested iteration count by 20% plus one, since Run() throws
+// away some of the collected samples before calculating statistics
+uint32_t TestExample::RealIterationNum(void) {
+  return static_cast<uint32_t>(num_iteration() * 1.2 + 1);
+}
+
+// Returns true iff ar[i] == i*i for every i in [0, sz); an empty range passes.
+// The scan must start at index 0 -- starting at sz would check nothing.
+static bool VerifyResult(uint32_t *ar, size_t sz) {
+  for (size_t i = 0; i < sz; ++i) {
+    if (i*i != ar[i]) return false;
+  }
+  return true;
+}
+void TestExample::Run(void) {
+  // Compare required profile for this test case with what we're actually
+  // running on
+  if (!rocrtst::CheckProfile(this)) {
+    return;
+  }
+
+  TestBase::Run();
+
+  // Override whatever we need to...
+  aql().workgroup_size_x = kNumBufferElements;
+  aql().grid_size_x = kNumBufferElements;
+
+  std::vector<double> timer;
+
+  int it = RealIterationNum();
+  hsa_kernel_dispatch_packet_t *queue_aql_packet;
+
+  rocrtst::PerfTimer p_timer;
+  uint64_t index;
+
+  for (int i = 0; i < it; i++) {
+    // This function simply copies the data we've collected so far into our
+    // local AQL packet, except the setup and header fields.
+    queue_aql_packet = WriteAQLToQueue(this, &index);
+    ASSERT_EQ(queue_aql_packet,
+              reinterpret_cast<hsa_kernel_dispatch_packet_t *>
+                                      (main_queue()->base_address) + index);
+    uint32_t aql_header = HSA_PACKET_TYPE_KERNEL_DISPATCH;
+
+    aql_header |= HSA_FENCE_SCOPE_SYSTEM <<
+                  HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
+    aql_header |= HSA_FENCE_SCOPE_SYSTEM <<
+                  HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
+
+    // Create and start a timer for this iteration
+    int id = p_timer.CreateTimer();
+    p_timer.StartTimer(id);
+
+    AtomicSetPacketHeader(aql_header, aql().setup, queue_aql_packet);
+
+    hsa_signal_store_screlease(main_queue()->doorbell_signal, index);
+
+    // Wait on the dispatch signal until the kernel is finished.
+    while (hsa_signal_wait_scacquire(aql().completion_signal,
+         HSA_SIGNAL_CONDITION_LT, 1, (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE)) {
+    }
+
+    // Stop the timer
+    p_timer.StopTimer(id);
+
+    // Store time for later analysis
+    timer.push_back(p_timer.ReadTimer(id));
+    hsa_signal_store_screlease(aql().completion_signal, 1);
+
+    ASSERT_TRUE(VerifyResult(reinterpret_cast<uint32_t *>(dst_buffer_),
+                                                         kNumBufferElements));
+
+    // Pay attention to verbosity level for things like progress output
+    if (verbosity() >= VERBOSE_PROGRESS) {
+      std::cout << ".";
+      fflush(stdout);
+    }
+  }
+
+  if (verbosity() >= VERBOSE_PROGRESS) {
+    std::cout << std::endl;
+  }
+
+  // Abandon the first result, sort, and keep the num_iteration() smallest
+  timer.erase(timer.begin());
+  std::sort(timer.begin(), timer.end());
+  timer.erase(timer.begin() + num_iteration(), timer.end());
+
+  time_mean_ = rocrtst::CalcMean(timer);
+}
+
+void TestExample::DisplayTestInfo(void) {
+  TestBase::DisplayTestInfo();
+}
+
+void TestExample::DisplayResults(void) const {
+  // Print nothing when CheckProfile() rejects the profile we are actually
+  // running on
+  if (!rocrtst::CheckProfile(this)) {
+    return;
+  }
+
+  TestBase::DisplayResults();
+  const double mean_us = time_mean_ * 1e6;  // scale by 1e6 for the uS report
+  std::cout << "The average time was: " << mean_us << " uS" << std::endl;
+  return;
+}
+
+void TestExample::Close() {
+  hsa_status_t err;
+
+  err = hsa_amd_memory_pool_free(src_buffer_);
+  ASSERT_EQ(HSA_STATUS_SUCCESS, err);
+
+  err = hsa_amd_memory_pool_free(dst_buffer_);
+  ASSERT_EQ(HSA_STATUS_SUCCESS, err);
+
+  // This will close handles opened within rocrtst utility calls and call
+  // hsa_shut_down(), so it should be done after other hsa cleanup
+  TestBase::Close();
+}
+
+
+#undef RET_IF_HSA_ERR
diff --git a/rocrtst/suites/performance/image_load_bandwidth.h b/rocrtst/suites/performance/test_case_template.h
similarity index 76%
rename from rocrtst/suites/performance/image_load_bandwidth.h
rename to rocrtst/suites/performance/test_case_template.h
index 9239853064..e20ed27d6b 100755
--- a/rocrtst/suites/performance/image_load_bandwidth.h
+++ b/rocrtst/suites/performance/test_case_template.h
@@ -43,40 +43,41 @@
  *
  */
 
-#ifndef __ROCRTST_SRC_INC_IMAGE_LOAD_BANDWIDTH_H__
-#define __ROCRTST_SRC_INC_IMAGE_LOAD_BANDWIDTH_H__
+#ifndef ROCRTST_SUITES_PERFORMANCE_TEST_CASE_TEMPLATE_H_
+#define ROCRTST_SUITES_PERFORMANCE_TEST_CASE_TEMPLATE_H_
 
 #include "common/base_rocr.h"
 #include "hsa/hsa.h"
-#include "perf_common/perf_base.h"
+#include "suites/test_common/test_base.h"
 
-class ImageLoadBandwidth: public rocrtst::BaseRocR, public PerfBase {
+class TestExample : public TestBase {
  public:
-  //@Brief: Constructor
-  ImageLoadBandwidth();
+  TestExample();
 
-  //@Brief: Destructor
-  ~ImageLoadBandwidth();
+  // @Brief: Destructor for test case of TestExample
+  virtual ~TestExample();
 
-  //@Brief: Set up the test environment
+  // @Brief: Setup the environment for measurement
   virtual void SetUp();
 
-  //@Brief: Run the actual testing
+  // @Brief: Core measurement execution
   virtual void Run();
 
-  //@Brief: Clean up the test environment
+  // @Brief: Clean up and release the resources
   virtual void Close();
 
-  //@Brief: Display  results
+  // @Brief: Display results
   virtual void DisplayResults() const;
 
- private:
-  //@Brief: Image Load Bandwidth
-  double load_bandwidth_;
+  // @Brief: Display information about what this test does
+  virtual void DisplayTestInfo(void);
 
-  //@Brief: Image size
-  size_t image_size_;
+ private:
+  uint32_t RealIterationNum(void);
+
+  double time_mean_;
+  void *src_buffer_;
+  void *dst_buffer_;
 };
 
-#endif //__ROCRTST_SRC_INC_IMAGE_LOAD_BANDWIDTH_H__
-
+#endif  // ROCRTST_SUITES_PERFORMANCE_TEST_CASE_TEMPLATE_H_
diff --git a/rocrtst/suites/performance/vector_copy.cc b/rocrtst/suites/performance/vector_copy.cc
deleted file mode 100644
index f772a48351..0000000000
--- a/rocrtst/suites/performance/vector_copy.cc
+++ /dev/null
@@ -1,279 +0,0 @@
-/*
- * =============================================================================
- *   ROC Runtime Conformance Release License
- * =============================================================================
- * The University of Illinois/NCSA
- * Open Source License (NCSA)
- *
- * Copyright (c) 2017, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Developed by:
- *
- *                 AMD Research and AMD ROC Software Development
- *
- *                 Advanced Micro Devices, Inc.
- *
- *                 www.amd.com
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal with the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- *  - Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimers.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimers in
- *    the documentation and/or other materials provided with the distribution.
- *  - Neither the names of <Name of Development Group, Name of Institution>,
- *    nor the names of its contributors may be used to endorse or promote
- *    products derived from this Software without specific prior written
- *    permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS WITH THE SOFTWARE.
- *
- */
-
-#include "vector_copy.h"
-#include "common/base_rocr_utils.h"
-#include "gtest/gtest.h"
-
-// Copy vector buffer size.
-static const size_t BUFFER_SIZE = 1024 * 1024 * 4;
-static char* gCPUOutput = nullptr;
-static uint64_t gQueueIndex = 0;
-
-//Constructor
-VectorCopy::VectorCopy() :
-  BaseRocR() {
-  set_kernel_name("&__vector_copy_kernel");
-  kernarg_address = NULL;
-}
-
-//Destructor
-VectorCopy::~VectorCopy() {
-}
-
-// Find coarse grained system memory.
-static hsa_status_t get_sys_coarse_grained_memory_pool(
-  hsa_amd_memory_pool_t pool, void* data) {
-  hsa_amd_segment_t segment;
-  hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT,
-                               &segment);
-
-  if (HSA_AMD_SEGMENT_GLOBAL != segment) {
-    return HSA_STATUS_SUCCESS;
-  }
-
-  hsa_amd_memory_pool_global_flag_t flags;
-  hsa_status_t err = hsa_amd_memory_pool_get_info(pool,
-                     HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags);
-
-  if (HSA_STATUS_SUCCESS == err
-      && (flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED)) {
-    hsa_amd_memory_pool_t* ret = (hsa_amd_memory_pool_t*) data;
-    *ret = pool;
-    return HSA_STATUS_INFO_BREAK;
-  }
-
-  return err;
-}
-
-// Find out dGPU's local memory pool.
-static hsa_status_t get_local_memory_pool(hsa_amd_memory_pool_t pool,
-    void* data) {
-  // With memory pool API, each agent will only report it is own memory pools.
-  // So, a coarse grained memory pool in global segment is what we want.
-  hsa_amd_segment_t segment;
-
-  hsa_status_t err = hsa_amd_memory_pool_get_info(pool,
-                     HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment);
-
-  if (HSA_STATUS_SUCCESS != err) {
-    return err;
-  }
-
-  if (HSA_AMD_SEGMENT_GLOBAL != segment) {
-    return HSA_STATUS_SUCCESS;
-  }
-
-  hsa_amd_memory_pool_global_flag_t flags;
-  err = hsa_amd_memory_pool_get_info(pool,
-                          HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags);
-
-  if (HSA_STATUS_SUCCESS == err
-      && (flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED)) {
-    hsa_amd_memory_pool_t* ret = (hsa_amd_memory_pool_t*) data;
-    *ret = pool;
-    return HSA_STATUS_INFO_BREAK;
-  }
-
-  return err;
-}
-
-void VectorCopy::SetUp() {
-  hsa_status_t err;
-  hsa_agent_t* gpu_dev = gpu_device1();
-
-  if (HSA_STATUS_SUCCESS != rocrtst::InitAndSetupHSA(this)) {
-    return;
-  }
-
-  //Create a queue with max number size
-  hsa_queue_t* q;
-  rocrtst::CreateQueue(*gpu_dev, &q);
-  set_main_queue(q);
-
-  rocrtst::LoadKernelFromObjFile(this);
-
-  // Obtain the current queue write index.
-  gQueueIndex = hsa_queue_load_write_index_scacquire(main_queue());
-
-  rocrtst::InitializeAQLPacket(this, &aql());
-  uint16_t header = 0;
-  header |= HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
-
-  aql().grid_size_x = (uint32_t)(1024 * 1024);
-  aql().kernarg_address = (void*) kernarg_address;
-
-  // Find system memory pool for kernarg allocation.
-  // hsa_amd_memory_pool_t sys_coarse_grained_pool;
-  err = hsa_amd_agent_iterate_memory_pools(cpus[0],
-        get_sys_coarse_grained_memory_pool, &sys_coarse_grained_pool_);
-  ASSERT_EQ(err, HSA_STATUS_INFO_BREAK);
-
-  // Get local memory pool of the first GPU.
-  // hsa_amd_memory_pool_t gpu_pool_;
-  err = hsa_amd_agent_iterate_memory_pools(gpus[0], get_local_memory_pool,
-        &gpu_pool_);
-  ASSERT_EQ(err, HSA_STATUS_INFO_BREAK);
-
-  return;
-}
-
-void VectorCopy::Run() {
-  hsa_status_t err;
-  void* in;
-  void* out;
-
-  if (!rocrtst::CheckProfile(this)) {
-    return;
-  }
-
-  // Allocate vector on the first GPU local memory as input.
-  err = hsa_amd_memory_pool_allocate(gpu_pool_, BUFFER_SIZE, 0, &in);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-  std::cout << "Allocating " << BUFFER_SIZE <<
-            " Bytes of local memory on the first GPU, address = " <<
-                                                              in << std::endl;
-
-  // rocrtst::CommonCleanUp input buffer on the first GPU to 1 for each byte.
-  err = hsa_amd_memory_fill(in, 0x01010101, BUFFER_SIZE / 4);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  // Allocate vector on the first GPU local memory as output
-  err = hsa_amd_memory_pool_allocate(gpu_pool_, BUFFER_SIZE, 0, &out);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-  std::cout << "Allocating " << BUFFER_SIZE <<
-            " Bytes of local memory on the second GPU, address = " <<
-                                                             out << std::endl;
-
-  // rocrtst::CommonCleanUp output buffer on the first GPU to 0.
-  err = hsa_amd_memory_fill(out, 0x00000000, BUFFER_SIZE / 4);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  typedef struct args_t {
-    void* in;
-    void* out;
-  } args;
-
-  args* kargs;
-
-  kargs->in = in;
-  kargs->out = out;
-
-  // Allocate the kernel argument buffer from the system memory pool.
-  err = hsa_amd_memory_pool_allocate(sys_coarse_grained_pool_, kernarg_size(),
-                                     0, &kernarg_address);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-  memcpy(kernarg_address, &kargs, sizeof(args));
-
-  // Map kernarg space to the first GPU
-  err = hsa_amd_agents_allow_access(1, &gpus[0], NULL, kernarg_address);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  /*
-   * Increment the write index and ring the doorbell to dispatch the kernel.
-   */
-  hsa_queue_store_write_index_screlease(main_queue(), gQueueIndex + 1);
-  hsa_signal_store_relaxed(main_queue()->doorbell_signal, gQueueIndex);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  // Wait on the dispatch completion signal until the kernel is finished.
-  while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_EQ, 0,
-                                   UINT64_MAX, HSA_WAIT_STATE_BLOCKED))
-    ;
-
-  // Reset signal value for future usage to copy output.
-  hsa_signal_store_screlease(signal(), 1);
-
-  // Allocate vector on the system memory pool.
-  err = hsa_amd_memory_pool_allocate(sys_coarse_grained_pool_, BUFFER_SIZE, 0,
-                                     (void**) &gCPUOutput);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  // Allow the first GPU to access the output
-  err = hsa_amd_agents_allow_access(1, &gpus[0], NULL, gCPUOutput);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  //Copy the output from GPU to the CPU buffer for validation
-  err = hsa_amd_memory_async_copy(gCPUOutput, cpus[0], out, gpus[0],
-                                  BUFFER_SIZE, 0, NULL, signal());
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  // Wait on the completion signal until the async copy is finished.
-  while (hsa_signal_wait_scacquire(signal(), HSA_SIGNAL_CONDITION_EQ, 0,
-                                   UINT64_MAX, HSA_WAIT_STATE_BLOCKED))
-    ;
-
-  for (uint32_t i = 0; i < BUFFER_SIZE; i++) {
-    ASSERT_EQ(gCPUOutput[i], 1);
-  }
-
-  return;
-}
-
-void VectorCopy::Close() {
-  hsa_status_t err;
-  // Cleanup all allocated resources.
-  err = hsa_amd_memory_pool_free(kernarg_address);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  err = hsa_signal_destroy(signal());
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  err = hsa_queue_destroy(main_queue());
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  err = hsa_amd_memory_pool_free(gCPUOutput);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-
-  err = rocrtst::CommonCleanUp(this);
-  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
-  return;
-}
-
-void VectorCopy::DisplayResults() const {
-  if (!rocrtst::CheckProfile(this)) {
-    return;
-  }
-}
diff --git a/rocrtst/suites/performance/vector_copy.h b/rocrtst/suites/performance/vector_copy.h
deleted file mode 100755
index 5946b04023..0000000000
--- a/rocrtst/suites/performance/vector_copy.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * =============================================================================
- *   ROC Runtime Conformance Release License
- * =============================================================================
- * The University of Illinois/NCSA
- * Open Source License (NCSA)
- *
- * Copyright (c) 2017, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Developed by:
- *
- *                 AMD Research and AMD ROC Software Development
- *
- *                 Advanced Micro Devices, Inc.
- *
- *                 www.amd.com
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal with the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- *  - Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimers.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimers in
- *    the documentation and/or other materials provided with the distribution.
- *  - Neither the names of <Name of Development Group, Name of Institution>,
- *    nor the names of its contributors may be used to endorse or promote
- *    products derived from this Software without specific prior written
- *    permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS WITH THE SOFTWARE.
- *
- */
-
-#ifndef __ROCRTST_SRC_VECTOR_COPY_H__
-#define __ROCRTST_SRC_VECTOR_COPY_H__
-
-#include "perf_common/perf_base.h"
-#include "common/base_rocr.h"
-#include "common/common.h"
-#include "common/hsatimer.h"
-#include "hsa/hsa.h"
-#include "hsa/hsa_ext_amd.h"
-#include "hsa/hsa_ext_finalize.h"
-#include <algorithm>
-#include <vector>
-
-//@Brief: This class is defined to measure the mean latency of launching
-//an empty kernel
-
-class VectorCopy: public rocrtst::BaseRocR, public PerfBase {
- public:
-  //@Brief: Constructor
-  VectorCopy();
-
-  //@Brief: Destructor
-  virtual ~VectorCopy();
-
-  //@Brief: Set up the environment for the test
-  virtual void SetUp();
-
-  //@Brief: Run the test case
-  virtual void Run();
-
-  virtual void DisplayResults() const;
-
-  //@Brief: Clean up and close the runtime
-  virtual void Close();
-
- private:
-
-  //@Brief: Store the size of queue
-  uint32_t queue_size_;
-
-  //@Brief: kernarg_address;
-  void* kernarg_address;
-
-  //@Brief: The mean time of CP Processing
-  double mean_;
-
-  //@Brief: The group memory region
-  hsa_region_t group_region_;
-
-  hsa_amd_memory_pool_t gpu_pool_;
-  hsa_amd_memory_pool_t sys_coarse_grained_pool_;
-
-  std::vector<hsa_agent_t> cpus;
-  std::vector<hsa_agent_t> gpus;
-
-  //@Brief: Pointer to cu_id array
-  uint32_t* cu_;
-
-  uint32_t manual_input;
-  uint32_t group_input;
-};
-
-#endif
-
diff --git a/rocrtst/suites/performance/vector_copy_peer_to_peer.h b/rocrtst/suites/performance/vector_copy_peer_to_peer.h
deleted file mode 100755
index 0f05674cb5..0000000000
--- a/rocrtst/suites/performance/vector_copy_peer_to_peer.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * =============================================================================
- *   ROC Runtime Conformance Release License
- * =============================================================================
- * The University of Illinois/NCSA
- * Open Source License (NCSA)
- *
- * Copyright (c) 2017, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Developed by:
- *
- *                 AMD Research and AMD ROC Software Development
- *
- *                 Advanced Micro Devices, Inc.
- *
- *                 www.amd.com
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal with the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- *  - Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimers.
- *  - Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimers in
- *    the documentation and/or other materials provided with the distribution.
- *  - Neither the names of <Name of Development Group, Name of Institution>,
- *    nor the names of its contributors may be used to endorse or promote
- *    products derived from this Software without specific prior written
- *    permission.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS WITH THE SOFTWARE.
- *
- */
-
-#ifndef __ROCRTST_SRC_VECTOR_COPY_P2P_H__
-#define __ROCRTST_SRC_VECTOR_COPY_P2P_H__
-
-#include "perf_common/perf_base.h"
-#include "common/base_rocr.h"
-#include "common/common.h"
-#include "common/hsatimer.h"
-#include "hsa/hsa.h"
-#include "hsa/hsa_ext_amd.h"
-#include "hsa/hsa_ext_finalize.h"
-#include <algorithm>
-#include <vector>
-
-//@Brief: This class is defined to measure the mean latency of launching
-//an empty kernel
-
-class VectorCopyP2P: public rocrtst::BaseRocR, public PerfBase {
- public:
-  //@Brief: Constructor
-  VectorCopyP2P();
-
-  //@Brief: Destructor
-  virtual ~VectorCopyP2P();
-
-  //@Brief: Set up the environment for the test
-  virtual void SetUp();
-
-  //@Brief: Run the test case
-  virtual void Run();
-
-  //@Brief: Display  results we got
-  virtual void DisplayResults() const;
-
-  //@Brief: Clean up and close the runtime
-  virtual void Close();
-
- private:
-  //@Brief: Get actual iteration number
-  virtual size_t RealIterationNum();
-
-  //@Brief: Create Queue
-  virtual void CreateQueue();
-
-  //@Brief: Store the size of queue
-  uint32_t queue_size_;
-
-  //@Brief: The mean time of CP Processing
-  double mean_;
-
-  //@Brief: The group memory region
-  hsa_region_t group_region_;
-
-  //@Brief: Pointer to cu_id array
-  uint32_t* cu_;
-
-  uint32_t manual_input;
-  uint32_t group_input;
-};
-
-#endif
-
diff --git a/rocrtst/suites/test_common/test_base.cc b/rocrtst/suites/test_common/test_base.cc
new file mode 100755
index 0000000000..d7fa7883bf
--- /dev/null
+++ b/rocrtst/suites/test_common/test_base.cc
@@ -0,0 +1,141 @@
+/*
+ * =============================================================================
+ *   ROC Runtime Conformance Release License
+ * =============================================================================
+ * The University of Illinois/NCSA
+ * Open Source License (NCSA)
+ *
+ * Copyright (c) 2017, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Developed by:
+ *
+ *                 AMD Research and AMD ROC Software Development
+ *
+ *                 Advanced Micro Devices, Inc.
+ *
+ *                 www.amd.com
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal with the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ *  - Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimers.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimers in
+ *    the documentation and/or other materials provided with the distribution.
+ *  - Neither the names of <Name of Development Group, Name of Institution>,
+ *    nor the names of its contributors may be used to endorse or promote
+ *    products derived from this Software without specific prior written
+ *    permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS WITH THE SOFTWARE.
+ *
+ */
+
+#include <assert.h>
+
+#include "suites/test_common/test_base.h"
+#include "common/base_rocr_utils.h"
+#include "gtest/gtest.h"
+
+// Output layout: line width for wrapped descriptions, and banner label text.
+static const int kOutputLineLength = 80;
+static const char kLabelDelimiter[] = "####";
+static const char kDescriptionLabel[] = "TEST DESCRIPTION";
+static const char kTitleLabel[] = "TEST NAME";
+static const char kSetupLabel[] = "TEST SETUP";
+static const char kRunLabel[] = "TEST EXECUTION";
+static const char kCloseLabel[] = "TEST CLEAN UP";
+static const char kResultsLabel[] = "TEST RESULTS";
+
+// A new test starts with an empty description; derived tests are expected
+// to supply the real text through set_description().
+TestBase::TestBase() { set_description(""); }
+
+TestBase::~TestBase() {}
+
+// Builds the banner "#### <inStr> ####" in *outStr, replacing any previous
+// contents. Both pointers must be non-null; this is checked only by assert.
+static void MakeHeaderStr(const char *inStr, std::string *outStr) {
+  assert(outStr != nullptr);
+  assert(inStr != nullptr);
+
+  std::string &banner = *outStr;
+  banner.assign(kLabelDelimiter);
+  banner.append(" ").append(inStr);
+  banner.append(" ").append(kLabelDelimiter);
+}
+
+// Prints the setup banner, then calls rocrtst::InitAndSetupHSA() for this
+// test. Any status other than HSA_STATUS_SUCCESS is a fatal gtest failure.
+void TestBase::SetUp(void) {
+  std::string banner;
+  MakeHeaderStr(kSetupLabel, &banner);
+  printf("\n\t%s\n", banner.c_str());
+
+  // ASSERT_EQ leaves this function immediately when the check fails.
+  const hsa_status_t status = rocrtst::InitAndSetupHSA(this);
+  ASSERT_EQ(HSA_STATUS_SUCCESS, status);
+}
+
+void TestBase::Run(void) {  // The base class only prints the run banner.
+  std::string banner;
+  MakeHeaderStr(kRunLabel, &banner);
+  printf("\n\t%s\n", banner.c_str());
+}
+
+// Prints the clean-up banner and requires rocrtst::CommonCleanUp() to pass.
+void TestBase::Close(void) {
+  std::string banner;
+  MakeHeaderStr(kCloseLabel, &banner);
+  printf("\n\t%s\n", banner.c_str());
+
+  const hsa_status_t status = rocrtst::CommonCleanUp(this);
+  ASSERT_EQ(HSA_STATUS_SUCCESS, status);  // expected value first, as in SetUp
+}
+
+
+void TestBase::DisplayResults(void) const {  // Prints only the results banner.
+  std::string banner;
+  MakeHeaderStr(kResultsLabel, &banner);
+  printf("\n\t%s\n", banner.c_str());
+}
+
+// Prints a separator line and the test title; the description follows only
+// when verbosity is VERBOSE_STANDARD or higher.
+void TestBase::DisplayTestInfo(void) {
+  printf("#########################################"
+                                  "######################################\n");
+  std::string banner;
+  MakeHeaderStr(kTitleLabel, &banner);
+  printf("\n\t%s\n%s\n", banner.c_str(), title().c_str());
+
+  if (verbosity() < VERBOSE_STANDARD) return;
+  MakeHeaderStr(kDescriptionLabel, &banner);
+  printf("\n\t%s\n%s\n", banner.c_str(), description().c_str());
+}
+
+// Stores |d|, replacing spaces with newlines so lines fit the output width.
+// A word too long for one line is kept whole; the break goes after it.
+void TestBase::set_description(std::string d) {
+  const size_t le = kOutputLineLength - 4, npos = std::string::npos;
+  description_ = d;
+  for (size_t brk, i = le; i < description_.size(); i = brk + le) {
+    brk = description_.find_last_of(" ", i);
+    // A hit before i - le is in an already wrapped line; look ahead instead.
+    if (brk == npos || brk < i - le) brk = description_.find(' ', i);
+    if (brk == npos) break;
+    description_[brk] = '\n';
+  }
+}
diff --git a/rocrtst/suites/performance/queue_concurrency.h b/rocrtst/suites/test_common/test_base.h
similarity index 70%
rename from rocrtst/suites/performance/queue_concurrency.h
rename to rocrtst/suites/test_common/test_base.h
index 326514bfe8..9141fbf66a 100755
--- a/rocrtst/suites/performance/queue_concurrency.h
+++ b/rocrtst/suites/test_common/test_base.h
@@ -42,52 +42,43 @@
  * DEALINGS WITH THE SOFTWARE.
  *
  */
+#ifndef ROCRTST_SUITES_TEST_COMMON_TEST_BASE_H_
+#define ROCRTST_SUITES_TEST_COMMON_TEST_BASE_H_
 
-#ifndef __ROCRTST_SRC_INC_QUEUE_CONCURRENCY_H__
-#define __ROCRTST_SRC_INC_QUEUE_CONCURRENCY_H__
-
-#include "perf_common/perf_base.h"
+#include <string>
 #include "common/base_rocr.h"
-#include "hsa/hsa.h"
-#include <vector>
 
-class QueueConcurrency: public rocrtst::BaseRocR, public PerfBase {
+class TestBase : public rocrtst::BaseRocR {
  public:
-  //@Brief: Constructor
-  QueueConcurrency();
 
-  //@Brief: Destructor
-  ~QueueConcurrency();
+  TestBase(void);
 
-  //@Brief: Set up the test environmnet
-  void SetUp();
+  virtual ~TestBase(void);
 
-  //@Brief: Run the test
-  void Run();
+  enum VerboseLevel {VERBOSE_MIN = 0, VERBOSE_STANDARD, VERBOSE_PROGRESS};
 
-  //@Brief: Clean up and close
-  void Close();
+  // @Brief: Set up before the core measurement code runs,
+  // e.g. initialize the runtime, prepare packets...
+  virtual void SetUp(void);
 
-  void DisplayResults() const;
+  // @Brief: Execute the core measurement code
+  virtual void Run(void);
+
+  // @Brief: Clean up after the test
+  virtual void Close(void);
+
+  // @Brief: Display the results
+  virtual void DisplayResults(void) const;
+
+  // @Brief: Display information about the test
+  virtual void DisplayTestInfo(void);
+
+  const std::string & description(void) const {return description_;}
+
+  void set_description(std::string d);
 
  private:
-
-  //@Brief: Thread function
-  void ThreadFunc(int i);
-
-  //@Brief: Calculate the concurrent queue number
-  void CalculateQueueNum();
-
-  //@Brief: Vector to store execution time
-  std::vector<double> execution_time_;
-
-  //@Brief: Number of concurrent queues
-  size_t queue_num_;
-
-  //@Brief: Store the standard execution time
-  double std_time_;
-
+  std::string description_;
 };
 
-#endif //__ROCRTST_SRC_INC_QUEUE_CONCURRENCY_H__
-
+#endif  // ROCRTST_SUITES_TEST_COMMON_TEST_BASE_H_
diff --git a/rocrtst/suites/performance/device_load_bandwidth.h b/rocrtst/suites/test_common/test_common.cc
similarity index 55%
rename from rocrtst/suites/performance/device_load_bandwidth.h
rename to rocrtst/suites/test_common/test_common.cc
index 9cf98ca4be..8ae52d026b 100755
--- a/rocrtst/suites/performance/device_load_bandwidth.h
+++ b/rocrtst/suites/test_common/test_common.cc
@@ -43,77 +43,79 @@
  *
  */
 
-#ifndef __ROCRTST_SRC_INC_DEVICE_LOAD_BANDWIDTH_H__
-#define __ROCRTST_SRC_INC_DEVICE_LOAD_BANDWIDTH_H__
+#include <assert.h>
+#include <stdint.h>
+#include <iostream>
+#include <getopt.h>
 
-#include "perf_common/perf_base.h"
-#include "common/base_rocr.h"
-#include "hsa/hsa.h"
-#include <stdio.h>
+#include "suites/test_common/test_common.h"
 
-class DeviceLoadBandwidth: public rocrtst::BaseRocR, public PerfBase {
- public:
-  //@Brief: Constructor
-  DeviceLoadBandwidth();
+// Remembers where ProcessCmdline() should store the parsed option values.
+RocrtstOptions::RocrtstOptions(uint32_t *verb, uint32_t *iter)
+    : verbosity_(verb), iterations_(iter) {
+  assert(verb != nullptr);
+  assert(iter != nullptr);
+}
 
-  //@Brief: Destructor
-  ~DeviceLoadBandwidth();
 
-  //@Brief: Set up the testing environment
-  virtual void SetUp();
+// Nothing to release: the option pointers are never freed by this class.
+RocrtstOptions::~RocrtstOptions() {}
 
-  //@Brief: Run the test case
-  virtual void Run();
+static const struct option long_options[] = {  // keep in sync with PrintHelp()
+  {"iterations", required_argument, nullptr, 'i'},
+  {"verbosity", required_argument, nullptr, 'v'},  // -v/--verbosity <level>
 
-  //@Brief: Close and clean up  the test enrionment
-  virtual void Close();
-
-  //@Brief: Display  load bandwidth
-  virtual void DisplayResults() const;
-
-  //@Brief: Set work-item configuration
-  void SetWorkItemNum() {
-#ifdef INTERACTIVE
-    uint32_t tmp;
-    printf("Please input the number of CUs you want to try:\n");
-    scanf("%d", &num_cus_);
-
-    printf("Please input the number of groups you want to try:\n");
-    scanf("%d", &num_group_);
-
-    printf("Please input the size of each group:\n");
-    scanf("%d", &tmp);
-    set_group_size(tmp);
-
-    printf("Please input the number of kernel loop you want to try:\n");
-    scanf("%d", &kernel_loop_count_);
-#else
-    num_cus_ = 16;
-    num_group_ = 128;
-    set_group_size(64);
-    kernel_loop_count_ = 16;
-#endif
-    return;
-  }
-
- private:
-  //@Brief: number of group
-  uint32_t num_group_;
-
-  //@Brief: number of CUs
-  uint32_t num_cus_;
-
-  //@Brief: number of kernel loop
-  uint32_t kernel_loop_count_;
-
-  //@Brief: Mean execution time
-  double mean_;
-
-  //@Brief: data size for test
-  uint64_t data_size_;
-  uint32_t* in_data_;
-  uint32_t* out_data_;
+  {nullptr, 0, nullptr, 0}
 };
+static const char* short_options = "i:v:r";
 
-#endif
+// Prints the usage text for the RocRTst-specific command line options.
+static void PrintHelp(void) {
+  static const char kUsage[] =
+     "Optional RocRTst Arguments:\n"
+     "--iterations, -i <number of iterations to execute>; override default, "
+         "which varies for each test\n"
+     "--rocrtst_help, -r print this help message\n"
+     "--verbosity, -v <verbosity level>\n"
+     "  Verbosity levels:\n"
+     "   0    -- minimal; just summary information\n"
+     "   1    -- intermediate; show intermediate values such as intermediate "
+                  "perf. data\n"
+     "   2    -- progress; show progress displays\n"
+     "   >= 3 -- more debug output\n";
+  std::cout << kUsage;
+}
 
+// Parses the RocRTst options in arg_list, storing values through |test|.
+// Returns 0 on success. Prints the help text and returns 1 when help is
+// requested, an option is unknown, or a value is missing or not a number >= 0.
+uint32_t ProcessCmdline(RocrtstOptions* test, int arg_cnt, char** arg_list) {
+  assert(test != nullptr);
+  int opt;
+  while ((opt = getopt_long(arg_cnt, arg_list, short_options, long_options,
+                                                            nullptr)) != -1) {
+    // Only -i and -v store a value. For 'r' (help) and for anything that
+    // getopt_long() rejected, |target| stays null and help is printed below.
+    uint32_t *target = nullptr;
+    if (opt == 'i') {
+      target = test->iterations_;
+    } else if (opt == 'v') {
+      target = test->verbosity_;
+    }
+    int value = -1;
+    if (target != nullptr && optarg != nullptr) {
+      try {
+        value = std::stoi(optarg);
+      } catch (const std::exception &) {
+        // Not a number, or out of range for int: handled as a usage error
+        // below instead of letting the exception terminate the program.
+      }
+    }
+    if (value < 0) {
+      PrintHelp();
+      return 1;
+    }
+    *target = static_cast<uint32_t>(value);
+  }
+  return 0;
+}
diff --git a/rocrtst/suites/performance/perf_common/perf_base.h b/rocrtst/suites/test_common/test_common.h
similarity index 78%
rename from rocrtst/suites/performance/perf_common/perf_base.h
rename to rocrtst/suites/test_common/test_common.h
index 20f118b4ad..c40329bce2 100755
--- a/rocrtst/suites/performance/perf_common/perf_base.h
+++ b/rocrtst/suites/test_common/test_common.h
@@ -43,24 +43,26 @@
  *
  */
 
+#ifndef ROCRTST_SUITES_TEST_COMMON_TEST_COMMON_H_
+#define ROCRTST_SUITES_TEST_COMMON_TEST_COMMON_H_
 
-#ifndef ROCRTST_SUITES_PERFORMANCE_PERF_COMMON_PERF_BASE_H_
-#define ROCRTST_SUITES_PERFORMANCE_PERF_COMMON_PERF_BASE_H_
-
-class PerfBase {
+#include <stdint.h>
+
+// Destinations for the option values parsed by ProcessCmdline(). Only the
+// two pointers are stored; this class never frees what they point to.
+class RocrtstOptions {
  public:
-  // @Brief: Before run the core measure codes, do something to set up
-  // i.e. init runtime, prepare packet...
-  virtual void SetUp(void) = 0;
+  // Neither pointer may be null (checked with assert in the constructor).
+  RocrtstOptions(uint32_t *verb, uint32_t *iter);
 
-  // @Brief: Core measurement codes executing here
-  virtual void Run(void) = 0;
+  ~RocrtstOptions(void);
 
-  // @Brief: Do something clean up
-  virtual void Close(void) = 0;
-
-  // @Brief: Display the results
-  virtual void DisplayResults(void) const = 0;
+  uint32_t *verbosity_;   // receives the argument of -v
+  uint32_t *iterations_;  // receives the argument of -i
 };
 
-#endif  // ROCRTST_SUITES_PERFORMANCE_PERF_COMMON_PERF_BASE_H_
+// Parses the RocRTst options in arg_list into *test. Returns 0 on success
+// and 1 when the help text was printed instead.
+uint32_t ProcessCmdline(RocrtstOptions* test, int arg_cnt, char** arg_list);
+
+#endif  // ROCRTST_SUITES_TEST_COMMON_TEST_COMMON_H_