Added CPU to GPU and GPU to CPU MemoryAccess Tests, Added enqueue latency Tests

Change-Id: I18643d283101b792fa25705c8149ddc5a9eefe73
2017-12-19 12:21:54 +05:30
@@ -0,0 +1,502 @@
+/*
+ * =============================================================================
+ *   ROC Runtime Conformance Release License
+ * =============================================================================
+ * The University of Illinois/NCSA
+ * Open Source License (NCSA)
+ *
+ * Copyright (c) 2017, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Developed by:
+ *
+ *                 AMD Research and AMD ROC Software Development
+ *
+ *                 Advanced Micro Devices, Inc.
+ *
+ *                 www.amd.com
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal with the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ *  - Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimers.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimers in
+ *    the documentation and/or other materials provided with the distribution.
+ *  - Neither the names of <Name of Development Group, Name of Institution>,
+ *    nor the names of its contributors may be used to endorse or promote
+ *    products derived from this Software without specific prior written
+ *    permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS WITH THE SOFTWARE.
+ *
+ */
+
+
+#include <fcntl.h>
+#include <algorithm>
+#include <iostream>
+#include <vector>
+#include <memory>
+
+#include "suites/functional/memory_access.h"
+#include "common/base_rocr_utils.h"
+#include "common/common.h"
+#include "common/helper_funcs.h"
+#include "common/hsatimer.h"
+#include "gtest/gtest.h"
+#include "hsa/hsa.h"
+#include "hsa/hsa_ext_finalize.h"
+
+static const uint32_t kNumBufferElements = 256;
+
+
+
+
+
+#define RET_IF_HSA_ERR(err) { \
+  if ((err) != HSA_STATUS_SUCCESS) { \
+    const char* msg = 0; \
+    hsa_status_string(err, &msg); \
+    std::cout << "hsa api call failure at line " << __LINE__ << ", file: " << \
+                          __FILE__ << ". Call returned " << err << std::endl; \
+    std::cout << msg << std::endl; \
+    return (err); \
+  } \
+}
+
+
+
+MemoryAccessTest::MemoryAccessTest(void) :
+    TestBase() {
+  set_num_iteration(10);  // Number of iterations to execute of the main test;
+                          // This is a default value which can be overridden
+                          // on the command line.
+
+  set_title("RocR Memory Access Tests");
+  set_description("This series of tests check memory allocation"
+    "on GPU and CPU, i.e. GPU access to system memory "
+    "and CPU access to GPU memory.");
+}
+
+MemoryAccessTest::~MemoryAccessTest(void) {
+}
+
+// Any 1-time setup involving member variables used in the rest of the test
+// should be done here.
+void MemoryAccessTest::SetUp(void) {
+  hsa_status_t err;
+
+  TestBase::SetUp();
+
+  err = rocrtst::SetDefaultAgents(this);
+  ASSERT_EQ(HSA_STATUS_SUCCESS, err);
+
+  err = rocrtst::SetPoolsTypical(this);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+  return;
+}
+
+void MemoryAccessTest::Run(void) {
+  // Compare required profile for this test case with what we're actually
+  // running on
+  if (!rocrtst::CheckProfile(this)) {
+    return;
+  }
+
+  TestBase::Run();
+}
+
+void MemoryAccessTest::DisplayTestInfo(void) {
+  TestBase::DisplayTestInfo();
+}
+
+void MemoryAccessTest::DisplayResults(void) const {
+  // Compare required profile for this test case with what we're actually
+  // running on
+  if (!rocrtst::CheckProfile(this)) {
+    return;
+  }
+
+  return;
+}
+
+void MemoryAccessTest::Close() {
+  // This will close handles opened within rocrtst utility calls and call
+  // hsa_shut_down(), so it should be done after other hsa cleanup
+  TestBase::Close();
+}
+
+
+
+
+
+typedef struct  __attribute__ ((aligned(16)))  args_t {
+     int *a;
+     int *b;
+     int *c;
+  } args;
+
+  args *kernArgs = NULL;
+
+static const char kSubTestSeparator[] = "  **************************";
+
+static void PrintMemorySubtestHeader(const char *header) {
+  std::cout << "  *** Memory Subtest: " << header << " ***" << std::endl;
+}
+
+static const int kMemoryAllocSize = 1024;
+// Test to check GPU can read & write to system memory
+void MemoryAccessTest::GPUAccessToCPUMemoryTest(hsa_agent_t cpuAgent,
+                                                   hsa_agent_t gpuAgent) {
+  hsa_status_t err;
+
+  // Get Global Memory Pool on the gpuAgent to allocate gpu buffers
+  hsa_amd_memory_pool_t gpu_pool;
+  err = hsa_amd_agent_iterate_memory_pools(gpuAgent,
+                                            rocrtst::GetGlobalMemoryPool,
+                                            &gpu_pool);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  hsa_amd_memory_pool_access_t access;
+  hsa_amd_agent_memory_pool_get_info(cpuAgent, gpu_pool,
+                                       HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
+                                       &access);
+  if (access != HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) {
+    // hsa objects
+    hsa_queue_t *queue = NULL;  // command queue
+    hsa_signal_t signal = {0};  // completion signal
+
+
+    // get queue size
+    uint32_t queue_size = 0;
+    err = hsa_agent_get_info(gpuAgent,
+                                HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size);
+    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+    // create queue
+    err = hsa_queue_create(gpuAgent,
+                              queue_size, HSA_QUEUE_TYPE_MULTI,
+                              NULL, NULL, 0, 0, &queue);
+    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+    // Get System Memory Pool on the cpuAgent to allocate host side buffers
+    hsa_amd_memory_pool_t global_pool;
+    err = hsa_amd_agent_iterate_memory_pools(cpuAgent,
+                                              rocrtst::GetGlobalMemoryPool,
+                                              &global_pool);
+    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+
+
+    // Find a memory pool that supports kernel arguments.
+    hsa_amd_memory_pool_t kernarg_pool;
+    err = hsa_amd_agent_iterate_memory_pools(cpuAgent,
+                                              rocrtst::GetKernArgMemoryPool,
+                                              &kernarg_pool);
+    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+    // Allocate the host side buffers
+    // (sys_data,dup_sys_data,cpuResult,kernArg) on system memory
+    int *sys_data = NULL;
+    int *dup_sys_data = NULL;
+    int *cpuResult = NULL;
+    int *gpuResult = NULL;
+
+    err = hsa_amd_memory_pool_allocate(global_pool,
+                                      kMemoryAllocSize, 0,
+                                      reinterpret_cast<void **>(&cpuResult));
+    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+    err = hsa_amd_memory_pool_allocate(global_pool,
+                                      kMemoryAllocSize, 0,
+                                      reinterpret_cast<void **>(&sys_data));
+    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+    err = hsa_amd_memory_pool_allocate(global_pool,
+                                      kMemoryAllocSize, 0,
+                                      reinterpret_cast<void **>(&dup_sys_data));
+    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+
+    // Allocate the kernel argument buffer from the kernarg_pool.
+    err = hsa_amd_memory_pool_allocate(kernarg_pool, sizeof(args_t), 0,
+                                        reinterpret_cast<void **>(&kernArgs));
+    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+    // initialize the host buffers
+    for (int i = 0; i < kMemoryAllocSize; ++i) {
+      unsigned int seed = time(NULL);
+      sys_data[i] = 1 + rand_r(&seed) % 1;
+      dup_sys_data[i] = sys_data[i];
+    }
+
+    memset(cpuResult, 0, kMemoryAllocSize * sizeof(int));
+
+    // for the dGPU, we have coarse grained local memory,
+    // so allocate memory for it on the GPU's GLOBAL segment .
+
+    // Get local memory of GPU to allocate device side buffers
+
+    err = hsa_amd_memory_pool_allocate(gpu_pool, kMemoryAllocSize, 0,
+                                        reinterpret_cast<void **>(&gpuResult));
+    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+
+    // Allow cpuAgent access to all allocated GPU memory.
+    err = hsa_amd_agents_allow_access(1, &cpuAgent, NULL, gpuResult);
+    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+    memset(gpuResult, 0, kMemoryAllocSize * sizeof(int));
+
+    // Allow gpuAgent access to all allocated system memory.
+    err = hsa_amd_agents_allow_access(1, &gpuAgent, NULL, cpuResult);
+    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+    err = hsa_amd_agents_allow_access(1, &gpuAgent, NULL, sys_data);
+    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+    err = hsa_amd_agents_allow_access(1, &gpuAgent, NULL, dup_sys_data);
+    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+    err = hsa_amd_agents_allow_access(1, &gpuAgent, NULL, kernArgs);
+    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+    kernArgs->a = sys_data;
+    kernArgs->b = cpuResult;  // system memory passed to gpu for write
+    kernArgs->c = gpuResult;  // gpu memory to verify that gpu read system data
+
+
+    // Create the executable, get symbol by name and load the code object
+    set_kernel_file_name("gpuReadWrite_kernels.hsaco");
+    set_kernel_name("gpuReadWrite");
+    err = rocrtst::LoadKernelFromObjFile(this, &gpuAgent);
+    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+
+
+    // Fill the dispatch packet with
+    // workgroup_size, grid_size, kernelArgs and completion signal
+    // Put it on the queue and launch the kernel by ringing the doorbell
+
+    // create completion signal
+    err = hsa_signal_create(1, 0, NULL, &signal);
+    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+    // create aql packet
+    hsa_kernel_dispatch_packet_t aql;
+    memset(&aql, 0, sizeof(aql));
+
+    // initialize aql packet
+    aql.header =
+      (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
+      (1 << HSA_PACKET_HEADER_BARRIER) |
+      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
+      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
+    aql.setup = 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
+    aql.workgroup_size_x = 256;
+    aql.workgroup_size_y = 1;
+    aql.workgroup_size_z = 1;
+    aql.grid_size_x = kMemoryAllocSize;
+    aql.grid_size_y = 1;
+    aql.grid_size_z = 1;
+    aql.private_segment_size = 0;
+    aql.group_segment_size = 0;
+    aql.kernel_object = kernel_object();  // kernel_code;
+    aql.kernarg_address = kernArgs;
+    aql.completion_signal = signal;
+
+    // const uint32_t queue_size = queue->size;
+    const uint32_t queue_mask = queue->size - 1;
+
+    // write to command queue
+    uint64_t index = hsa_queue_load_write_index_relaxed(queue);
+    reinterpret_cast<hsa_kernel_dispatch_packet_t*>
+                              (queue->base_address)[index & queue_mask] = aql;
+    hsa_queue_store_write_index_relaxed(queue, index + 1);
+
+    // ringdoor bell
+    hsa_signal_store_relaxed(queue->doorbell_signal, index);
+    // wait for the signal and reset it for future use
+    while (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1,
+                                      (uint64_t)-1, HSA_WAIT_STATE_ACTIVE)) { }
+    hsa_signal_store_relaxed(signal, 1);
+
+    // compare device and host side results
+    if (verbosity() > 0) {
+      std::cout<< "check gpu has read the system memory"<< std::endl;
+    }
+    for (int i = 0; i < kMemoryAllocSize; ++i) {
+      ASSERT_EQ(gpuResult[i], dup_sys_data[i]);
+    }
+
+    if (verbosity() > 0) {
+      std::cout<< "gpu has read the system memory successfully"<< std::endl;
+      std::cout<< "check gpu has written to system memory"<< std::endl;
+    }
+    for (int i = 0; i < kMemoryAllocSize; ++i) {
+      ASSERT_EQ(cpuResult[i], i);
+    }
+
+    if (verbosity() > 0) {
+      std::cout<< "gpu has written to system memory successfully"<< std::endl;
+    }
+
+    if (sys_data) { hsa_memory_free(sys_data); }
+    if (dup_sys_data) { hsa_memory_free(dup_sys_data); }
+    if (cpuResult) {hsa_memory_free(cpuResult); }
+    if (gpuResult) {hsa_memory_free(gpuResult); }
+    if (kernArgs) { hsa_memory_free(kernArgs); }
+    if (signal.handle) { hsa_signal_destroy(signal); }
+    if (queue) { hsa_queue_destroy(queue); }
+  } else {
+    if (verbosity() > 0) {
+      std::cout<< "Test not applicable as system is not large bar."
+                   "Skipping."<< std::endl;
+      std::cout << kSubTestSeparator << std::endl;
+    }
+    return;
+  }
+}
+
+
+
+
+// Test to check cpu can read & write to GPU memory
+void MemoryAccessTest::CPUAccessToGPUMemoryTest(hsa_agent_t cpuAgent,
+                                                 hsa_agent_t gpuAgent,
+                                                 hsa_amd_memory_pool_t pool) {
+  hsa_status_t err;
+
+  rocrtst::pool_info_t pool_i;
+  err = rocrtst::AcquirePoolInfo(pool, &pool_i);
+  ASSERT_EQ(HSA_STATUS_SUCCESS, err);
+
+  if (pool_i.segment == HSA_AMD_SEGMENT_GLOBAL &&
+        pool_i.global_flag == HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
+    hsa_amd_memory_pool_access_t access;
+    hsa_amd_agent_memory_pool_get_info(cpuAgent, pool,
+                                         HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
+                                         &access);
+    if (access != HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) {
+      if (!pool_i.alloc_allowed || pool_i.alloc_granule == 0 ||
+                                           pool_i.alloc_alignment == 0) {
+        if (verbosity() > 0) {
+          std::cout << "  Test not applicable. Skipping." << std::endl;
+          std::cout << kSubTestSeparator << std::endl;
+        }
+        return;
+      }
+
+
+      auto gran_sz = pool_i.alloc_granule;
+      auto pool_sz = pool_i.size / gran_sz;
+      auto max_alloc_size = pool_sz/2;
+      unsigned int max_element = max_alloc_size/sizeof(unsigned int);
+      unsigned int *gpu_data;
+      unsigned int *sys_data;
+      sys_data = (unsigned int*)malloc(max_alloc_size);
+      memset(sys_data, 0, max_alloc_size);
+      for (unsigned int i = 1; i <= max_element; ++i) {
+        sys_data[i] = i;
+      }
+      // err = hsa_amd_agents_allow_access(1, &gpuAgent, NULL, sys_data);
+      // EXPECT_EQ(err, HSA_STATUS_SUCCESS);
+      err = hsa_amd_memory_pool_allocate(pool, max_alloc_size, 0,
+                                          reinterpret_cast<void**>(&gpu_data));
+      ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+      /*
+      if (err == HSA_STATUS_ERROR) {
+        err = hsa_amd_memory_pool_free(gpu_data);
+      }*/
+
+      err = hsa_amd_agents_allow_access(1, &cpuAgent, NULL, gpu_data);
+      ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+      // EXPECT_EQ(HSA_STATUS_SUCCESS, err);
+      memset(gpu_data, 0, max_alloc_size);
+
+      // Verify CPU can read & write to GPU memory
+      std::cout<< "Verify CPU can read & write to GPU memory"<< std::endl;
+      for (unsigned int i = 1; i <= max_element; ++i) {
+        gpu_data[i] = i;  // Write to gpu memory directly
+      }
+
+     for (unsigned int  i = 1; i <= max_element; ++i) {
+       if (sys_data[i] != gpu_data[i]) {  // Reading GPU memory
+            fprintf(stdout, "Values not mathing !! sys_data[%d]:%d ,"
+                "gpu_data[%d]\n", sys_data[i], i, gpu_data[i]);
+       }
+     }
+     std::cout<< "CPU have read & write to GPU memory successfully"<< std::endl;
+     err = hsa_amd_memory_pool_free(gpu_data);
+     free(sys_data);
+     } else {
+        if (verbosity() > 0) {
+          std::cout<< "Test not applicable as system is not large bar."
+                         "Skipping."<< std::endl;
+          std::cout << kSubTestSeparator << std::endl;
+        }
+        return;
+    }
+  }
+}
+
+
+void MemoryAccessTest::CPUAccessToGPUMemoryTest(void) {
+  hsa_status_t err;
+
+  PrintMemorySubtestHeader("CPUAccessToGPUMemoryTest in Memory Pools");
+  // find all cpu agents
+  std::vector<hsa_agent_t> cpus;
+  err = hsa_iterate_agents(rocrtst::IterateCPUAgents, &cpus);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+  // find all gpu agents
+  std::vector<hsa_agent_t> gpus;
+  err = hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+  for (unsigned int i = 0 ; i< gpus.size(); ++i) {
+    hsa_amd_memory_pool_t gpu_pool;
+    err = hsa_amd_agent_iterate_memory_pools(gpus[i],
+                                              rocrtst::GetGlobalMemoryPool,
+                                              &gpu_pool);
+    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+    CPUAccessToGPUMemoryTest(cpus[0], gpus[i], gpu_pool);
+  }
+  if (verbosity() > 0) {
+    std::cout << "subtest Passed" << std::endl;
+    std::cout << kSubTestSeparator << std::endl;
+  }
+}
+
+void MemoryAccessTest::GPUAccessToCPUMemoryTest(void) {
+  hsa_status_t err;
+
+  PrintMemorySubtestHeader("GPUAccessToCPUMemoryTest in Memory Pools");
+  // find all cpu agents
+  std::vector<hsa_agent_t> cpus;
+  err = hsa_iterate_agents(rocrtst::IterateCPUAgents, &cpus);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  // find all gpu agents
+  std::vector<hsa_agent_t> gpus;
+  err = hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  for (unsigned int i = 0 ; i< gpus.size(); ++i) {
+    GPUAccessToCPUMemoryTest(cpus[0], gpus[i]);
+  }
+
+  if (verbosity() > 0) {
+    std::cout << "subtest Passed" << std::endl;
+    std::cout << kSubTestSeparator << std::endl;
+  }
+}
+
+#undef RET_IF_HSA_ERR
@@ -0,0 +1,90 @@
+/*
+ * =============================================================================
+ *   ROC Runtime Conformance Release License
+ * =============================================================================
+ * The University of Illinois/NCSA
+ * Open Source License (NCSA)
+ *
+ * Copyright (c) 2017, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Developed by:
+ *
+ *                 AMD Research and AMD ROC Software Development
+ *
+ *                 Advanced Micro Devices, Inc.
+ *
+ *                 www.amd.com
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal with the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ *  - Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimers.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimers in
+ *    the documentation and/or other materials provided with the distribution.
+ *  - Neither the names of <Name of Development Group, Name of Institution>,
+ *    nor the names of its contributors may be used to endorse or promote
+ *    products derived from this Software without specific prior written
+ *    permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS WITH THE SOFTWARE.
+ *
+ */
+#ifndef ROCRTST_SUITES_FUNCTIONAL_MEMORY_ACCESS_H_
+#define ROCRTST_SUITES_FUNCTIONAL_MEMORY_ACCESS_H_
+
+
+#include "common/base_rocr.h"
+#include "hsa/hsa.h"
+#include "suites/test_common/test_base.h"
+
+class MemoryAccessTest : public TestBase {
+ public:
+    MemoryAccessTest();
+
+  // @Brief: Destructor for test case of MemoryTest
+  virtual ~MemoryAccessTest();
+
+  // @Brief: Setup the environment for measurement
+  virtual void SetUp();
+
+  // @Brief: Core measurement execution
+  virtual void Run();
+
+  // @Brief: Clean up and retrive the resource
+  virtual void Close();
+
+  // @Brief: Display  results
+  virtual void DisplayResults() const;
+
+  // @Brief: Display information about what this test does
+  virtual void DisplayTestInfo(void);
+
+
+  // @Brief: This test verify that CPU is able to Read & write GPU memory
+  void CPUAccessToGPUMemoryTest(void);
+
+  // @Brief: This test verify that GPU is able to Read & write CPU memory
+  void GPUAccessToCPUMemoryTest(void);
+
+
+ private:
+  void CPUAccessToGPUMemoryTest(hsa_agent_t cpuAgent,
+                                                   hsa_agent_t gpuAgent,
+                                                   hsa_amd_memory_pool_t pool);
+  void GPUAccessToCPUMemoryTest(hsa_agent_t cpuAgent, hsa_agent_t gpuAgent);
+};
+
+#endif  // ROCRTST_SUITES_FUNCTIONAL_MEMORY_ACCESS_H_
@@ -0,0 +1,350 @@
+/*
+ * =============================================================================
+ *   ROC Runtime Conformance Release License
+ * =============================================================================
+ * The University of Illinois/NCSA
+ * Open Source License (NCSA)
+ *
+ * Copyright (c) 2017, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Developed by:
+ *
+ *                 AMD Research and AMD ROC Software Development
+ *
+ *                 Advanced Micro Devices, Inc.
+ *
+ *                 www.amd.com
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal with the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ *  - Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimers.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimers in
+ *    the documentation and/or other materials provided with the distribution.
+ *  - Neither the names of <Name of Development Group, Name of Institution>,
+ *    nor the names of its contributors may be used to endorse or promote
+ *    products derived from this Software without specific prior written
+ *    permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS WITH THE SOFTWARE.
+ *
+ */
+#include <fcntl.h>
+#include <algorithm>
+#include <string>
+
+#include "suites/performance/enqueueLatency.h"
+#include "common/base_rocr_utils.h"
+#include "common/common.h"
+#include "common/os.h"
+#include "common/helper_funcs.h"
+#include "common/hsatimer.h"
+#include "gtest/gtest.h"
+#include "hsa/hsa.h"
+#include "hsa/hsa_ext_finalize.h"
+
+#define RET_IF_HSA_ERR(err) { \
+  if ((err) != HSA_STATUS_SUCCESS) { \
+    const char* msg = 0; \
+    hsa_status_string(err, &msg); \
+    std::cout << "hsa api call failure at line " << __LINE__ << ", file: " << \
+                          __FILE__ << ". Call returned " << err << std::endl; \
+    std::cout << msg << std::endl; \
+    return (err); \
+  } \
+}
+
+EnqueueLatency::
+EnqueueLatency(bool enqueueSinglePacket) : TestBase(),
+                                    enqueue_single_(enqueueSinglePacket) {
+  queue_size_ = 0;
+  num_of_pkts_ = 100000;
+  memset(&aql(), 0, sizeof(hsa_kernel_dispatch_packet_t));
+  enqueue_time_mean_ = 0.0;
+  set_num_iteration(100);
+
+  std::string name;
+  std::string desc;
+
+  name = "Average Enqueue Time";
+  desc = "This test measures the time when the packet enqueue to the"
+      " queue and before the door bell is ring to notify the command processor "
+      "to execute the packet";
+
+
+
+  if (enqueueSinglePacket) {
+    name += ", Single Packet";
+    desc += " One Packet at a time in queue.";
+  } else {
+    name += ", Multiple Packets";
+    desc += " Multiple i.e. maximum Packets equeued to queue at one time";
+  }
+
+  set_title(name);
+  set_description(desc);
+}
+
+EnqueueLatency::~EnqueueLatency() {
+}
+
+void EnqueueLatency::SetUp() {
+  hsa_status_t err;
+  TestBase::SetUp();
+  // If it indicates to use default signal, set env var properly
+
+  err = SetDefaultAgents(this);
+  ASSERT_EQ(HSA_STATUS_SUCCESS, err);
+}
+
+void EnqueueLatency::Run() {
+  if (!rocrtst::CheckProfile(this)) {
+    return;
+  }
+  hsa_status_t err;
+  TestBase::Run();
+
+  // find all gpu agents
+  std::vector<hsa_agent_t> gpus;
+  err = hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  for (unsigned int i = 0 ; i< gpus.size(); ++i) {
+    hsa_agent_t* gpu_dev = &gpus[i];
+    char agent_name[64];
+    err = hsa_agent_get_info(*gpu_dev, HSA_AGENT_INFO_NAME, agent_name);
+    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+    set_agent_name(agent_name);
+
+    // Create a queue
+    hsa_queue_t* q = nullptr;
+    rocrtst::CreateQueue(*gpu_dev, &q);
+    ASSERT_NE(q, nullptr);
+    set_main_queue(q);
+
+    set_kernel_file_name("dispatch_time_kernels.hsaco");
+    set_kernel_name("empty_kernel");
+    err = rocrtst::LoadKernelFromObjFile(this, gpu_dev);
+    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+    // Fill up the kernel packet except header
+    err = rocrtst::InitializeAQLPacket(this, &aql());
+    ASSERT_EQ(HSA_STATUS_SUCCESS, err);
+
+    aql().workgroup_size_x = 1;
+    aql().grid_size_x = 1;
+
+    // Here, modify the batch size if it is larger than the queue size
+    if (enqueue_single_) {
+      EnqueueSinglePacket();
+    } else {
+      hsa_status_t err;
+      uint32_t size = 0;
+      err = hsa_agent_get_info(*gpu_dev, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &size);
+      ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+      num_of_pkts_ = num_of_pkts_ > size ? size : num_of_pkts_;
+      EnqueueMultiPackets();
+    }
+  }
+}
+
+
+
+size_t EnqueueLatency::RealIterationNum() {
+  return num_iteration() * 1.2 + 1;
+}
+
+void EnqueueLatency::EnqueueSinglePacket() {
+  std::vector<double> timer;
+
+  int it = RealIterationNum();
+  const uint32_t queue_mask = main_queue()->size - 1;
+
+  // queue should be empty
+  ASSERT_EQ(hsa_queue_load_read_index_scacquire(main_queue()),
+            hsa_queue_load_write_index_scacquire(main_queue()));
+
+  void *q_base_addr = main_queue()->base_address;
+  rocrtst::PerfTimer p_timer;
+  for (int i = 0; i < it; i++) {
+    // Get timing stamp and ring the doorbell to dispatch the kernel.
+    int id = p_timer.CreateTimer();
+    p_timer.StartTimer(id);
+    // Obtain the current queue write index.
+    uint64_t index = hsa_queue_add_write_index_relaxed(main_queue(), 1);
+
+    ASSERT_LT(index, main_queue()->size + index);
+
+    // Write the aql packet at the calculated queue index address.
+    reinterpret_cast<hsa_kernel_dispatch_packet_t *>(
+                                     q_base_addr)[index & queue_mask] = aql();
+
+    reinterpret_cast<hsa_kernel_dispatch_packet_t *>(
+                        q_base_addr)[index & queue_mask].header |=
+                    HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
+
+    p_timer.StopTimer(id);
+
+    timer.push_back(p_timer.ReadTimer(id));
+    hsa_signal_store_screlease(main_queue()->doorbell_signal, index);
+
+    // Wait on the dispatch signal until the kernel is finished.
+    while (hsa_signal_wait_scacquire(aql().completion_signal,
+         HSA_SIGNAL_CONDITION_LT, 1, (uint64_t) - 1, HSA_WAIT_STATE_ACTIVE)) {
+    }
+
+
+
+    hsa_signal_store_screlease(aql().completion_signal, 1);
+
+    if (verbosity() >= VERBOSE_PROGRESS) {
+      std::cout << ".";
+      fflush(stdout);
+    }
+  }
+
+  if (verbosity() >= VERBOSE_PROGRESS) {
+    std::cout << std::endl;
+  }
+
+  // Abandon the first result and after sort, delete the last 2% value
+  timer.erase(timer.begin());
+  std::sort(timer.begin(), timer.end());
+
+  timer.erase(timer.begin() + num_iteration(), timer.end());
+
+  enqueue_time_mean_ = rocrtst::CalcMean(timer);
+
+  return;
+}
+
+
+
+void EnqueueLatency::EnqueueMultiPackets() {
+  std::vector<double> timer;
+  int it = RealIterationNum();
+  const uint32_t queue_mask = main_queue()->size - 1;
+
+  // queue should be empty
+  ASSERT_EQ(hsa_queue_load_read_index_scacquire(main_queue()),
+            hsa_queue_load_write_index_scacquire(main_queue()));
+
+  rocrtst::PerfTimer p_timer;
+
+  for (int i = 0; i < it; i++) {
+    // Get timing stamp and ring the doorbell to dispatch the kernel.
+    int id = p_timer.CreateTimer();
+    p_timer.StartTimer(id);
+    uint64_t* index =
+           reinterpret_cast<uint64_t*>(malloc(sizeof(uint64_t) * num_of_pkts_));
+
+    hsa_signal_store_screlease(aql().completion_signal, num_of_pkts_);
+
+    for (uint32_t j = 0; j < num_of_pkts_; j++) {
+      // index[j] = hsa_queue_add_write_index_scacq_screl(main_queue(), 1);
+      index[j] = hsa_queue_add_write_index_relaxed(main_queue(), 1);
+
+      // Write the aql packet at the calculated queue index address.
+      (reinterpret_cast<hsa_kernel_dispatch_packet_t*>((
+                 main_queue()->base_address)))[index[j] & queue_mask] = aql();
+
+      if (j == num_of_pkts_ - 1) {
+        (reinterpret_cast<hsa_kernel_dispatch_packet_t*>(
+            main_queue()->base_address))[index[j] & queue_mask].header |=
+                                               1 << HSA_PACKET_HEADER_BARRIER;
+      }
+    }
+
+    // Set packet header reversly; set all headers except the very first
+    // one, for now.
+    for (uint32_t j = num_of_pkts_ - 1; j > 0; j--) {
+      reinterpret_cast<hsa_kernel_dispatch_packet_t*>(
+         (main_queue()->base_address))[index[j] & queue_mask].header |=
+                    HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
+    }
+
+
+    // Set the very first header...
+    (reinterpret_cast<hsa_kernel_dispatch_packet_t*>(
+        main_queue()->base_address))[index[0] & queue_mask].header |=
+                    HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
+
+    p_timer.StopTimer(id);
+
+    timer.push_back(p_timer.ReadTimer(id));
+
+    for (uint32_t j = 0; j < num_of_pkts_; j++) {
+      hsa_signal_store_screlease(main_queue()->doorbell_signal, index[j]);
+    }
+
+    // Wait on the dispatch signal until the kernel is finished.
+    while (hsa_signal_wait_scacquire(aql().completion_signal,
+        HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_ACTIVE) != 0) {
+    }
+
+
+    hsa_signal_store_screlease(aql().completion_signal, 1);
+
+    free(index);
+
+    if (verbosity() >= VERBOSE_PROGRESS) {
+      std::cout << ".";
+      fflush(stdout);
+    }
+  }
+
+  std::cout << std::endl;
+
+  // Abandon the first result and after sort, delete the last 2% value
+  timer.erase(timer.begin());
+  std::sort(timer.begin(), timer.end());
+
+  timer.erase(timer.begin() + num_iteration(), timer.end());
+
+  enqueue_time_mean_ = rocrtst::CalcMean(timer);
+
+  return;
+}
+
+
+
+void EnqueueLatency::DisplayTestInfo(void) {
+  TestBase::DisplayTestInfo();
+}
+
+void EnqueueLatency::DisplayResults(void) const {
+  if (!rocrtst::CheckProfile(this)) {
+    return;
+  }
+
+  TestBase::DisplayResults();
+
+  std::cout << "Average Time to Completion: ";
+  if (enqueue_single_) {
+    std::cout << enqueue_time_mean_ * 1e6;
+  } else {
+    std::cout << enqueue_time_mean_ * 1e6 / num_of_pkts_;
+  }
+
+  std::cout << " uS" << std::endl;
+  return;
+}
+
+void EnqueueLatency::Close() {
+  TestBase::Close();
+  return;
+}
@@ -0,0 +1,109 @@
+/*
+ * =============================================================================
+ *   ROC Runtime Conformance Release License
+ * =============================================================================
+ * The University of Illinois/NCSA
+ * Open Source License (NCSA)
+ *
+ * Copyright (c) 2017, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Developed by:
+ *
+ *                 AMD Research and AMD ROC Software Development
+ *
+ *                 Advanced Micro Devices, Inc.
+ *
+ *                 www.amd.com
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal with the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ *  - Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimers.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimers in
+ *    the documentation and/or other materials provided with the distribution.
+ *  - Neither the names of <Name of Development Group, Name of Institution>,
+ *    nor the names of its contributors may be used to endorse or promote
+ *    products derived from this Software without specific prior written
+ *    permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS WITH THE SOFTWARE.
+ *
+ */
+
+#ifndef ROCRTST_SUITES_PERFORMANCE_ENQUEUELATENCY_H_
+#define ROCRTST_SUITES_PERFORMANCE_ENQUEUELATENCY_H_
+#include <vector>
+
+#include "suites/test_common/test_base.h"
+#include "common/base_rocr.h"
+#include "common/common.h"
+#include "hsa/hsa.h"
+
+// @Brief: This class is defined to measure the mean latency of enqueuing
+//  the packets to an empty kernel
+
+class EnqueueLatency : public TestBase {
+ public:
+  // @Brief: Constructor
+  explicit EnqueueLatency(bool launchSingleKernel);
+
+  // @Brief: Destructor
+  virtual ~EnqueueLatency(void);
+
+  // @Brief: Set up the environment for the test
+  virtual void SetUp(void);
+
+  // @Brief: Run the test case
+  virtual void Run(void);
+
+  // @Brief: Display  results we got
+  virtual void DisplayResults(void) const;
+
+  // @Brief: Display information about what this test does
+  virtual void DisplayTestInfo(void);
+
+  // @Brief: Clean up and close the runtime
+  virtual void Close(void);
+
+  // @Brief: Create the executable, get symbol by name and load the code object
+  // virtual void LoadCodeObject(hsa_agent_t gpuAgent,uint64_t &kernel_code);
+
+ private:
+  // @Brief: Get actual iteration number
+  virtual size_t RealIterationNum(void);
+
+  // @Brief: Launch single packet each time
+  virtual void EnqueueSinglePacket(void);
+
+  // @Brief: Launch multiple packets each time
+  virtual void EnqueueMultiPackets(void);
+
+
+  // @Brief: Indicate if we enqueued single pkt or not
+  bool enqueue_single_;
+
+  // @Brief: Store the size of queue
+  uint32_t queue_size_;
+
+  // @Brief: Number of packets in a batch
+  uint32_t num_of_pkts_;
+
+  // @Brief: Ave. dispatch time
+  double enqueue_time_mean_;
+};
+
+#endif  // ROCRTST_SUITES_PERFORMANCE_ENQUEUELATENCY_H_
+
@@ -304,11 +304,16 @@ build_sample_for_devices("test_case_template")
 #set(CL_FILE_LIST "${KERNELS_DIR}/p2p_mem_access_kernels.cl")
 #build_sample_for_devices("p2p_mem_access")

-# Dispatch Time 
+# Dispatch Time
 set(BITCODE_LIBS "${COMMON_BITCODE_LIBS}")
 set(CL_FILE_LIST "${KERNELS_DIR}/dispatch_time_kernels.cl")
 build_sample_for_devices("dispatch_time")

+# gpuReadWrite
+set(BITCODE_LIBS "${COMMON_BITCODE_LIBS}")
+set(CL_FILE_LIST "${KERNELS_DIR}/gpuReadWrite_kernels.cl")
+build_sample_for_devices("gpuReadWrite")
+
 # Build rules
 add_executable(${ROCRTST} ${performanceSources} ${functionalSources}
                                           ${common_srcs} ${testCommonSources})
@@ -0,0 +1,53 @@
+/*
+ * =============================================================================
+ *   ROC Runtime Conformance Release License
+ * =============================================================================
+ * The University of Illinois/NCSA
+ * Open Source License (NCSA)
+ *
+ * Copyright (c) 2017, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Developed by:
+ *
+ *                 AMD Research and AMD ROC Software Development
+ *
+ *                 Advanced Micro Devices, Inc.
+ *
+ *                 www.amd.com
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal with the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ *  - Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimers.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimers in
+ *    the documentation and/or other materials provided with the distribution.
+ *  - Neither the names of <Name of Development Group, Name of Institution>,
+ *    nor the names of its contributors may be used to endorse or promote
+ *    products derived from this Software without specific prior written
+ *    permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS WITH THE SOFTWARE.
+ *
+ */
+
+__kernel void gpuReadWrite(__global const int * a,
+                           __global int * b, __global int * c) {
+  int i = get_global_id(0);
+  // Reading the system memory and writing to gpu memory
+  c[i] = a[i];  // a[i] point to system memory while c[i] to gpu memory.
+  //writing to system memory
+  b[i] = i;
+}
@@ -49,10 +49,12 @@

 #include "gtest/gtest.h"
 #include "suites/functional/memory_basic.h"
+#include "suites/functional/memory_access.h"
 #include "suites/functional/ipc.h"
 #include "suites/performance/dispatch_time.h"
 #include "suites/performance/memory_async_copy.h"
 #include "suites/performance/memory_async_copy_numa.h"
+#include "suites/performance/enqueueLatency.h"
 #include "suites/test_common/test_case_template.h"
 #include "suites/test_common/main.h"
 #include "suites/test_common/test_common.h"
@@ -136,6 +138,13 @@ TEST(rocrtstFunc, IPC) {
  RunGenericTest(&ipc);
 }

+TEST(rocrtstFunc, MemoryAccessTests) {
+  MemoryAccessTest mt;
+  RunCustomTestProlog(&mt);
+  mt.CPUAccessToGPUMemoryTest();
+  mt.GPUAccessToCPUMemoryTest();
+  RunCustomTestEpilog(&mt);
+}
 // Temporarily disable this test until hsa_shut_down() is (probably not the
 // same as with the IPC test above) is addressed. To override the disable,
 // run with --gtest-also_run_disabled_tests flag.
@@ -147,6 +156,13 @@ TEST(rocrtstFunc, DISABLED_Memory_Max_Mem) {
  RunCustomTestEpilog(&mt);
 }

+TEST(rocrtstPerf, ENQUEUE_LATENCY) {
+  EnqueueLatency singlePacketequeue(true);
+  EnqueueLatency multiPacketequeue(false);
+  RunGenericTest(&singlePacketequeue);
+  RunGenericTest(&multiPacketequeue);
+}
+
 TEST(rocrtstPerf, Memory_Async_Copy) {
  MemoryAsyncCopy mac;
  // To do full test, uncomment this: