Different Atomic operation tests on GPU and system memory

Change-Id: I04154b588086d49142a64c8fe4826d041ded2991 [ROCm/ROCR-Runtime commit: 044fb8dc27]
2018-05-18 16:24:32 +05:30
@@ -0,0 +1,499 @@
+/*
+ * =============================================================================
+ *   ROC Runtime Conformance Release License
+ * =============================================================================
+ * The University of Illinois/NCSA
+ * Open Source License (NCSA)
+ *
+ * Copyright (c) 2018, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Developed by:
+ *
+ *                 AMD Research and AMD ROC Software Development
+ *
+ *                 Advanced Micro Devices, Inc.
+ *
+ *                 www.amd.com
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal with the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ *  - Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimers.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimers in
+ *    the documentation and/or other materials provided with the distribution.
+ *  - Neither the names of <Name of Development Group, Name of Institution>,
+ *    nor the names of its contributors may be used to endorse or promote
+ *    products derived from this Software without specific prior written
+ *    permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS WITH THE SOFTWARE.
+ *
+ */
+
+
+#include <fcntl.h>
+#include <algorithm>
+#include <iostream>
+#include <vector>
+#include <memory>
+#include <string>
+
+#include "suites/functional/memory_atomics.h"
+#include "common/base_rocr_utils.h"
+#include "common/common.h"
+#include "common/helper_funcs.h"
+#include "common/hsatimer.h"
+#include "gtest/gtest.h"
+#include "hsa/hsa.h"
+#include "hsa/hsa_ext_finalize.h"
+
+static const uint32_t kNumBufferElements = 256;
+static const int kValue = 5;
+
+
+// This wrapper atomically writes the provided header and setup to the
+// provided AQL packet. The provided AQL packet address should be in the
+// queue memory space.
+static inline void AtomicSetPacketHeader(uint16_t header, uint16_t setup,
+                                  hsa_kernel_dispatch_packet_t* queue_packet) {
+  __atomic_store_n(reinterpret_cast<uint32_t*>(queue_packet),
+                   header | (setup << 16), __ATOMIC_RELEASE);
+}
+
+
+
+MemoryAtomic::MemoryAtomic(AtomicTest testtype) :
+    TestBase() {
+  set_num_iteration(10);  // Number of iterations to execute of the main test;
+                          // This is a default value which can be overridden
+                          // on the command line.
+  testtype_ = testtype;
+  std::string name;
+  std::string desc;
+
+  name = "RocR Memory Atomic Test";
+  desc = "";
+
+  if (testtype_ == ADD) {
+    name += " For ADD";
+    desc += " This test will do Add kernel atomic"
+            " operation on GPU and system memory.";
+  } else if (testtype_ == SUB) {
+    name += " For Sub";
+    desc += " This test will do Sub kernel atomic"
+            " operation on GPU and system memory.";
+  } else if (testtype_ == AND) {
+    name += " For And";
+    desc += " This test will do AND kernel atomic"
+            " operation on GPU and system memory.";
+  } else if (testtype_ == OR) {
+    name += " For Or";
+    desc += " This test will do OR kernel atomic"
+            " operation on GPU and system memory.";
+  } else if (testtype_ == XOR) {
+    name += " For Xor";
+    desc += " This test will do XOR kernel atomic"
+            " operation on GPU and system memory.";
+  } else if (testtype_ == MIN) {
+    name += " For Minimum";
+    desc += " This test will do Minimum kernel atomic"
+            " operation on GPU and system memory.";
+  } else if (testtype_ == MAX) {
+    name += " For Maximum";
+    desc += " This test will do Maximum kernel atomic"
+            " operation on GPU and system memory.";
+  } else if (testtype_ == XCHG) {
+    name += " For Exchange";
+    desc += " This test will do Xchg kernel atomic"
+            " operation on GPU and system memory.";
+  } else if (testtype_ == INC) {
+    name += " For Increment";
+    desc += " This test will do Increment kernel atomic"
+            " operation on GPU and system memory.";
+  } else if (testtype_ == DEC) {
+    name += " For Decremnet";
+    desc += " This test will do decrement kernel atomic"
+            " operation on GPU and system memory.";
+  }
+
+  set_title(name);
+  set_description(desc);
+  memset(&aql(), 0, sizeof(hsa_kernel_dispatch_packet_t));
+}
+
+MemoryAtomic::~MemoryAtomic(void) {
+}
+
+// Any 1-time setup involving member variables used in the rest of the test
+// should be done here.
+void MemoryAtomic::SetUp(void) {
+  hsa_status_t err;
+
+  TestBase::SetUp();
+
+  err = rocrtst::SetDefaultAgents(this);
+  ASSERT_EQ(HSA_STATUS_SUCCESS, err);
+
+  err = rocrtst::SetPoolsTypical(this);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  // Fill up the kernel packet except header
+  err = rocrtst::InitializeAQLPacket(this, &aql());
+  ASSERT_EQ(HSA_STATUS_SUCCESS, err);
+  return;
+}
+
+void MemoryAtomic::Run(void) {
+  // Compare required profile for this test case with what we're actually
+  // running on
+  if (!rocrtst::CheckProfile(this)) {
+    return;
+  }
+
+  TestBase::Run();
+}
+
+void MemoryAtomic::DisplayTestInfo(void) {
+  TestBase::DisplayTestInfo();
+}
+
+void MemoryAtomic::DisplayResults(void) const {
+  // Compare required profile for this test case with what we're actually
+  // running on
+  if (!rocrtst::CheckProfile(this)) {
+    return;
+  }
+
+  return;
+}
+
+void MemoryAtomic::Close() {
+  // This will close handles opened within rocrtst utility calls and call
+  // hsa_shut_down(), so it should be done after other hsa cleanup
+  TestBase::Close();
+}
+
+void MemoryAtomic::WriteAQLPktToQueue(hsa_queue_t* q) {
+  void* queue_base = q->base_address;
+  const uint32_t queue_mask = q->size - 1;
+  uint64_t index = hsa_queue_add_write_index_relaxed(q, 1);
+
+      reinterpret_cast<hsa_kernel_dispatch_packet_t *>(
+                                     queue_base)[index & queue_mask] = aql();
+}
+
+
+
+
+
+typedef struct  __attribute__ ((aligned(16)))  args_t {
+  int *a;
+  int *b;
+  int *c;
+  int d;
+  int n;
+  } args;
+
+
+
+static const char kSubTestSeparator[] = "  **************************";
+
+
+static const int kMemoryAllocSize = 10;
+
+void MemoryAtomic::MemoryAtomicTest(hsa_agent_t cpuAgent,
+                                                   hsa_agent_t gpuAgent) {
+  hsa_status_t err;
+
+  // Get Global Memory Pool on the gpuAgent to allocate gpu buffers
+  hsa_amd_memory_pool_t gpu_pool;
+  err = hsa_amd_agent_iterate_memory_pools(gpuAgent,
+                                            rocrtst::GetGlobalMemoryPool,
+                                            &gpu_pool);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  hsa_amd_memory_pool_access_t access;
+  hsa_amd_agent_memory_pool_get_info(cpuAgent, gpu_pool,
+                                       HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
+                                       &access);
+  if (access != HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED) {
+    // hsa objects
+    hsa_queue_t *queue = NULL;  // command queue
+    // get queue size
+    uint32_t queue_size = 0;
+    err = hsa_agent_get_info(gpuAgent,
+                             HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_size);
+    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+    // create queue
+    err = hsa_queue_create(gpuAgent,
+                           queue_size, HSA_QUEUE_TYPE_MULTI,
+                           NULL, NULL, 0, 0, &queue);
+    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+    // Find a memory pool that supports kernel arguments.
+    hsa_amd_memory_pool_t kernarg_pool;
+    err = hsa_amd_agent_iterate_memory_pools(cpuAgent,
+                                              rocrtst::GetKernArgMemoryPool,
+                                              &kernarg_pool);
+    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+    // Allocate the host side buffers
+    // (refSysdata,oldValues,oldrefdata,kernArg) on system memory
+
+    // this is ref sys data on which atomics operation need to done
+    int *refSysdata = NULL;
+    // This is oldrefdata which will be required  to compare the returned old values after atomics operation
+    int *oldrefdata = NULL;
+    // This is returned old values
+    int *oldValues = NULL;
+    // This is expected data set
+    int *expecteddata = NULL;
+
+    // Get System Memory Pool on the cpuAgent to allocate host side buffers
+    hsa_amd_memory_pool_t global_pool;
+    err = hsa_amd_agent_iterate_memory_pools(cpuAgent,
+                                              rocrtst::GetGlobalMemoryPool,
+                                              &global_pool);
+    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+    err = hsa_amd_memory_pool_allocate(global_pool,
+                                      kMemoryAllocSize, 0,
+                                      reinterpret_cast<void **>(&oldValues));
+    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+    err = hsa_amd_memory_pool_allocate(global_pool,
+                                      kMemoryAllocSize, 0,
+                                      reinterpret_cast<void **>(&refSysdata));
+    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+    err = hsa_amd_memory_pool_allocate(global_pool,
+                                      kMemoryAllocSize, 0,
+                                      reinterpret_cast<void **>(&oldrefdata));
+    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+    err = hsa_amd_memory_pool_allocate(global_pool,
+                                      kMemoryAllocSize, 0,
+                                      reinterpret_cast<void **>(&expecteddata));
+    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+
+    // Allocate the kernel argument buffer from the kernarg_pool.
+    args *kernArguments = NULL;
+    err = hsa_amd_memory_pool_allocate(kernarg_pool, sizeof(args_t), 0,
+                                        reinterpret_cast<void **>(&kernArguments));
+    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+
+    memset(oldValues, 0, kMemoryAllocSize * sizeof(int));
+    memset(expecteddata, 0, kMemoryAllocSize * sizeof(int));
+
+    // for the dGPU, we have coarse grained local memory,
+    // so allocate memory for it on the GPU's GLOBAL segment .
+
+    // Get local memory of GPU to allocate device side buffers on which atomics operation need to done
+    int *gpuRefData = NULL;
+    err = hsa_amd_memory_pool_allocate(gpu_pool, kMemoryAllocSize, 0,
+                                        reinterpret_cast<void **>(&gpuRefData));
+    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+
+    // Allow cpuAgent access to all allocated GPU memory.
+    err = hsa_amd_agents_allow_access(1, &cpuAgent, NULL, gpuRefData);
+    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+    memset(gpuRefData, 0, kMemoryAllocSize * sizeof(int));
+
+    // initialize the host buffers & gpuRefData buffer
+    for (int i = 0; i < kMemoryAllocSize; ++i) {
+      unsigned int seed = time(NULL);
+      refSysdata[i] = 6 + rand_r(&seed) % 1;
+      gpuRefData[i] = 6 + rand_r(&seed) % 1;
+      oldrefdata[i] = refSysdata[i];
+    }
+
+
+    // Allow gpuAgent access to all allocated system memory.
+    err = hsa_amd_agents_allow_access(1, &gpuAgent, NULL, oldValues);
+    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+    err = hsa_amd_agents_allow_access(1, &gpuAgent, NULL, refSysdata);
+    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+    err = hsa_amd_agents_allow_access(1, &gpuAgent, NULL, oldrefdata);
+    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+    err = hsa_amd_agents_allow_access(1, &gpuAgent, NULL, kernArguments);
+    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+    kernArguments->a = refSysdata;
+    kernArguments->b = gpuRefData;
+    kernArguments->c = oldValues;
+    if (testtype_ != INC && testtype_ != DEC) {
+      kernArguments->d = kValue;
+    }
+
+    // Create the executable, get symbol by name and load the code object
+    set_kernel_file_name("atomicOperations_kernels.hsaco");
+
+    if (testtype_ == ADD) {
+      set_kernel_name("test_atomic_add");
+      // set the expected data result set from kernel
+      for (int i = 0; i < kMemoryAllocSize; ++i) {
+        expecteddata[i] = oldrefdata[i] + kValue;
+      }
+    } else if (testtype_ == SUB) {
+      set_kernel_name("test_atomic_sub");
+      // set the expected data result set from kernel
+      for (int i = 0; i < kMemoryAllocSize; ++i) {
+        expecteddata[i] = oldrefdata[i] - kValue;
+      }
+    } else if (testtype_ == AND) {
+      set_kernel_name("test_atomic_and");
+      // set the expected data result set from kernel
+      for (int i = 0; i < kMemoryAllocSize; ++i) {
+        expecteddata[i] = oldrefdata[i] & kValue;
+      }
+    } else if (testtype_ == OR) {
+      set_kernel_name("test_atomic_or");
+      // set the expected data result set from kernel
+      for (int i = 0; i < kMemoryAllocSize; ++i) {
+        expecteddata[i] = oldrefdata[i] | kValue;
+      }
+    } else if (testtype_ == XOR) {
+      set_kernel_name("test_atomic_xor");
+      // set the expected data result set from kernel
+      for (int i = 0; i < kMemoryAllocSize; ++i) {
+        expecteddata[i] = oldrefdata[i] ^ kValue;
+      }
+    } else if (testtype_ == MIN) {
+      set_kernel_name("test_atomic_min");
+      // set the expected data result set from kernel
+      for (int i = 0; i < kMemoryAllocSize; ++i) {
+        expecteddata[i] = std::min(oldrefdata[i], kValue);
+      }
+    } else if (testtype_ == MAX) {
+      set_kernel_name("test_atomic_max");
+      // set the expected data result set from kernel
+      for (int i = 0; i < kMemoryAllocSize; ++i) {
+        expecteddata[i] = std::max(oldrefdata[i], kValue);
+      }
+    } else if (testtype_ == INC) {
+      set_kernel_name("test_atomic_inc");
+      // set the expected data result set from kernel
+      for (int i = 0; i < kMemoryAllocSize; ++i) {
+        expecteddata[i] = oldrefdata[i] + 4;
+      }
+    } else if (testtype_ == DEC) {
+      set_kernel_name("test_atomic_dec");
+      // set the expected data result set from kernel
+      for (int i = 0; i < kMemoryAllocSize; ++i) {
+        expecteddata[i] = oldrefdata[i] - 4;
+      }
+    } else if (testtype_ == XCHG) {
+      set_kernel_name("test_atomic_xchg");
+      // set the expected data result set from kernel
+      for (int i = 0; i < kMemoryAllocSize; ++i) {
+        expecteddata[i] = kValue;
+      }
+    } else {
+      if (verbosity() > 0) {
+        std::cout<< "No test specified" <<std::endl;
+      }
+    }
+
+    err = rocrtst::LoadKernelFromObjFile(this, &gpuAgent);
+    ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+    // Fill up the kernel packet except header
+    err = rocrtst::InitializeAQLPacket(this, &aql());
+    ASSERT_EQ(HSA_STATUS_SUCCESS, err);
+
+    aql().workgroup_size_x = 256;
+    aql().workgroup_size_y = 1;
+    aql().workgroup_size_z = 1;
+    aql().grid_size_x = kMemoryAllocSize;
+    aql().kernarg_address = kernArguments;
+    aql().kernel_object = kernel_object();
+
+    const uint32_t queue_mask = queue->size - 1;
+
+    // Load index for writing header later to command queue at same index
+    uint64_t index = hsa_queue_load_write_index_relaxed(queue);
+
+    // This function simply copies the data we've collected so far into our
+    // local AQL packet, except the the setup and header fields.
+    WriteAQLPktToQueue(queue);
+
+
+    aql().header = HSA_PACKET_TYPE_KERNEL_DISPATCH;
+    aql().header |= HSA_FENCE_SCOPE_SYSTEM <<
+                 HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
+    aql().header |= HSA_FENCE_SCOPE_SYSTEM <<
+                 HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
+
+    void* q_base = queue->base_address;
+    // Set the Aql packet header
+    AtomicSetPacketHeader(aql().header, aql().setup,
+                        &(reinterpret_cast<hsa_kernel_dispatch_packet_t*>
+                            (q_base))[index & queue_mask]);
+
+
+    // ringdoor bell
+    hsa_signal_store_relaxed(queue->doorbell_signal, index);
+
+    // wait for the signal and reset it for future use
+    while (hsa_signal_wait_scacquire(aql().completion_signal, HSA_SIGNAL_CONDITION_LT, 1,
+                                      (uint64_t)-1, HSA_WAIT_STATE_ACTIVE)) { }
+
+    hsa_signal_store_relaxed(aql().completion_signal, 1);
+
+    // compare results with expected results
+    for (int i = 0; i < kMemoryAllocSize; ++i) {
+      ASSERT_EQ(refSysdata[i], expecteddata[i]);
+      ASSERT_EQ(gpuRefData[i], expecteddata[i]);
+      ASSERT_EQ(oldValues[i], oldrefdata[i]);
+    }
+
+    if (refSysdata) { hsa_memory_free(refSysdata); }
+    if (oldrefdata) { hsa_memory_free(oldrefdata); }
+    if (oldValues) {hsa_memory_free(oldValues); }
+    if (gpuRefData) {hsa_memory_free(gpuRefData); }
+    if (kernArguments) { hsa_memory_free(kernArguments); }
+    if (queue) { hsa_queue_destroy(queue); }
+  } else {
+    if (verbosity() > 0) {
+      std::cout<< "Test not applicable as system is not large bar."
+                   "Skipping."<< std::endl;
+      std::cout << kSubTestSeparator << std::endl;
+    }
+    return;
+  }
+}
+
+void MemoryAtomic::MemoryAtomicTest(void) {
+  hsa_status_t err;
+  // find all cpu agents
+  std::vector<hsa_agent_t> cpus;
+  err = hsa_iterate_agents(rocrtst::IterateCPUAgents, &cpus);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  // find all gpu agents
+  std::vector<hsa_agent_t> gpus;
+  err = hsa_iterate_agents(rocrtst::IterateGPUAgents, &gpus);
+  ASSERT_EQ(err, HSA_STATUS_SUCCESS);
+
+  for (unsigned int i = 0 ; i< gpus.size(); ++i) {
+    MemoryAtomicTest(cpus[0], gpus[i]);
+  }
+}
+
@@ -0,0 +1,100 @@
+/*
+ * =============================================================================
+ *   ROC Runtime Conformance Release License
+ * =============================================================================
+ * The University of Illinois/NCSA
+ * Open Source License (NCSA)
+ *
+ * Copyright (c) 2018, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Developed by:
+ *
+ *                 AMD Research and AMD ROC Software Development
+ *
+ *                 Advanced Micro Devices, Inc.
+ *
+ *                 www.amd.com
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal with the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ *  - Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimers.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimers in
+ *    the documentation and/or other materials provided with the distribution.
+ *  - Neither the names of <Name of Development Group, Name of Institution>,
+ *    nor the names of its contributors may be used to endorse or promote
+ *    products derived from this Software without specific prior written
+ *    permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS WITH THE SOFTWARE.
+ *
+ */
+#ifndef ROCRTST_SUITES_FUNCTIONAL_MEMORY_ATOMICS_H_
+#define ROCRTST_SUITES_FUNCTIONAL_MEMORY_ATOMICS_H_
+
+
+#include "common/base_rocr.h"
+#include "hsa/hsa.h"
+#include "suites/test_common/test_base.h"
+
+// Atomic Test type
+enum AtomicTest {
+  ADD,  // For add atomic operation
+  SUB,  // For sub atomic operation
+  AND,  // For and atomic operation
+  OR,   // For or atomic operation
+  XOR,  // For xor atomic operation
+  INC,  // For inc atomic operation
+  DEC,  // For dec atomic operation
+  MAX,  // For max atomic operation
+  MIN,  // For min atomic operation
+  XCHG,  // For xchg atomic operation
+  NO_TEST};
+
+class MemoryAtomic : public TestBase {
+ public:
+  explicit MemoryAtomic(AtomicTest testtype);
+
+  // @Brief: Destructor for test case of MemoryTest
+  virtual ~MemoryAtomic();
+
+  // @Brief: Setup the environment for measurement
+  virtual void SetUp();
+
+  // @Brief: Core measurement execution
+  virtual void Run();
+
+  // @Brief: Clean up and retrive the resource
+  virtual void Close();
+
+  // @Brief: Display  results
+  virtual void DisplayResults() const;
+
+  // @Brief: Display information about what this test does
+  virtual void DisplayTestInfo(void);
+
+  void MemoryAtomicTest(void);
+
+
+ private:
+  void MemoryAtomicTest(hsa_agent_t cpuAgent, hsa_agent_t gpuAgent);
+
+  void WriteAQLPktToQueue(hsa_queue_t* q);
+
+  AtomicTest testtype_;
+};
+
+#endif  // ROCRTST_SUITES_FUNCTIONAL_MEMORY_ATOMICS_H_
@@ -316,6 +316,7 @@ set(BITCODE_LIBS "${COMMON_BITCODE_LIBS}")
 set(CL_FILE_LIST "${KERNELS_DIR}/gpuReadWrite_kernels.cl")
 build_sample_for_devices("gpuReadWrite")

+
 # Vector Add Debug Trap
 set(BITCODE_LIBS "${COMMON_BITCODE_LIBS}")
 set(CL_FILE_LIST "${KERNELS_DIR}/vector_add_debug_trap_kernel.cl")
@@ -326,6 +327,12 @@ set(BITCODE_LIBS "${COMMON_BITCODE_LIBS}")
 set(CL_FILE_LIST "${KERNELS_DIR}/vector_add_memory_fault_kernel.cl")
 build_sample_for_devices("vector_add_memory_fault")

+# atomic_add_kernels
+set(BITCODE_LIBS "${COMMON_BITCODE_LIBS}")
+set(CL_FILE_LIST "${KERNELS_DIR}/atomicOperations_kernels.cl")
+build_sample_for_devices("atomicOperations")
+
+
 # Build rules
 add_executable(${ROCRTST} ${performanceSources} ${functionalSources} ${negativeSources} ${stressSources}
                                           ${common_srcs} ${testCommonSources})
@@ -0,0 +1,140 @@
+/*
+ * =============================================================================
+ *   ROC Runtime Conformance Release License
+ * =============================================================================
+ * The University of Illinois/NCSA
+ * Open Source License (NCSA)
+ *
+ * Copyright (c) 2018, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Developed by:
+ *
+ *                 AMD Research and AMD ROC Software Development
+ *
+ *                 Advanced Micro Devices, Inc.
+ *
+ *                 www.amd.com
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal with the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ *  - Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimers.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimers in
+ *    the documentation and/or other materials provided with the distribution.
+ *  - Neither the names of <Name of Development Group, Name of Institution>,
+ *    nor the names of its contributors may be used to endorse or promote
+ *    products derived from this Software without specific prior written
+ *    permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS WITH THE SOFTWARE.
+ *
+ */
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+__kernel void test_atomic_add(volatile __global int *sysMemory,
+                              volatile __global int *gpuMemory,
+                             __global int *oldValues, int value) {
+    int  tid = get_global_id(0);
+    oldValues[tid] = atomic_add(&sysMemory[tid], value);
+    atomic_add(&gpuMemory[tid], value);
+}
+
+__kernel void test_atomic_sub(volatile __global int *sysMemory,
+                              volatile __global int *gpuMemory,
+                             __global int *oldValues, int value) {
+    int  tid = get_global_id(0);
+    oldValues[tid] = atomic_sub(&sysMemory[tid], value);
+    atomic_sub(&gpuMemory[tid], value);
+}
+
+__kernel void test_atomic_and(volatile __global int *sysMemory,
+                              volatile __global int *gpuMemory,
+                             __global int *oldValues, int value) {
+    int  tid = get_global_id(0);
+    oldValues[tid] = atomic_and(&sysMemory[tid], value);
+    atomic_and(&gpuMemory[tid], value);
+}
+
+__kernel void test_atomic_or(volatile __global int *sysMemory,
+                              volatile __global int *gpuMemory,
+                             __global int *oldValues, int value) {
+    int  tid = get_global_id(0);
+    oldValues[tid] = atomic_or(&sysMemory[tid], value);
+    atomic_or(&gpuMemory[tid], value);
+}
+
+__kernel void test_atomic_xor(volatile __global int *sysMemory,
+                              volatile __global int *gpuMemory,
+                             __global int *oldValues, int value) {
+    int  tid = get_global_id(0);
+    oldValues[tid] = atomic_xor(&sysMemory[tid], value);
+    atomic_xor(&gpuMemory[tid], value);
+}
+
+__kernel void test_atomic_xchg(volatile __global int *sysMemory,
+                              volatile __global int *gpuMemory,
+                             __global int *oldValues, int value) {
+    int  tid = get_global_id(0);
+    oldValues[tid] = atomic_xchg(&sysMemory[tid], value);
+    atomic_xchg(&gpuMemory[tid], value);
+}
+
+__kernel void test_atomic_inc(volatile __global int *sysMemory,
+                              volatile __global int *gpuMemory,
+                             __global int *oldValues) {
+    int  tid = get_global_id(0);
+    oldValues[tid] = atomic_inc(&sysMemory[tid]);
+    atomic_inc(&sysMemory[tid]);
+    atomic_inc(&sysMemory[tid]);
+    atomic_inc(&sysMemory[tid]);
+
+    atomic_inc(&gpuMemory[tid]);
+    atomic_inc(&gpuMemory[tid]);
+    atomic_inc(&gpuMemory[tid]);
+    atomic_inc(&gpuMemory[tid]);
+}
+
+__kernel void test_atomic_dec(volatile __global int *sysMemory,
+                              volatile __global int *gpuMemory,
+                             __global int *oldValues) {
+    int  tid = get_global_id(0);
+    oldValues[tid] = atomic_dec(&sysMemory[tid]);
+    atomic_dec(&sysMemory[tid]);
+    atomic_dec(&sysMemory[tid]);
+    atomic_dec(&sysMemory[tid]);
+
+    atomic_dec(&gpuMemory[tid]);
+    atomic_dec(&gpuMemory[tid]);
+    atomic_dec(&gpuMemory[tid]);
+    atomic_dec(&gpuMemory[tid]);
+}
+
+__kernel void test_atomic_max(volatile __global int *sysMemory,
+                              volatile __global int *gpuMemory,
+                             __global int *oldValues, int value) {
+    int  tid = get_global_id(0);
+    oldValues[tid] = atomic_max(&sysMemory[tid], value);
+    atomic_max(&gpuMemory[tid], value);
+}
+
+__kernel void test_atomic_min(volatile __global int *sysMemory,
+                              volatile __global int *gpuMemory,
+                             __global int *oldValues, int value) {
+    int  tid = get_global_id(0);
+    oldValues[tid] = atomic_min(&sysMemory[tid], value);
+    atomic_min(&gpuMemory[tid], value);
+}
+
+
@@ -53,6 +53,7 @@
 #include "suites/functional/memory_access.h"
 #include "suites/functional/ipc.h"
 #include "suites/functional/memory_alignment.h"
+#include "suites/functional/memory_atomics.h"
 #include "suites/performance/dispatch_time.h"
 #include "suites/performance/memory_async_copy.h"
 #include "suites/performance/memory_async_copy_numa.h"
@@ -176,6 +177,76 @@ TEST(rocrtstFunc, Memory_Max_Mem) {
  RunCustomTestEpilog(&mt);
 }

+TEST(rocrtstFunc, Memory_Atomic_Add_Test) {
+  MemoryAtomic ma(ADD);
+  RunCustomTestProlog(&ma);
+  ma.MemoryAtomicTest();
+  RunCustomTestEpilog(&ma);
+}
+
+TEST(rocrtstFunc, Memory_Atomic_Sub_Test) {
+  MemoryAtomic ma(SUB);
+  RunCustomTestProlog(&ma);
+  ma.MemoryAtomicTest();
+  RunCustomTestEpilog(&ma);
+}
+
+TEST(rocrtstFunc, Memory_Atomic_And_Test) {
+  MemoryAtomic ma(AND);
+  RunCustomTestProlog(&ma);
+  ma.MemoryAtomicTest();
+  RunCustomTestEpilog(&ma);
+}
+
+TEST(rocrtstFunc, Memory_Atomic_Or_Test) {
+  MemoryAtomic ma(OR);
+  RunCustomTestProlog(&ma);
+  ma.MemoryAtomicTest();
+  RunCustomTestEpilog(&ma);
+}
+
+TEST(rocrtstFunc, Memory_Atomic_Xor_Test) {
+  MemoryAtomic ma(XOR);
+  RunCustomTestProlog(&ma);
+  ma.MemoryAtomicTest();
+  RunCustomTestEpilog(&ma);
+}
+
+TEST(rocrtstFunc, Memory_Atomic_Min_Test) {
+  MemoryAtomic ma(MIN);
+  RunCustomTestProlog(&ma);
+  ma.MemoryAtomicTest();
+  RunCustomTestEpilog(&ma);
+}
+
+TEST(rocrtstFunc, Memory_Atomic_Max_Test) {
+  MemoryAtomic ma(MAX);
+  RunCustomTestProlog(&ma);
+  ma.MemoryAtomicTest();
+  RunCustomTestEpilog(&ma);
+}
+
+TEST(rocrtstFunc, Memory_Atomic_Inc_Test) {
+  MemoryAtomic ma(INC);
+  RunCustomTestProlog(&ma);
+  ma.MemoryAtomicTest();
+  RunCustomTestEpilog(&ma);
+}
+
+TEST(rocrtstFunc, Memory_Atomic_Dec_Test) {
+  MemoryAtomic ma(DEC);
+  RunCustomTestProlog(&ma);
+  ma.MemoryAtomicTest();
+  RunCustomTestEpilog(&ma);
+}
+
+TEST(rocrtstFunc, Memory_Atomic_Xchg_Test) {
+  MemoryAtomic ma(XCHG);
+  RunCustomTestProlog(&ma);
+  ma.MemoryAtomicTest();
+  RunCustomTestEpilog(&ma);
+}
+
 TEST(rocrtstFunc, DebugBasicTests) {
  DebugBasicTest mt;
  RunCustomTestProlog(&mt);